diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000000000000000000000000000000000000..ee6f9832b24aaf1330d4fdcc726e05988eecc0f9 --- /dev/null +++ b/.gitignore @@ -0,0 +1,142 @@ +# macOS +.DS_Store +.AppleDouble +.LSOverride +.Spotlight-V100 +.Trashes + +# Python +__pycache__/ +*.py[cod] +*$py.class + +# C extensions +*.so + +# Virtual environments +.venv/ +venv/ +env/ +.envrc + +# Distribution / packaging +.Python +build/ +develop-eggs/ +dist/ +downloads/ +eggs/ +.eggs/ +lib/ +lib64/ +parts/ +sdist/ +var/ +wheels/ +share/python-wheels/ +*.egg-info/ +.installed.cfg +*.egg +MANIFEST + +# PyInstaller +*.manifest +*.spec + +# Installer logs +pip-log.txt +pip-delete-this-directory.txt + +# Unit test / coverage reports +htmlcov/ +.tox/ +.nox/ +.coverage +.coverage.* +.cache/ +nosetests.xml +coverage.xml +*.cover +*.py,cover +.hypothesis/ +.pytest_cache/ +pytestdebug.log +coverage/ + +# Type checkers / linters +.mypy_cache/ +.dmypy.json +dmypy.json +.pyre/ +.pytype/ +.ruff_cache/ + +# Jupyter Notebook +.ipynb_checkpoints/ +profile_default/ + +# IPython +ipython_config.py + +# VSCode +.vscode/ + +# IDEs +.idea/ +*.iml +*.sublime-project +*.sublime-workspace + +# Logs and temp files +logs/ +*.log +log/ +tmp/ +temp/ + +# TensorBoard +events.out.tfevents.* + +# ML experiment tracking +wandb/ +mlruns/ +lightning_logs/ +checkpoints/ +runs/ + +# Data & outputs (uncomment if you keep these out of git) +# data/ +# datasets/ +# output/ +# outputs/ +# models/ +# results/ + +# System files (Windows) +Thumbs.db +ehthumbs.db +Desktop.ini + +# Secrets and environment files +.env +.env.* +*.env +*.secret +*.key +*.pem + +# Node (if present) +node_modules/ +npm-debug.log* +yarn-debug.log* +yarn-error.log* +pnpm-debug.log* + +# Pyenv / Poetry +.python-version +poetry.lock + +# Editor swap/backup +*~ +*.swp +*.swo \ No newline at end of file diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000000000000000000000000000000000000..261eeb9e9f8b2b4b0d119366dda99c6fd7d35c64 --- /dev/null +++ b/LICENSE @@ -0,0 +1,201 @@ + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. + + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. + + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/README.md b/README.md index f9e86bda25c712ddc6a159bef3359e81b7acd276..9ae55a56abba20313ae516fb7b8aa43c07d41a85 100644 --- a/README.md +++ b/README.md @@ -1,14 +1,305 @@ --- -title: ML Starter -emoji: ๐ŸŒ– -colorFrom: purple -colorTo: gray +title: ML Starter MCP Server +emoji: ๐Ÿง  +colorFrom: blue +colorTo: green sdk: gradio -sdk_version: 6.0.1 +sdk_version: "6.0.0" app_file: app.py -pinned: false license: apache-2.0 -short_description: MCP server that exposes a problem-specific ML codes +pinned: true +short_description: Pure-retrieval MCP server that indexes the ML Starter knowledge base with deterministic semantics search. +tags: + - building-mcp-track-enterprise + - gradio + - mcp + - retrieval + - embeddings + - python + - knowledge-base + - semantic-search + - sentence-transformers + - huggingface --- -Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference +# ML Starter MCP Server +

*ML Starter Banner*

+ +Gradio-powered **remote-only** MCP server that exposes a curated ML knowledge base through deterministic, read-only tooling. Ideal for editors like Claude Desktop, VS Code (Kilo Code), or Cursor that want a trustworthy retrieval endpoint with **no side-effects**. + +![Python](https://img.shields.io/badge/python-3.10%2B-blue) ![License](https://img.shields.io/badge/license-Apache%202.0-green) ![Status](https://img.shields.io/badge/Status-Active-success) ![MCP](https://img.shields.io/badge/MCP-enabled-brightgreen) ![Retrieval](https://img.shields.io/badge/Retrieval-pure-lightgrey) ![SentenceTransformers](https://img.shields.io/badge/Embeddings-all--MiniLM--L6--v2-6f42c1) + +--- + +## ๐Ÿงฉ Overview + +The **ML Starter MCP Server** indexes the entire `knowledge_base/` tree (audio, vision, NLP, RL, etc.) and makes it searchable through: + +* `list_items` โ€“ enumerate every tutorial/script with metadata. +* `semantic_search` โ€“ vector search over docstrings and lead context to find the single best code example for a natural-language brief. +* `get_code` โ€“ return the full Python source for a safe, validated path. + +The server is deterministic (seeded numpy/torch), write-protected, and designed to run as a **Gradio MCP SSE endpoint** suitable for Hugging Face Spaces or on-prem deployments. + +--- + +## ๐Ÿ“š ML Starter Knowledge Base + +* Root: `knowledge_base/` +* Domains: + * `audio/` + * `generative/` + * `graph/` + * `nlp/` + * `rl/` + * `structured_data/` + * `timeseries/` + * `vision/` +* Each file stores a complete, runnable ML example with docstring summaries leveraged during indexing. + +### Features exposed via MCP + +* โœ… Vector search via `sentence-transformers/all-MiniLM-L6-v2` with cosine similarity. +* โš™๏ธ Safe path resolution ensures only in-repo `.py` files can be fetched. +* ๐Ÿงฎ Metadata-first outputs (category, filename, semantic score) for quick triage. +* ๐Ÿ›ก๏ธ Read-only contract; zero KB mutations, uploads, or side effects. +* ๐ŸŒ Spaces-ready networking with auto `0.0.0.0` binding when environment variables are provided by the platform. + +--- + +## ๐Ÿš€ Quick Start + +### Installation + +```bash +pip install -r requirements.txt +``` + +### Running the MCP Server + +```bash +python -m mcp_server.server --host 127.0.0.1 --port 7860 +``` + +* **SSE Endpoint:** `http://127.0.0.1:7860/gradio_api/mcp/sse` +* Launch with `mcp_server=True` (handled by `mcp_server/server.py`). + +### VS Code Kilo Code Settings + +```json +{ + "mcpServers": { + "ml-starter-kb": { + "url": "http://127.0.0.1:7860/gradio_api/mcp/sse", + "disabled": false, + "timeout": 60, + "alwaysAllow": [], + "disabledTools": [] + } + } +} +``` + +### Environment Variables + +```bash +export TOKENIZERS_PARALLELISM=false +export PYTORCH_ENABLE_MPS_FALLBACK=1 # optional, improves macOS stability +``` + +--- + +## ๐Ÿง  MCP Usage + +Any MCP-capable client can connect to the SSE endpoint to: + +* Browse the full inventory of ML tutorials. +* Submit a markdown problem statement and receive the best-matching file path plus relevance score. +* Fetch the code immediately and render it inline (clients typically syntax-highlight the response). + +The Gradio UI mirrors these capabilities via three tabs (List Items, Semantic Search, Get Code) for manual exploration. 
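Outside of an MCP-capable editor, the same underlying Gradio endpoints can also be exercised from Python for quick checks. The sketch below uses `gradio_client`; the `api_name` values are assumptions derived from the tool function names and may need adjusting to match the actual Space.

```python
# Hypothetical client sketch using gradio_client; the api_name values are
# assumptions based on the tool function names and may need adjusting.
from gradio_client import Client

client = Client("http://127.0.0.1:7860/")

# Find the best-matching KB file for a natural-language brief.
match = client.predict(
    "I need a CTC-based speech recognition example.",
    api_name="/semantic_search",
)
print(match)

# Fetch the full source for the returned path.
code = client.predict(
    "knowledge_base/audio/ctc_asr.py",
    api_name="/get_code",
)
print(code)
```

MCP clients should continue to use the SSE endpoint shown above; this snippet is only a convenience for quick local checks.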
+ +--- + +## ๐Ÿ”ค Supported Embeddings + +* `sentence-transformers/all-MiniLM-L6-v2` + +### Configuration Example + +```yaml +embedding_model: sentence-transformers/all-MiniLM-L6-v2 +batch_size: 32 +similarity: cosine +``` + +--- + +## ๐Ÿ” Retrieval Strategy + +| Component | Description | +|----------------------|--------------------------------------------------------------| +| Index Type | In-memory cosine index backed by numpy vectors | +| Chunking | File-level (docstring + prefix) | +| Similarity Function | Dot product on L2-normalized vectors | +| Results Returned | Top-1 match (deterministic) | + +### Configuration Example + +```yaml +retriever: cosine +max_results: 1 +``` + +--- + +## ๐Ÿงฉ Folder Structure + +``` +ml-starter/ +โ”œโ”€โ”€ app.py # Optional Gradio hook +โ”œโ”€โ”€ mcp_server/ +โ”‚ โ”œโ”€โ”€ server.py # Remote MCP entrypoint & UI builder +โ”‚ โ”œโ”€โ”€ loader.py # KB scanning + safe path resolution +โ”‚ โ”œโ”€โ”€ embeddings.py # MiniLM wrapper + cosine index +โ”‚ โ””โ”€โ”€ tools/ +โ”‚ โ”œโ”€โ”€ list_items.py # list_items() +โ”‚ โ”œโ”€โ”€ semantic_search.py # semantic_search() +โ”‚ โ””โ”€โ”€ get_code.py # get_code() +โ”œโ”€โ”€ knowledge_base/ # ML examples grouped by domain +โ”œโ”€โ”€ requirements.txt +โ””โ”€โ”€ README.md +``` + +--- + +## ๐Ÿ”ง MCP Tools (`mcp_server/server.py`) + +| MCP Tool | Python Function | Description | +|----------------|------------------------------------|-----------------------------------------------------------------------------------------| +| `list_items` | `list_items()` | Enumerates every KB entry with category, filename, absolute path, and summary metadata. | +| `semantic_search` | `semantic_search(problem_markdown: str)` | Embeds the prompt and returns the single best match plus cosine score. | +| `get_code` | `get_code(path: str)` | Streams back the full Python source for a validated KB path. | + +`server.py` registers these functions with Gradio's MCP adapter, wires docstrings into tool descriptions, and ensures the SSE endpoint stays read-only. + +--- + +## ๐ŸŽฌ Demo + +* In progress + +--- + +## ๐Ÿ“ฅ Inputs + +### 1. `list_items` + +No input parameters; returns the entire catalog. + +### 2. `semantic_search` + +
**Input Model**

| Field | Type | Description | Example |
|------------------|--------|---------------------------------------------------------|-----------------------------------------------------------------|
| problem_markdown | str | Natural-language description of the ML task or need. | "I need a transformer example for multilingual NER." |
+ +### 3. `get_code` + +
**Input Model**

| Field | Type | Description | Example |
|-------|------|-----------------------------------------------|------------------------------------------------------|
| path | str | KB-relative or absolute path to a `.py` file. | "knowledge_base/nlp/text_classification_from_scratch.py" |
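For illustration, the safe path resolution mentioned in the feature list could look roughly like the sketch below. This is a hypothetical sketch, not the actual `mcp_server/loader.py` implementation; the helper name `resolve_kb_path` is made up for this example.

```python
# Hypothetical sketch of the safe-path contract; the real check lives in
# mcp_server/loader.py and may differ in detail.
from pathlib import Path

REPO_ROOT = Path(".").resolve()                      # assumed repository root
KB_ROOT = (REPO_ROOT / "knowledge_base").resolve()   # knowledge-base root


def resolve_kb_path(path: str) -> Path:
    """Accept a KB-relative or absolute path and reject anything outside the KB."""
    candidate = Path(path)
    if not candidate.is_absolute():
        candidate = REPO_ROOT / candidate
    candidate = candidate.resolve()
    if not candidate.is_relative_to(KB_ROOT):
        raise ValueError(f"Path escapes the knowledge base: {path}")
    if candidate.suffix != ".py":
        raise ValueError(f"Only .py files can be served: {path}")
    return candidate


# Example:
# resolve_kb_path("knowledge_base/nlp/text_classification_from_scratch.py")
```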
+ +--- + +## ๐Ÿ“ค Outputs + +### 1. `list_items` + +
**Response Example**

```json
[
  {
    "id": "nlp/text_classification_with_transformer.py",
    "category": "nlp",
    "filename": "text_classification_with_transformer.py",
    "path": "knowledge_base/nlp/text_classification_with_transformer.py",
    "summary": "Fine-tune a Transformer for sentiment classification."
  }
]
```
+ +### 2. `semantic_search` + +
**Response Example**

```json
{
  "best_match": "knowledge_base/nlp/text_classification_with_transformer.py",
  "score": 0.89
}
```
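The `score` field is cosine similarity on L2-normalized MiniLM embeddings, as described in the Retrieval Strategy table. A minimal sketch of that computation (independent of the actual `mcp_server/embeddings.py` code) is shown below; the example summaries and query are illustrative only.

```python
# Minimal cosine-similarity sketch; the real index lives in mcp_server/embeddings.py.
import numpy as np
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# Docstring summaries stand in for the indexed KB entries.
summaries = [
    "Fine-tune a Transformer for sentiment classification.",
    "Training a CTC-based model for automatic speech recognition.",
]
doc_vecs = model.encode(summaries, normalize_embeddings=True)  # L2-normalized

query = "I need a transformer example for text classification."
query_vec = model.encode([query], normalize_embeddings=True)[0]

scores = doc_vecs @ query_vec          # dot product == cosine on unit vectors
best = int(np.argmax(scores))          # deterministic top-1 match
print(summaries[best], float(scores[best]))
```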
+ +### 3. `get_code` + +
**Response Example**

```json
{
  "path": "knowledge_base/vision/grad_cam.py",
  "source": ""
}
```
+ +Each response is deterministic for the same corpus and embeddings, allowing MCP clients to trust caching and diffing workflows. + +--- + +## ๐Ÿ‘ฅ Team + +**Team Name:** Hepheon + +**Team Members:** +- **Tutkum Akyildiz** - [@Tutkum](https://huggingface.co/Tutkum) - Product +- **Emre Atilgan** - [@emreatilgan](https://huggingface.co/emreatilgan) - Tech + +--- + +## ๐Ÿ› ๏ธ Next Steps + +Today the knowledge base focuses on curated **Keras** walkthroughs. Upcoming updates will expand coverage to include: + +* TensorFlow +* PyTorch +* scikit-learn +* ... + +These additions will land in the same deterministic retrieval flow, making mixed-framework discovery as seamless as the current experience. + +--- + +## ๐Ÿ“˜ License + +This project is licensed under the Apache License 2.0. See the [LICENSE](LICENSE) file for full terms. + +--- + +

*Built with ❤️ for the ML Starter knowledge base • Apache 2.0*

diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..da0ef98745cd465195afd418c249ab640dc44bff --- /dev/null +++ b/app.py @@ -0,0 +1,21 @@ +from __future__ import annotations + +import os + +import gradio as gr + +from mcp_server.server import create_gradio_blocks + +# Expose a demo/app object for Hugging Face Spaces auto-discovery +demo: gr.Blocks = create_gradio_blocks() +app: gr.Blocks = demo + +if __name__ == "__main__": + # Respect common env vars used by Spaces/containers + host = os.getenv("GRADIO_SERVER_NAME") or os.getenv("HOST") or "0.0.0.0" + port_str = os.getenv("GRADIO_SERVER_PORT") or os.getenv("PORT") or "7860" + try: + port = int(port_str) + except Exception: + port = 7860 + demo.launch(server_name=host, server_port=port, mcp_server=True) \ No newline at end of file diff --git a/knowledge_base/audio/ctc_asr.py b/knowledge_base/audio/ctc_asr.py new file mode 100644 index 0000000000000000000000000000000000000000..349b4f13bbdd01fce17b01f0a6f1c088dda5a24a --- /dev/null +++ b/knowledge_base/audio/ctc_asr.py @@ -0,0 +1,464 @@ +""" +Title: Automatic Speech Recognition using CTC +Authors: [Mohamed Reda Bouadjenek](https://rbouadjenek.github.io/) and [Ngoc Dung Huynh](https://www.linkedin.com/in/parkerhuynh/) +Date created: 2021/09/26 +Last modified: 2021/09/26 +Description: Training a CTC-based model for automatic speech recognition. +Accelerator: GPU +""" + +""" +## Introduction + +Speech recognition is an interdisciplinary subfield of computer science +and computational linguistics that develops methodologies and technologies +that enable the recognition and translation of spoken language into text +by computers. It is also known as automatic speech recognition (ASR), +computer speech recognition or speech to text (STT). It incorporates +knowledge and research in the computer science, linguistics and computer +engineering fields. + +This demonstration shows how to combine a 2D CNN, RNN and a Connectionist +Temporal Classification (CTC) loss to build an ASR. CTC is an algorithm +used to train deep neural networks in speech recognition, handwriting +recognition and other sequence problems. CTC is used when we donโ€™t know +how the input aligns with the output (how the characters in the transcript +align to the audio). The model we create is similar to +[DeepSpeech2](https://nvidia.github.io/OpenSeq2Seq/html/speech-recognition/deepspeech2.html). + +We will use the LJSpeech dataset from the +[LibriVox](https://librivox.org/) project. It consists of short +audio clips of a single speaker reading passages from 7 non-fiction books. + +We will evaluate the quality of the model using +[Word Error Rate (WER)](https://en.wikipedia.org/wiki/Word_error_rate). +WER is obtained by adding up +the substitutions, insertions, and deletions that occur in a sequence of +recognized words. Divide that number by the total number of words originally +spoken. The result is the WER. To get the WER score you need to install the +[jiwer](https://pypi.org/project/jiwer/) package. 
You can use the following command line: + +``` +pip install jiwer +``` + +**References:** + +- [LJSpeech Dataset](https://keithito.com/LJ-Speech-Dataset/) +- [Speech recognition](https://en.wikipedia.org/wiki/Speech_recognition) +- [Sequence Modeling With CTC](https://distill.pub/2017/ctc/) +- [DeepSpeech2](https://nvidia.github.io/OpenSeq2Seq/html/speech-recognition/deepspeech2.html) + +""" + +""" +## Setup +""" + +import pandas as pd +import numpy as np +import tensorflow as tf +from tensorflow import keras +from tensorflow.keras import layers +import matplotlib.pyplot as plt +from IPython import display +from jiwer import wer + + +""" +## Load the LJSpeech Dataset + +Let's download the [LJSpeech Dataset](https://keithito.com/LJ-Speech-Dataset/). +The dataset contains 13,100 audio files as `wav` files in the `/wavs/` folder. +The label (transcript) for each audio file is a string +given in the `metadata.csv` file. The fields are: + +- **ID**: this is the name of the corresponding .wav file +- **Transcription**: words spoken by the reader (UTF-8) +- **Normalized transcription**: transcription with numbers, +ordinals, and monetary units expanded into full words (UTF-8). + +For this demo we will use on the "Normalized transcription" field. + +Each audio file is a single-channel 16-bit PCM WAV with a sample rate of 22,050 Hz. +""" + +data_url = "https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2" +data_path = keras.utils.get_file("LJSpeech-1.1", data_url, untar=True) +wavs_path = data_path + "/wavs/" +metadata_path = data_path + "/metadata.csv" + + +# Read metadata file and parse it +metadata_df = pd.read_csv(metadata_path, sep="|", header=None, quoting=3) +metadata_df.columns = ["file_name", "transcription", "normalized_transcription"] +metadata_df = metadata_df[["file_name", "normalized_transcription"]] +metadata_df = metadata_df.sample(frac=1).reset_index(drop=True) +metadata_df.head(3) + + +""" +We now split the data into training and validation set. +""" + +split = int(len(metadata_df) * 0.90) +df_train = metadata_df[:split] +df_val = metadata_df[split:] + +print(f"Size of the training set: {len(df_train)}") +print(f"Size of the training set: {len(df_val)}") + + +""" +## Preprocessing + +We first prepare the vocabulary to be used. +""" + +# The set of characters accepted in the transcription. +characters = [x for x in "abcdefghijklmnopqrstuvwxyz'?! "] +# Mapping characters to integers +char_to_num = keras.layers.StringLookup(vocabulary=characters, oov_token="") +# Mapping integers back to original characters +num_to_char = keras.layers.StringLookup( + vocabulary=char_to_num.get_vocabulary(), oov_token="", invert=True +) + +print( + f"The vocabulary is: {char_to_num.get_vocabulary()} " + f"(size ={char_to_num.vocabulary_size()})" +) + +""" +Next, we create the function that describes the transformation that we apply to each +element of our dataset. +""" + +# An integer scalar Tensor. The window length in samples. +frame_length = 256 +# An integer scalar Tensor. The number of samples to step. +frame_step = 160 +# An integer scalar Tensor. The size of the FFT to apply. +# If not provided, uses the smallest power of 2 enclosing frame_length. +fft_length = 384 + + +def encode_single_sample(wav_file, label): + ########################################### + ## Process the Audio + ########################################## + # 1. Read wav file + file = tf.io.read_file(wavs_path + wav_file + ".wav") + # 2. 
Decode the wav file + audio, _ = tf.audio.decode_wav(file) + audio = tf.squeeze(audio, axis=-1) + # 3. Change type to float + audio = tf.cast(audio, tf.float32) + # 4. Get the spectrogram + spectrogram = tf.signal.stft( + audio, frame_length=frame_length, frame_step=frame_step, fft_length=fft_length + ) + # 5. We only need the magnitude, which can be derived by applying tf.abs + spectrogram = tf.abs(spectrogram) + spectrogram = tf.math.pow(spectrogram, 0.5) + # 6. normalisation + means = tf.math.reduce_mean(spectrogram, 1, keepdims=True) + stddevs = tf.math.reduce_std(spectrogram, 1, keepdims=True) + spectrogram = (spectrogram - means) / (stddevs + 1e-10) + ########################################### + ## Process the label + ########################################## + # 7. Convert label to Lower case + label = tf.strings.lower(label) + # 8. Split the label + label = tf.strings.unicode_split(label, input_encoding="UTF-8") + # 9. Map the characters in label to numbers + label = char_to_num(label) + # 10. Return a dict as our model is expecting two inputs + return spectrogram, label + + +""" +## Creating `Dataset` objects + +We create a `tf.data.Dataset` object that yields +the transformed elements, in the same order as they +appeared in the input. +""" + +batch_size = 32 +# Define the training dataset +train_dataset = tf.data.Dataset.from_tensor_slices( + (list(df_train["file_name"]), list(df_train["normalized_transcription"])) +) +train_dataset = ( + train_dataset.map(encode_single_sample, num_parallel_calls=tf.data.AUTOTUNE) + .padded_batch(batch_size) + .prefetch(buffer_size=tf.data.AUTOTUNE) +) + +# Define the validation dataset +validation_dataset = tf.data.Dataset.from_tensor_slices( + (list(df_val["file_name"]), list(df_val["normalized_transcription"])) +) +validation_dataset = ( + validation_dataset.map(encode_single_sample, num_parallel_calls=tf.data.AUTOTUNE) + .padded_batch(batch_size) + .prefetch(buffer_size=tf.data.AUTOTUNE) +) + + +""" +## Visualize the data + +Let's visualize an example in our dataset, including the +audio clip, the spectrogram and the corresponding label. +""" + +fig = plt.figure(figsize=(8, 5)) +for batch in train_dataset.take(1): + spectrogram = batch[0][0].numpy() + spectrogram = np.array([np.trim_zeros(x) for x in np.transpose(spectrogram)]) + label = batch[1][0] + # Spectrogram + label = tf.strings.reduce_join(num_to_char(label)).numpy().decode("utf-8") + ax = plt.subplot(2, 1, 1) + ax.imshow(spectrogram, vmax=1) + ax.set_title(label) + ax.axis("off") + # Wav + file = tf.io.read_file(wavs_path + list(df_train["file_name"])[0] + ".wav") + audio, _ = tf.audio.decode_wav(file) + audio = audio.numpy() + ax = plt.subplot(2, 1, 2) + plt.plot(audio) + ax.set_title("Signal Wave") + ax.set_xlim(0, len(audio)) + display.display(display.Audio(np.transpose(audio), rate=16000)) +plt.show() + +""" +## Model + +We first define the CTC Loss function. +""" + + +def CTCLoss(y_true, y_pred): + # Compute the training-time loss value + batch_len = tf.cast(tf.shape(y_true)[0], dtype="int64") + input_length = tf.cast(tf.shape(y_pred)[1], dtype="int64") + label_length = tf.cast(tf.shape(y_true)[1], dtype="int64") + + input_length = input_length * tf.ones(shape=(batch_len, 1), dtype="int64") + label_length = label_length * tf.ones(shape=(batch_len, 1), dtype="int64") + + loss = keras.backend.ctc_batch_cost(y_true, y_pred, input_length, label_length) + return loss + + +""" +We now define our model. 
We will define a model similar to +[DeepSpeech2](https://nvidia.github.io/OpenSeq2Seq/html/speech-recognition/deepspeech2.html). +""" + + +def build_model(input_dim, output_dim, rnn_layers=5, rnn_units=128): + """Model similar to DeepSpeech2.""" + # Model's input + input_spectrogram = layers.Input((None, input_dim), name="input") + # Expand the dimension to use 2D CNN. + x = layers.Reshape((-1, input_dim, 1), name="expand_dim")(input_spectrogram) + # Convolution layer 1 + x = layers.Conv2D( + filters=32, + kernel_size=[11, 41], + strides=[2, 2], + padding="same", + use_bias=False, + name="conv_1", + )(x) + x = layers.BatchNormalization(name="conv_1_bn")(x) + x = layers.ReLU(name="conv_1_relu")(x) + # Convolution layer 2 + x = layers.Conv2D( + filters=32, + kernel_size=[11, 21], + strides=[1, 2], + padding="same", + use_bias=False, + name="conv_2", + )(x) + x = layers.BatchNormalization(name="conv_2_bn")(x) + x = layers.ReLU(name="conv_2_relu")(x) + # Reshape the resulted volume to feed the RNNs layers + x = layers.Reshape((-1, x.shape[-2] * x.shape[-1]))(x) + # RNN layers + for i in range(1, rnn_layers + 1): + recurrent = layers.GRU( + units=rnn_units, + activation="tanh", + recurrent_activation="sigmoid", + use_bias=True, + return_sequences=True, + reset_after=True, + name=f"gru_{i}", + ) + x = layers.Bidirectional( + recurrent, name=f"bidirectional_{i}", merge_mode="concat" + )(x) + if i < rnn_layers: + x = layers.Dropout(rate=0.5)(x) + # Dense layer + x = layers.Dense(units=rnn_units * 2, name="dense_1")(x) + x = layers.ReLU(name="dense_1_relu")(x) + x = layers.Dropout(rate=0.5)(x) + # Classification layer + output = layers.Dense(units=output_dim + 1, activation="softmax")(x) + # Model + model = keras.Model(input_spectrogram, output, name="DeepSpeech_2") + # Optimizer + opt = keras.optimizers.Adam(learning_rate=1e-4) + # Compile the model and return + model.compile(optimizer=opt, loss=CTCLoss) + return model + + +# Get the model +model = build_model( + input_dim=fft_length // 2 + 1, + output_dim=char_to_num.vocabulary_size(), + rnn_units=512, +) +model.summary(line_length=110) + +""" +## Training and Evaluating +""" + + +# A utility function to decode the output of the network +def decode_batch_predictions(pred): + input_len = np.ones(pred.shape[0]) * pred.shape[1] + # Use greedy search. 
For complex tasks, you can use beam search + results = keras.backend.ctc_decode(pred, input_length=input_len, greedy=True)[0][0] + # Iterate over the results and get back the text + output_text = [] + for result in results: + result = tf.strings.reduce_join(num_to_char(result)).numpy().decode("utf-8") + output_text.append(result) + return output_text + + +# A callback class to output a few transcriptions during training +class CallbackEval(keras.callbacks.Callback): + """Displays a batch of outputs after every epoch.""" + + def __init__(self, dataset): + super().__init__() + self.dataset = dataset + + def on_epoch_end(self, epoch: int, logs=None): + predictions = [] + targets = [] + for batch in self.dataset: + X, y = batch + batch_predictions = model.predict(X) + batch_predictions = decode_batch_predictions(batch_predictions) + predictions.extend(batch_predictions) + for label in y: + label = ( + tf.strings.reduce_join(num_to_char(label)).numpy().decode("utf-8") + ) + targets.append(label) + wer_score = wer(targets, predictions) + print("-" * 100) + print(f"Word Error Rate: {wer_score:.4f}") + print("-" * 100) + for i in np.random.randint(0, len(predictions), 2): + print(f"Target : {targets[i]}") + print(f"Prediction: {predictions[i]}") + print("-" * 100) + + +""" +Let's start the training process. +""" + +# Define the number of epochs. +epochs = 1 +# Callback function to check transcription on the val set. +validation_callback = CallbackEval(validation_dataset) +# Train the model +history = model.fit( + train_dataset, + validation_data=validation_dataset, + epochs=epochs, + callbacks=[validation_callback], +) + + +""" +## Inference +""" + +# Let's check results on more validation samples +predictions = [] +targets = [] +for batch in validation_dataset: + X, y = batch + batch_predictions = model.predict(X) + batch_predictions = decode_batch_predictions(batch_predictions) + predictions.extend(batch_predictions) + for label in y: + label = tf.strings.reduce_join(num_to_char(label)).numpy().decode("utf-8") + targets.append(label) +wer_score = wer(targets, predictions) +print("-" * 100) +print(f"Word Error Rate: {wer_score:.4f}") +print("-" * 100) +for i in np.random.randint(0, len(predictions), 5): + print(f"Target : {targets[i]}") + print(f"Prediction: {predictions[i]}") + print("-" * 100) + + +""" +## Conclusion + +In practice, you should train for around 50 epochs or more. Each epoch +takes approximately 5-6mn using a `GeForce RTX 2080 Ti` GPU. +The model we trained at 50 epochs has a `Word Error Rate (WER) โ‰ˆ 16% to 17%`. + +Some of the transcriptions around epoch 50: + +**Audio file: LJ017-0009.wav** +``` +- Target : sir thomas overbury was undoubtedly poisoned by lord rochester in the reign +of james the first +- Prediction: cer thomas overbery was undoubtedly poisoned by lordrochester in the reign +of james the first +``` + +**Audio file: LJ003-0340.wav** +``` +- Target : the committee does not seem to have yet understood that newgate could be +only and properly replaced +- Prediction: the committee does not seem to have yet understood that newgate could be +only and proberly replace +``` + +**Audio file: LJ011-0136.wav** +``` +- Target : still no sentence of death was carried out for the offense and in eighteen +thirtytwo +- Prediction: still no sentence of death was carried out for the offense and in eighteen +thirtytwo +``` + +Example available on HuggingFace. 
+| Trained Model | Demo | +| :--: | :--: | +| [![Generic badge](https://img.shields.io/badge/๐Ÿค—%20Model-CTC%20ASR-black.svg)](https://huggingface.co/keras-io/ctc_asr) | [![Generic badge](https://img.shields.io/badge/๐Ÿค—%20Spaces-CTC%20ASR-black.svg)](https://huggingface.co/spaces/keras-io/ctc_asr) | + +""" diff --git a/knowledge_base/audio/melgan_spectrogram_inversion.py b/knowledge_base/audio/melgan_spectrogram_inversion.py new file mode 100644 index 0000000000000000000000000000000000000000..a4f7bf232553489b42cd873e69277b38ac3661ae --- /dev/null +++ b/knowledge_base/audio/melgan_spectrogram_inversion.py @@ -0,0 +1,607 @@ +""" +Title: MelGAN-based spectrogram inversion using feature matching +Author: [Darshan Deshpande](https://twitter.com/getdarshan) +Date created: 02/09/2021 +Last modified: 15/09/2021 +Description: Inversion of audio from mel-spectrograms using the MelGAN architecture and feature matching. +Accelerator: GPU +""" + +""" +## Introduction + +Autoregressive vocoders have been ubiquitous for a majority of the history of speech processing, +but for most of their existence they have lacked parallelism. +[MelGAN](https://arxiv.org/abs/1910.06711) is a +non-autoregressive, fully convolutional vocoder architecture used for purposes ranging +from spectral inversion and speech enhancement to present-day state-of-the-art +speech synthesis when used as a decoder +with models like Tacotron2 or FastSpeech that convert text to mel spectrograms. + +In this tutorial, we will have a look at the MelGAN architecture and how it can achieve +fast spectral inversion, i.e. conversion of spectrograms to audio waves. The MelGAN +implemented in this tutorial is similar to the original implementation with only the +difference of method of padding for convolutions where we will use 'same' instead of +reflect padding. +""" + +""" +## Importing and Defining Hyperparameters +""" + +"""shell +pip install -qqq tensorflow_addons +pip install -qqq tensorflow-io +""" + +import tensorflow as tf +import tensorflow_io as tfio +from tensorflow import keras +from tensorflow.keras import layers +from tensorflow_addons import layers as addon_layers + +# Setting logger level to avoid input shape warnings +tf.get_logger().setLevel("ERROR") + +# Defining hyperparameters + +DESIRED_SAMPLES = 8192 +LEARNING_RATE_GEN = 1e-5 +LEARNING_RATE_DISC = 1e-6 +BATCH_SIZE = 16 + +mse = keras.losses.MeanSquaredError() +mae = keras.losses.MeanAbsoluteError() + +""" +## Loading the Dataset + +This example uses the [LJSpeech dataset](https://keithito.com/LJ-Speech-Dataset/). + +The LJSpeech dataset is primarily used for text-to-speech and consists of 13,100 discrete +speech samples taken from 7 non-fiction books, having a total length of approximately 24 +hours. The MelGAN training is only concerned with the audio waves so we process only the +WAV files and ignore the audio annotations. +""" + +"""shell +wget https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2 +tar -xf /content/LJSpeech-1.1.tar.bz2 +""" + +""" +We create a `tf.data.Dataset` to load and process the audio files on the fly. +The `preprocess()` function takes the file path as input and returns two instances of the +wave, one for input and one as the ground truth for comparison. The input wave will be +mapped to a spectrogram using the custom `MelSpec` layer as shown later in this example. 
+""" + +# Splitting the dataset into training and testing splits +wavs = tf.io.gfile.glob("LJSpeech-1.1/wavs/*.wav") +print(f"Number of audio files: {len(wavs)}") + + +# Mapper function for loading the audio. This function returns two instances of the wave +def preprocess(filename): + audio = tf.audio.decode_wav(tf.io.read_file(filename), 1, DESIRED_SAMPLES).audio + return audio, audio + + +# Create tf.data.Dataset objects and apply preprocessing +train_dataset = tf.data.Dataset.from_tensor_slices((wavs,)) +train_dataset = train_dataset.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE) + +""" +## Defining custom layers for MelGAN + +The MelGAN architecture consists of 3 main modules: + +1. The residual block +2. Dilated convolutional block +3. Discriminator block + +![MelGAN](https://i.imgur.com/ZdxwzPG.png) +""" + +""" +Since the network takes a mel-spectrogram as input, we will create an additional custom +layer +which can convert the raw audio wave to a spectrogram on-the-fly. We use the raw audio +tensor from `train_dataset` and map it to a mel-spectrogram using the `MelSpec` layer +below. +""" + +# Custom keras layer for on-the-fly audio to spectrogram conversion + + +class MelSpec(layers.Layer): + def __init__( + self, + frame_length=1024, + frame_step=256, + fft_length=None, + sampling_rate=22050, + num_mel_channels=80, + freq_min=125, + freq_max=7600, + **kwargs, + ): + super().__init__(**kwargs) + self.frame_length = frame_length + self.frame_step = frame_step + self.fft_length = fft_length + self.sampling_rate = sampling_rate + self.num_mel_channels = num_mel_channels + self.freq_min = freq_min + self.freq_max = freq_max + # Defining mel filter. This filter will be multiplied with the STFT output + self.mel_filterbank = tf.signal.linear_to_mel_weight_matrix( + num_mel_bins=self.num_mel_channels, + num_spectrogram_bins=self.frame_length // 2 + 1, + sample_rate=self.sampling_rate, + lower_edge_hertz=self.freq_min, + upper_edge_hertz=self.freq_max, + ) + + def call(self, audio, training=True): + # We will only perform the transformation during training. + if training: + # Taking the Short Time Fourier Transform. Ensure that the audio is padded. + # In the paper, the STFT output is padded using the 'REFLECT' strategy. + stft = tf.signal.stft( + tf.squeeze(audio, -1), + self.frame_length, + self.frame_step, + self.fft_length, + pad_end=True, + ) + + # Taking the magnitude of the STFT output + magnitude = tf.abs(stft) + + # Multiplying the Mel-filterbank with the magnitude and scaling it using the db scale + mel = tf.matmul(tf.square(magnitude), self.mel_filterbank) + log_mel_spec = tfio.audio.dbscale(mel, top_db=80) + return log_mel_spec + else: + return audio + + def get_config(self): + config = super().get_config() + config.update( + { + "frame_length": self.frame_length, + "frame_step": self.frame_step, + "fft_length": self.fft_length, + "sampling_rate": self.sampling_rate, + "num_mel_channels": self.num_mel_channels, + "freq_min": self.freq_min, + "freq_max": self.freq_max, + } + ) + return config + + +""" +The residual convolutional block extensively uses dilations and has a total receptive +field of 27 timesteps per block. The dilations must grow as a power of the `kernel_size` +to ensure reduction of hissing noise in the output. The network proposed by the paper is +as follows: + +![ConvBlock](https://i.imgur.com/sFnnsCll.jpg) +""" + +# Creating the residual stack block + + +def residual_stack(input, filters): + """Convolutional residual stack with weight normalization. 
+ + Args: + filters: int, determines filter size for the residual stack. + + Returns: + Residual stack output. + """ + c1 = addon_layers.WeightNormalization( + layers.Conv1D(filters, 3, dilation_rate=1, padding="same"), data_init=False + )(input) + lrelu1 = layers.LeakyReLU()(c1) + c2 = addon_layers.WeightNormalization( + layers.Conv1D(filters, 3, dilation_rate=1, padding="same"), data_init=False + )(lrelu1) + add1 = layers.Add()([c2, input]) + + lrelu2 = layers.LeakyReLU()(add1) + c3 = addon_layers.WeightNormalization( + layers.Conv1D(filters, 3, dilation_rate=3, padding="same"), data_init=False + )(lrelu2) + lrelu3 = layers.LeakyReLU()(c3) + c4 = addon_layers.WeightNormalization( + layers.Conv1D(filters, 3, dilation_rate=1, padding="same"), data_init=False + )(lrelu3) + add2 = layers.Add()([add1, c4]) + + lrelu4 = layers.LeakyReLU()(add2) + c5 = addon_layers.WeightNormalization( + layers.Conv1D(filters, 3, dilation_rate=9, padding="same"), data_init=False + )(lrelu4) + lrelu5 = layers.LeakyReLU()(c5) + c6 = addon_layers.WeightNormalization( + layers.Conv1D(filters, 3, dilation_rate=1, padding="same"), data_init=False + )(lrelu5) + add3 = layers.Add()([c6, add2]) + + return add3 + + +""" +Each convolutional block uses the dilations offered by the residual stack +and upsamples the input data by the `upsampling_factor`. +""" + +# Dilated convolutional block consisting of the Residual stack + + +def conv_block(input, conv_dim, upsampling_factor): + """Dilated Convolutional Block with weight normalization. + + Args: + conv_dim: int, determines filter size for the block. + upsampling_factor: int, scale for upsampling. + + Returns: + Dilated convolution block. + """ + conv_t = addon_layers.WeightNormalization( + layers.Conv1DTranspose(conv_dim, 16, upsampling_factor, padding="same"), + data_init=False, + )(input) + lrelu1 = layers.LeakyReLU()(conv_t) + res_stack = residual_stack(lrelu1, conv_dim) + lrelu2 = layers.LeakyReLU()(res_stack) + return lrelu2 + + +""" +The discriminator block consists of convolutions and downsampling layers. This block is +essential for the implementation of the feature matching technique. + +Each discriminator outputs a list of feature maps that will be compared during training +to compute the feature matching loss. 
+""" + + +def discriminator_block(input): + conv1 = addon_layers.WeightNormalization( + layers.Conv1D(16, 15, 1, "same"), data_init=False + )(input) + lrelu1 = layers.LeakyReLU()(conv1) + conv2 = addon_layers.WeightNormalization( + layers.Conv1D(64, 41, 4, "same", groups=4), data_init=False + )(lrelu1) + lrelu2 = layers.LeakyReLU()(conv2) + conv3 = addon_layers.WeightNormalization( + layers.Conv1D(256, 41, 4, "same", groups=16), data_init=False + )(lrelu2) + lrelu3 = layers.LeakyReLU()(conv3) + conv4 = addon_layers.WeightNormalization( + layers.Conv1D(1024, 41, 4, "same", groups=64), data_init=False + )(lrelu3) + lrelu4 = layers.LeakyReLU()(conv4) + conv5 = addon_layers.WeightNormalization( + layers.Conv1D(1024, 41, 4, "same", groups=256), data_init=False + )(lrelu4) + lrelu5 = layers.LeakyReLU()(conv5) + conv6 = addon_layers.WeightNormalization( + layers.Conv1D(1024, 5, 1, "same"), data_init=False + )(lrelu5) + lrelu6 = layers.LeakyReLU()(conv6) + conv7 = addon_layers.WeightNormalization( + layers.Conv1D(1, 3, 1, "same"), data_init=False + )(lrelu6) + return [lrelu1, lrelu2, lrelu3, lrelu4, lrelu5, lrelu6, conv7] + + +""" +### Create the generator +""" + + +def create_generator(input_shape): + inp = keras.Input(input_shape) + x = MelSpec()(inp) + x = layers.Conv1D(512, 7, padding="same")(x) + x = layers.LeakyReLU()(x) + x = conv_block(x, 256, 8) + x = conv_block(x, 128, 8) + x = conv_block(x, 64, 2) + x = conv_block(x, 32, 2) + x = addon_layers.WeightNormalization( + layers.Conv1D(1, 7, padding="same", activation="tanh") + )(x) + return keras.Model(inp, x) + + +# We use a dynamic input shape for the generator since the model is fully convolutional +generator = create_generator((None, 1)) +generator.summary() + +""" +### Create the discriminator +""" + + +def create_discriminator(input_shape): + inp = keras.Input(input_shape) + out_map1 = discriminator_block(inp) + pool1 = layers.AveragePooling1D()(inp) + out_map2 = discriminator_block(pool1) + pool2 = layers.AveragePooling1D()(pool1) + out_map3 = discriminator_block(pool2) + return keras.Model(inp, [out_map1, out_map2, out_map3]) + + +# We use a dynamic input shape for the discriminator +# This is done because the input shape for the generator is unknown +discriminator = create_discriminator((None, 1)) + +discriminator.summary() + +""" +## Defining the loss functions + +**Generator Loss** + +The generator architecture uses a combination of two losses + +1. Mean Squared Error: + +This is the standard MSE generator loss calculated between ones and the outputs from the +discriminator with _N_ layers. + +
$$\mathcal{L}_{\text{gen}} = \frac{1}{K} \sum_{k=1}^{K} \mathbb{E}\big[(D_k(G(s)) - 1)^2\big]$$

where $D_k$ is the $k$-th discriminator scale and $G(s)$ is the generated waveform.
2. Feature Matching Loss:

This loss involves extracting the outputs of every layer of the discriminator, for both
the generated audio and the ground truth, and comparing each intermediate layer output
using the Mean Absolute Error.
$$\mathcal{L}_{\text{FM}} = \frac{1}{K} \sum_{k=1}^{K} \sum_{i} \mathbb{E}\big[\lVert D_k^{(i)}(x) - D_k^{(i)}(G(s)) \rVert_1\big]$$

where $D_k^{(i)}$ denotes the $i$-th feature map of the $k$-th discriminator scale.
**Discriminator Loss**

The discriminator loss uses the Mean Squared Error, comparing the real data predictions
with ones and the generated predictions with zeros.
$$\mathcal{L}_{\text{disc}} = \frac{1}{K} \sum_{k=1}^{K} \mathbb{E}\big[(D_k(x) - 1)^2 + D_k(G(s))^2\big]$$
+""" + +# Generator loss + + +def generator_loss(real_pred, fake_pred): + """Loss function for the generator. + + Args: + real_pred: Tensor, output of the ground truth wave passed through the discriminator. + fake_pred: Tensor, output of the generator prediction passed through the discriminator. + + Returns: + Loss for the generator. + """ + gen_loss = [] + for i in range(len(fake_pred)): + gen_loss.append(mse(tf.ones_like(fake_pred[i][-1]), fake_pred[i][-1])) + + return tf.reduce_mean(gen_loss) + + +def feature_matching_loss(real_pred, fake_pred): + """Implements the feature matching loss. + + Args: + real_pred: Tensor, output of the ground truth wave passed through the discriminator. + fake_pred: Tensor, output of the generator prediction passed through the discriminator. + + Returns: + Feature Matching Loss. + """ + fm_loss = [] + for i in range(len(fake_pred)): + for j in range(len(fake_pred[i]) - 1): + fm_loss.append(mae(real_pred[i][j], fake_pred[i][j])) + + return tf.reduce_mean(fm_loss) + + +def discriminator_loss(real_pred, fake_pred): + """Implements the discriminator loss. + + Args: + real_pred: Tensor, output of the ground truth wave passed through the discriminator. + fake_pred: Tensor, output of the generator prediction passed through the discriminator. + + Returns: + Discriminator Loss. + """ + real_loss, fake_loss = [], [] + for i in range(len(real_pred)): + real_loss.append(mse(tf.ones_like(real_pred[i][-1]), real_pred[i][-1])) + fake_loss.append(mse(tf.zeros_like(fake_pred[i][-1]), fake_pred[i][-1])) + + # Calculating the final discriminator loss after scaling + disc_loss = tf.reduce_mean(real_loss) + tf.reduce_mean(fake_loss) + return disc_loss + + +""" +Defining the MelGAN model for training. +This subclass overrides the `train_step()` method to implement the training logic. +""" + + +class MelGAN(keras.Model): + def __init__(self, generator, discriminator, **kwargs): + """MelGAN trainer class + + Args: + generator: keras.Model, Generator model + discriminator: keras.Model, Discriminator model + """ + super().__init__(**kwargs) + self.generator = generator + self.discriminator = discriminator + + def compile( + self, + gen_optimizer, + disc_optimizer, + generator_loss, + feature_matching_loss, + discriminator_loss, + ): + """MelGAN compile method. 
+ + Args: + gen_optimizer: keras.optimizer, optimizer to be used for training + disc_optimizer: keras.optimizer, optimizer to be used for training + generator_loss: callable, loss function for generator + feature_matching_loss: callable, loss function for feature matching + discriminator_loss: callable, loss function for discriminator + """ + super().compile() + + # Optimizers + self.gen_optimizer = gen_optimizer + self.disc_optimizer = disc_optimizer + + # Losses + self.generator_loss = generator_loss + self.feature_matching_loss = feature_matching_loss + self.discriminator_loss = discriminator_loss + + # Trackers + self.gen_loss_tracker = keras.metrics.Mean(name="gen_loss") + self.disc_loss_tracker = keras.metrics.Mean(name="disc_loss") + + def train_step(self, batch): + x_batch_train, y_batch_train = batch + + with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape: + # Generating the audio wave + gen_audio_wave = generator(x_batch_train, training=True) + + # Generating the features using the discriminator + real_pred = discriminator(y_batch_train) + fake_pred = discriminator(gen_audio_wave) + + # Calculating the generator losses + gen_loss = generator_loss(real_pred, fake_pred) + fm_loss = feature_matching_loss(real_pred, fake_pred) + + # Calculating final generator loss + gen_fm_loss = gen_loss + 10 * fm_loss + + # Calculating the discriminator losses + disc_loss = discriminator_loss(real_pred, fake_pred) + + # Calculating and applying the gradients for generator and discriminator + grads_gen = gen_tape.gradient(gen_fm_loss, generator.trainable_weights) + grads_disc = disc_tape.gradient(disc_loss, discriminator.trainable_weights) + gen_optimizer.apply_gradients(zip(grads_gen, generator.trainable_weights)) + disc_optimizer.apply_gradients(zip(grads_disc, discriminator.trainable_weights)) + + self.gen_loss_tracker.update_state(gen_fm_loss) + self.disc_loss_tracker.update_state(disc_loss) + + return { + "gen_loss": self.gen_loss_tracker.result(), + "disc_loss": self.disc_loss_tracker.result(), + } + + +""" +## Training + +The paper suggests that the training with dynamic shapes takes around 400,000 steps (~500 +epochs). For this example, we will run it only for a single epoch (819 steps). +Longer training time (greater than 300 epochs) will almost certainly provide better results. +""" + +gen_optimizer = keras.optimizers.Adam( + LEARNING_RATE_GEN, beta_1=0.5, beta_2=0.9, clipnorm=1 +) +disc_optimizer = keras.optimizers.Adam( + LEARNING_RATE_DISC, beta_1=0.5, beta_2=0.9, clipnorm=1 +) + +# Start training +generator = create_generator((None, 1)) +discriminator = create_discriminator((None, 1)) + +mel_gan = MelGAN(generator, discriminator) +mel_gan.compile( + gen_optimizer, + disc_optimizer, + generator_loss, + feature_matching_loss, + discriminator_loss, +) +mel_gan.fit( + train_dataset.shuffle(200).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE), epochs=1 +) + +""" +## Testing the model + +The trained model can now be used for real time text-to-speech translation tasks. +To test how fast the MelGAN inference can be, let us take a sample audio mel-spectrogram +and convert it. Note that the actual model pipeline will not include the `MelSpec` layer +and hence this layer will be disabled during inference. The inference input will be a +mel-spectrogram processed similar to the `MelSpec` layer configuration. + +For testing this, we will create a randomly uniformly distributed tensor to simulate the +behavior of the inference pipeline. 
+""" + +# Sampling a random tensor to mimic a batch of 128 spectrograms of shape [50, 80] +audio_sample = tf.random.uniform([128, 50, 80]) + +""" +Timing the inference speed of a single sample. Running this, you can see that the average +inference time per spectrogram ranges from 8 milliseconds to 10 milliseconds on a K80 GPU which is +pretty fast. +""" +pred = generator.predict(audio_sample, batch_size=32, verbose=1) +""" +## Conclusion + +The MelGAN is a highly effective architecture for spectral inversion that has a Mean +Opinion Score (MOS) of 3.61 that considerably outperforms the Griffin +Lim algorithm having a MOS of just 1.57. In contrast with this, the MelGAN compares with +the state-of-the-art WaveGlow and WaveNet architectures on text-to-speech and speech +enhancement tasks on +the LJSpeech and VCTK datasets [1]. + +This tutorial highlights: + +1. The advantages of using dilated convolutions that grow with the filter size +2. Implementation of a custom layer for on-the-fly conversion of audio waves to +mel-spectrograms +3. Effectiveness of using the feature matching loss function for training GAN generators. + +Further reading + +1. [MelGAN paper](https://arxiv.org/abs/1910.06711) (Kundan Kumar et al.) to +understand the reasoning behind the architecture and training process +2. For in-depth understanding of the feature matching loss, you can refer to [Improved +Techniques for Training GANs](https://arxiv.org/abs/1606.03498) (Tim Salimans et +al.). +""" diff --git a/knowledge_base/audio/speaker_recognition_using_cnn.py b/knowledge_base/audio/speaker_recognition_using_cnn.py new file mode 100644 index 0000000000000000000000000000000000000000..3b95f280846528604d0839a86e9b6aaada0ffe8e --- /dev/null +++ b/knowledge_base/audio/speaker_recognition_using_cnn.py @@ -0,0 +1,489 @@ +""" +Title: Speaker Recognition +Author: [Fadi Badine](https://twitter.com/fadibadine) +Date created: 14/06/2020 +Last modified: 19/07/2023 +Description: Classify speakers using Fast Fourier Transform (FFT) and a 1D Convnet. +Accelerator: GPU +Converted to Keras 3 by: [Fadi Badine](https://twitter.com/fadibadine) +""" + +""" +## Introduction + +This example demonstrates how to create a model to classify speakers from the +frequency domain representation of speech recordings, obtained via Fast Fourier +Transform (FFT). + +It shows the following: + +- How to use `tf.data` to load, preprocess and feed audio streams into a model +- How to create a 1D convolutional network with residual +connections for audio classification. + +Our process: + +- We prepare a dataset of speech samples from different speakers, with the speaker as label. +- We add background noise to these samples to augment our data. +- We take the FFT of these samples. +- We train a 1D convnet to predict the correct speaker given a noisy FFT speech sample. + +Note: + +- This example should be run with TensorFlow 2.3 or higher, or `tf-nightly`. +- The noise samples in the dataset need to be resampled to a sampling rate of 16000 Hz +before using the code in this example. In order to do this, you will need to have +installed `ffmpg`. 
+""" + +""" +## Setup +""" + +import os + +os.environ["KERAS_BACKEND"] = "tensorflow" + +import shutil +import numpy as np + +import tensorflow as tf +import keras + +from pathlib import Path +from IPython.display import display, Audio + +# Get the data from https://www.kaggle.com/kongaevans/speaker-recognition-dataset/ +# and save it to ./speaker-recognition-dataset.zip +# then unzip it to ./16000_pcm_speeches +"""shell +kaggle datasets download -d kongaevans/speaker-recognition-dataset +unzip -qq speaker-recognition-dataset.zip +""" + +DATASET_ROOT = "16000_pcm_speeches" + +# The folders in which we will put the audio samples and the noise samples +AUDIO_SUBFOLDER = "audio" +NOISE_SUBFOLDER = "noise" + +DATASET_AUDIO_PATH = os.path.join(DATASET_ROOT, AUDIO_SUBFOLDER) +DATASET_NOISE_PATH = os.path.join(DATASET_ROOT, NOISE_SUBFOLDER) + +# Percentage of samples to use for validation +VALID_SPLIT = 0.1 + +# Seed to use when shuffling the dataset and the noise +SHUFFLE_SEED = 43 + +# The sampling rate to use. +# This is the one used in all the audio samples. +# We will resample all the noise to this sampling rate. +# This will also be the output size of the audio wave samples +# (since all samples are of 1 second long) +SAMPLING_RATE = 16000 + +# The factor to multiply the noise with according to: +# noisy_sample = sample + noise * prop * scale +# where prop = sample_amplitude / noise_amplitude +SCALE = 0.5 + +BATCH_SIZE = 128 +EPOCHS = 1 # For a real training run, use EPOCHS = 100 + + +""" +## Data preparation + +The dataset is composed of 7 folders, divided into 2 groups: + +- Speech samples, with 5 folders for 5 different speakers. Each folder contains +1500 audio files, each 1 second long and sampled at 16000 Hz. +- Background noise samples, with 2 folders and a total of 6 files. These files +are longer than 1 second (and originally not sampled at 16000 Hz, but we will resample them to 16000 Hz). +We will use those 6 files to create 354 1-second-long noise samples to be used for training. 
+ +Let's sort these 2 categories into 2 folders: + +- An `audio` folder which will contain all the per-speaker speech sample folders +- A `noise` folder which will contain all the noise samples +""" + +""" +Before sorting the audio and noise categories into 2 folders, +we have the following directory structure: + +``` +main_directory/ +...speaker_a/ +...speaker_b/ +...speaker_c/ +...speaker_d/ +...speaker_e/ +...other/ +..._background_noise_/ +``` + +After sorting, we end up with the following structure: + +``` +main_directory/ +...audio/ +......speaker_a/ +......speaker_b/ +......speaker_c/ +......speaker_d/ +......speaker_e/ +...noise/ +......other/ +......_background_noise_/ +``` +""" + +for folder in os.listdir(DATASET_ROOT): + if os.path.isdir(os.path.join(DATASET_ROOT, folder)): + if folder in [AUDIO_SUBFOLDER, NOISE_SUBFOLDER]: + # If folder is `audio` or `noise`, do nothing + continue + elif folder in ["other", "_background_noise_"]: + # If folder is one of the folders that contains noise samples, + # move it to the `noise` folder + shutil.move( + os.path.join(DATASET_ROOT, folder), + os.path.join(DATASET_NOISE_PATH, folder), + ) + else: + # Otherwise, it should be a speaker folder, then move it to + # `audio` folder + shutil.move( + os.path.join(DATASET_ROOT, folder), + os.path.join(DATASET_AUDIO_PATH, folder), + ) + +""" +## Noise preparation + +In this section: + +- We load all noise samples (which should have been resampled to 16000) +- We split those noise samples to chunks of 16000 samples which +correspond to 1 second duration each +""" + +# Get the list of all noise files +noise_paths = [] +for subdir in os.listdir(DATASET_NOISE_PATH): + subdir_path = Path(DATASET_NOISE_PATH) / subdir + if os.path.isdir(subdir_path): + noise_paths += [ + os.path.join(subdir_path, filepath) + for filepath in os.listdir(subdir_path) + if filepath.endswith(".wav") + ] +if not noise_paths: + raise RuntimeError(f"Could not find any files at {DATASET_NOISE_PATH}") +print( + "Found {} files belonging to {} directories".format( + len(noise_paths), len(os.listdir(DATASET_NOISE_PATH)) + ) +) + +""" +Resample all noise samples to 16000 Hz +""" + +command = ( + "for dir in `ls -1 " + DATASET_NOISE_PATH + "`; do " + "for file in `ls -1 " + DATASET_NOISE_PATH + "/$dir/*.wav`; do " + "sample_rate=`ffprobe -hide_banner -loglevel panic -show_streams " + "$file | grep sample_rate | cut -f2 -d=`; " + "if [ $sample_rate -ne 16000 ]; then " + "ffmpeg -hide_banner -loglevel panic -y " + "-i $file -ar 16000 temp.wav; " + "mv temp.wav $file; " + "fi; done; done" +) +os.system(command) + + +# Split noise into chunks of 16,000 steps each +def load_noise_sample(path): + sample, sampling_rate = tf.audio.decode_wav( + tf.io.read_file(path), desired_channels=1 + ) + if sampling_rate == SAMPLING_RATE: + # Number of slices of 16000 each that can be generated from the noise sample + slices = int(sample.shape[0] / SAMPLING_RATE) + sample = tf.split(sample[: slices * SAMPLING_RATE], slices) + return sample + else: + print("Sampling rate for {} is incorrect. Ignoring it".format(path)) + return None + + +noises = [] +for path in noise_paths: + sample = load_noise_sample(path) + if sample: + noises.extend(sample) +noises = tf.stack(noises) + +print( + "{} noise files were split into {} noise samples where each is {} sec. 
long".format( + len(noise_paths), noises.shape[0], noises.shape[1] // SAMPLING_RATE + ) +) + +""" +## Dataset generation +""" + + +def paths_and_labels_to_dataset(audio_paths, labels): + """Constructs a dataset of audios and labels.""" + path_ds = tf.data.Dataset.from_tensor_slices(audio_paths) + audio_ds = path_ds.map( + lambda x: path_to_audio(x), num_parallel_calls=tf.data.AUTOTUNE + ) + label_ds = tf.data.Dataset.from_tensor_slices(labels) + return tf.data.Dataset.zip((audio_ds, label_ds)) + + +def path_to_audio(path): + """Reads and decodes an audio file.""" + audio = tf.io.read_file(path) + audio, _ = tf.audio.decode_wav(audio, 1, SAMPLING_RATE) + return audio + + +def add_noise(audio, noises=None, scale=0.5): + if noises is not None: + # Create a random tensor of the same size as audio ranging from + # 0 to the number of noise stream samples that we have. + tf_rnd = tf.random.uniform( + (tf.shape(audio)[0],), 0, noises.shape[0], dtype=tf.int32 + ) + noise = tf.gather(noises, tf_rnd, axis=0) + + # Get the amplitude proportion between the audio and the noise + prop = tf.math.reduce_max(audio, axis=1) / tf.math.reduce_max(noise, axis=1) + prop = tf.repeat(tf.expand_dims(prop, axis=1), tf.shape(audio)[1], axis=1) + + # Adding the rescaled noise to audio + audio = audio + noise * prop * scale + + return audio + + +def audio_to_fft(audio): + # Since tf.signal.fft applies FFT on the innermost dimension, + # we need to squeeze the dimensions and then expand them again + # after FFT + audio = tf.squeeze(audio, axis=-1) + fft = tf.signal.fft( + tf.cast(tf.complex(real=audio, imag=tf.zeros_like(audio)), tf.complex64) + ) + fft = tf.expand_dims(fft, axis=-1) + + # Return the absolute value of the first half of the FFT + # which represents the positive frequencies + return tf.math.abs(fft[:, : (audio.shape[1] // 2), :]) + + +# Get the list of audio file paths along with their corresponding labels + +class_names = os.listdir(DATASET_AUDIO_PATH) +print( + "Our class names: {}".format( + class_names, + ) +) + +audio_paths = [] +labels = [] +for label, name in enumerate(class_names): + print( + "Processing speaker {}".format( + name, + ) + ) + dir_path = Path(DATASET_AUDIO_PATH) / name + speaker_sample_paths = [ + os.path.join(dir_path, filepath) + for filepath in os.listdir(dir_path) + if filepath.endswith(".wav") + ] + audio_paths += speaker_sample_paths + labels += [label] * len(speaker_sample_paths) + +print( + "Found {} files belonging to {} classes.".format(len(audio_paths), len(class_names)) +) + +# Shuffle +rng = np.random.RandomState(SHUFFLE_SEED) +rng.shuffle(audio_paths) +rng = np.random.RandomState(SHUFFLE_SEED) +rng.shuffle(labels) + +# Split into training and validation +num_val_samples = int(VALID_SPLIT * len(audio_paths)) +print("Using {} files for training.".format(len(audio_paths) - num_val_samples)) +train_audio_paths = audio_paths[:-num_val_samples] +train_labels = labels[:-num_val_samples] + +print("Using {} files for validation.".format(num_val_samples)) +valid_audio_paths = audio_paths[-num_val_samples:] +valid_labels = labels[-num_val_samples:] + +# Create 2 datasets, one for training and the other for validation +train_ds = paths_and_labels_to_dataset(train_audio_paths, train_labels) +train_ds = train_ds.shuffle(buffer_size=BATCH_SIZE * 8, seed=SHUFFLE_SEED).batch( + BATCH_SIZE +) + +valid_ds = paths_and_labels_to_dataset(valid_audio_paths, valid_labels) +valid_ds = valid_ds.shuffle(buffer_size=32 * 8, seed=SHUFFLE_SEED).batch(32) + + +# Add noise to the training set 
+train_ds = train_ds.map( + lambda x, y: (add_noise(x, noises, scale=SCALE), y), + num_parallel_calls=tf.data.AUTOTUNE, +) + +# Transform audio wave to the frequency domain using `audio_to_fft` +train_ds = train_ds.map( + lambda x, y: (audio_to_fft(x), y), num_parallel_calls=tf.data.AUTOTUNE +) +train_ds = train_ds.prefetch(tf.data.AUTOTUNE) + +valid_ds = valid_ds.map( + lambda x, y: (audio_to_fft(x), y), num_parallel_calls=tf.data.AUTOTUNE +) +valid_ds = valid_ds.prefetch(tf.data.AUTOTUNE) + +""" +## Model Definition +""" + + +def residual_block(x, filters, conv_num=3, activation="relu"): + # Shortcut + s = keras.layers.Conv1D(filters, 1, padding="same")(x) + for i in range(conv_num - 1): + x = keras.layers.Conv1D(filters, 3, padding="same")(x) + x = keras.layers.Activation(activation)(x) + x = keras.layers.Conv1D(filters, 3, padding="same")(x) + x = keras.layers.Add()([x, s]) + x = keras.layers.Activation(activation)(x) + return keras.layers.MaxPool1D(pool_size=2, strides=2)(x) + + +def build_model(input_shape, num_classes): + inputs = keras.layers.Input(shape=input_shape, name="input") + + x = residual_block(inputs, 16, 2) + x = residual_block(x, 32, 2) + x = residual_block(x, 64, 3) + x = residual_block(x, 128, 3) + x = residual_block(x, 128, 3) + + x = keras.layers.AveragePooling1D(pool_size=3, strides=3)(x) + x = keras.layers.Flatten()(x) + x = keras.layers.Dense(256, activation="relu")(x) + x = keras.layers.Dense(128, activation="relu")(x) + + outputs = keras.layers.Dense(num_classes, activation="softmax", name="output")(x) + + return keras.models.Model(inputs=inputs, outputs=outputs) + + +model = build_model((SAMPLING_RATE // 2, 1), len(class_names)) + +model.summary() + +# Compile the model using Adam's default learning rate +model.compile( + optimizer="Adam", + loss="sparse_categorical_crossentropy", + metrics=["accuracy"], +) + +# Add callbacks: +# 'EarlyStopping' to stop training when the model is not enhancing anymore +# 'ModelCheckPoint' to always keep the model that has the best val_accuracy +model_save_filename = "model.keras" + +earlystopping_cb = keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True) +mdlcheckpoint_cb = keras.callbacks.ModelCheckpoint( + model_save_filename, monitor="val_accuracy", save_best_only=True +) + +""" +## Training +""" + +history = model.fit( + train_ds, + epochs=EPOCHS, + validation_data=valid_ds, + callbacks=[earlystopping_cb, mdlcheckpoint_cb], +) + +""" +## Evaluation +""" + +print(model.evaluate(valid_ds)) + +""" +We get ~ 98% validation accuracy. 
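+
+If you want to see where the remaining errors come from, one optional check is a
+confusion matrix over the validation set. A minimal sketch, assuming the `model`,
+`valid_ds` and `class_names` defined above:
+
+```python
+y_true, y_prob = [], []
+for x, y in valid_ds:
+    y_true.append(y.numpy())
+    y_prob.append(model.predict(x, verbose=0))
+y_true = np.concatenate(y_true)
+y_pred = np.argmax(np.concatenate(y_prob), axis=-1)
+print(tf.math.confusion_matrix(y_true, y_pred, num_classes=len(class_names)))
+```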
+""" + +""" +## Demonstration + +Let's take some samples and: + +- Predict the speaker +- Compare the prediction with the real speaker +- Listen to the audio to see that despite the samples being noisy, +the model is still pretty accurate +""" + +SAMPLES_TO_DISPLAY = 10 + +test_ds = paths_and_labels_to_dataset(valid_audio_paths, valid_labels) +test_ds = test_ds.shuffle(buffer_size=BATCH_SIZE * 8, seed=SHUFFLE_SEED).batch( + BATCH_SIZE +) + +test_ds = test_ds.map( + lambda x, y: (add_noise(x, noises, scale=SCALE), y), + num_parallel_calls=tf.data.AUTOTUNE, +) + +for audios, labels in test_ds.take(1): + # Get the signal FFT + ffts = audio_to_fft(audios) + # Predict + y_pred = model.predict(ffts) + # Take random samples + rnd = np.random.randint(0, BATCH_SIZE, SAMPLES_TO_DISPLAY) + audios = audios.numpy()[rnd, :, :] + labels = labels.numpy()[rnd] + y_pred = np.argmax(y_pred, axis=-1)[rnd] + + for index in range(SAMPLES_TO_DISPLAY): + # For every sample, print the true and predicted label + # as well as run the voice with the noise + print( + "Speaker:\33{} {}\33[0m\tPredicted:\33{} {}\33[0m".format( + "[92m" if labels[index] == y_pred[index] else "[91m", + class_names[labels[index]], + "[92m" if labels[index] == y_pred[index] else "[91m", + class_names[y_pred[index]], + ) + ) + display(Audio(audios[index, :, :].squeeze(), rate=SAMPLING_RATE)) diff --git a/knowledge_base/audio/stft.py b/knowledge_base/audio/stft.py new file mode 100644 index 0000000000000000000000000000000000000000..b6f74a9ad908dff08ccd34287340c8a4f7900a3e --- /dev/null +++ b/knowledge_base/audio/stft.py @@ -0,0 +1,409 @@ +""" +Title: Audio Classification with the STFTSpectrogram layer +Author: [Mostafa M. Amin](https://mostafa-amin.com) +Date created: 2024/10/04 +Last modified: 2024/10/04 +Description: Introducing the `STFTSpectrogram` layer to extract spectrograms for audio classification. +Accelerator: GPU +""" + +""" +## Introduction + +Preprocessing audio as spectrograms is an essential step in the vast majority +of audio-based applications. Spectrograms represent the frequency content of a +signal over time, are widely used for this purpose. In this tutorial, we'll +demonstrate how to use the `STFTSpectrogram` layer in Keras to convert raw +audio waveforms into spectrograms **within the model**. We'll then feed +these spectrograms into an LSTM network followed by Dense layers to perform +audio classification on the Speech Commands dataset. + +We will: + +- Load the ESC-10 dataset. +- Preprocess the raw audio waveforms and generate spectrograms using + `STFTSpectrogram`. +- Build two models, one using spectrograms as 1D signals and the other is using + as images (2D signals) with a pretrained image model. +- Train and evaluate the models. + +## Setup + +### Importing the necessary libraries +""" + +import os + +os.environ["KERAS_BACKEND"] = "jax" + +import keras +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import scipy.io.wavfile +from keras import layers +from scipy.signal import resample + +keras.utils.set_random_seed(41) + +""" +### Define some variables +""" + +BASE_DATA_DIR = "./datasets/esc-50_extracted/ESC-50-master/" +BATCH_SIZE = 16 +NUM_CLASSES = 10 +EPOCHS = 200 +SAMPLE_RATE = 16000 + +""" +## Download and Preprocess the ESC-10 Dataset + +We'll use the Dataset for Environmental Sound Classification dataset (ESC-10). +This dataset consists of five-second .wav files of environmental sounds. 
+ +### Download and Extract the dataset +""" + +keras.utils.get_file( + "esc-50.zip", + "https://github.com/karoldvl/ESC-50/archive/master.zip", + cache_dir="./", + cache_subdir="datasets", + extract=True, +) + +""" +### Read the CSV file +""" + +pd_data = pd.read_csv(os.path.join(BASE_DATA_DIR, "meta", "esc50.csv")) +# filter ESC-50 to ESC-10 and reassign the targets +pd_data = pd_data[pd_data["esc10"]] +targets = sorted(pd_data["target"].unique().tolist()) +assert len(targets) == NUM_CLASSES +old_target_to_new_target = {old: new for new, old in enumerate(targets)} +pd_data["target"] = pd_data["target"].map(lambda t: old_target_to_new_target[t]) +pd_data + +""" +### Define functions to read and preprocess the WAV files +""" + + +def read_wav_file(path, target_sr=SAMPLE_RATE): + sr, wav = scipy.io.wavfile.read(os.path.join(BASE_DATA_DIR, "audio", path)) + wav = wav.astype(np.float32) / 32768.0 # normalize to [-1, 1] + num_samples = int(len(wav) * target_sr / sr) # resample to 16 kHz + wav = resample(wav, num_samples) + return wav[:, None] # Add a channel dimension (of size 1) + + +""" +Create a function that uses the `STFTSpectrogram` to compute a spectrogram, +then plots it. +""" + + +def plot_single_spectrogram(sample_wav_data): + spectrogram = layers.STFTSpectrogram( + mode="log", + frame_length=SAMPLE_RATE * 20 // 1000, + frame_step=SAMPLE_RATE * 5 // 1000, + fft_length=1024, + trainable=False, + )(sample_wav_data[None, ...])[0, ...] + + # Plot the spectrogram + plt.imshow(spectrogram.T, origin="lower") + plt.title("Single Channel Spectrogram") + plt.xlabel("Time") + plt.ylabel("Frequency") + plt.show() + + +""" +Create a function that uses the `STFTSpectrogram` to compute three +spectrograms with multiple bandwidths, then aligns them as an image +with different channels, to get a multi-bandwith spectrogram, +then plots the spectrogram. +""" + + +def plot_multi_bandwidth_spectrogram(sample_wav_data): + # All spectrograms must use the same `fft_length`, `frame_step`, and + # `padding="same"` in order to produce spectrograms with identical shapes, + # hence aligning them together. `expand_dims` ensures that the shapes are + # compatible with image models. + + spectrograms = np.concatenate( + [ + layers.STFTSpectrogram( + mode="log", + frame_length=SAMPLE_RATE * x // 1000, + frame_step=SAMPLE_RATE * 5 // 1000, + fft_length=1024, + padding="same", + expand_dims=True, + )(sample_wav_data[None, ...])[0, ...] + for x in [5, 10, 20] + ], + axis=-1, + ).transpose([1, 0, 2]) + + # normalize each color channel for better viewing + mn = spectrograms.min(axis=(0, 1), keepdims=True) + mx = spectrograms.max(axis=(0, 1), keepdims=True) + spectrograms = (spectrograms - mn) / (mx - mn) + + plt.imshow(spectrograms, origin="lower") + plt.title("Multi-bandwidth Spectrogram") + plt.xlabel("Time") + plt.ylabel("Frequency") + plt.show() + + +""" +Demonstrate a sample wav file. 
+""" + +sample_wav_data = read_wav_file(pd_data["filename"].tolist()[52]) +plt.plot(sample_wav_data[:, 0]) +plt.show() + +""" +Plot a Spectrogram +""" + +plot_single_spectrogram(sample_wav_data) + +""" +Plot a multi-bandwidth spectrogram +""" + +plot_multi_bandwidth_spectrogram(sample_wav_data) + +""" +### Define functions to construct a TF Dataset +""" + + +def read_dataset(df, folds): + msk = df["fold"].isin(folds) + filenames = df["filename"][msk] + targets = df["target"][msk].values + waves = np.array([read_wav_file(fil) for fil in filenames], dtype=np.float32) + return waves, targets + + +""" +### Create the datasets +""" + +train_x, train_y = read_dataset(pd_data, [1, 2, 3]) +valid_x, valid_y = read_dataset(pd_data, [4]) +test_x, test_y = read_dataset(pd_data, [5]) + +""" +## Training the Models + +In this tutorial we demonstrate the different usecases of the `STFTSpectrogram` +layer. + +The first model will use a non-trainable `STFTSpectrogram` layer, so it is +intended purely for preprocessing. Additionally, the model will use 1D signals, +hence it make use of Conv1D layers. + +The second model will use a trainable `STFTSpectrogram` layer with the +`expand_dims` option, which expands the shapes to be compatible with image +models. + +### Create the 1D model + +1. Create a non-trainable spectrograms, extracting a 1D time signal. +2. Apply `Conv1D` layers with `LayerNormalization` simialar to the + classic VGG design. +4. Apply global maximum pooling to have fixed set of features. +5. Add `Dense` layers to make the final predictions based on the features. +""" + +model1d = keras.Sequential( + [ + layers.InputLayer((None, 1)), + layers.STFTSpectrogram( + mode="log", + frame_length=SAMPLE_RATE * 40 // 1000, + frame_step=SAMPLE_RATE * 15 // 1000, + trainable=False, + ), + layers.Conv1D(64, 64, activation="relu"), + layers.Conv1D(128, 16, activation="relu"), + layers.LayerNormalization(), + layers.MaxPooling1D(4), + layers.Conv1D(128, 8, activation="relu"), + layers.Conv1D(256, 8, activation="relu"), + layers.Conv1D(512, 4, activation="relu"), + layers.LayerNormalization(), + layers.Dropout(0.5), + layers.GlobalMaxPooling1D(), + layers.Dense(256, activation="relu"), + layers.Dense(256, activation="relu"), + layers.Dropout(0.5), + layers.Dense(NUM_CLASSES, activation="softmax"), + ], + name="model_1d_non_trainble_stft", +) +model1d.compile( + optimizer=keras.optimizers.Adam(1e-5), + loss="sparse_categorical_crossentropy", + metrics=["accuracy"], +) +model1d.summary() + +""" +Train the model and restore the best weights. +""" + +history_model1d = model1d.fit( + train_x, + train_y, + batch_size=BATCH_SIZE, + validation_data=(valid_x, valid_y), + epochs=EPOCHS, + callbacks=[ + keras.callbacks.EarlyStopping( + monitor="val_loss", + patience=EPOCHS, + restore_best_weights=True, + ) + ], +) + +""" +### Create the 2D model + +1. Create three spectrograms with multiple band-widths from the raw input. +2. Concatenate the three spectrograms to have three channels. +3. Load `MobileNet` and set the weights from the weights trained on `ImageNet`. +4. Apply global maximum pooling to have fixed set of features. +5. Add `Dense` layers to make the final predictions based on the features. 
+""" + +input = layers.Input((None, 1)) +spectrograms = [ + layers.STFTSpectrogram( + mode="log", + frame_length=SAMPLE_RATE * frame_size // 1000, + frame_step=SAMPLE_RATE * 15 // 1000, + fft_length=2048, + padding="same", + expand_dims=True, + # trainable=True, # trainable by default + )(input) + for frame_size in [30, 40, 50] # frame size in milliseconds +] + +multi_spectrograms = layers.Concatenate(axis=-1)(spectrograms) + +img_model = keras.applications.MobileNet(include_top=False, pooling="max") +output = img_model(multi_spectrograms) + +output = layers.Dropout(0.5)(output) +output = layers.Dense(256, activation="relu")(output) +output = layers.Dense(256, activation="relu")(output) +output = layers.Dense(NUM_CLASSES, activation="softmax")(output) +model2d = keras.Model(input, output, name="model_2d_trainble_stft") + +model2d.compile( + optimizer=keras.optimizers.Adam(1e-4), + loss="sparse_categorical_crossentropy", + metrics=["accuracy"], +) +model2d.summary() + +""" +Train the model and restore the best weights. +""" + +history_model2d = model2d.fit( + train_x, + train_y, + batch_size=BATCH_SIZE, + validation_data=(valid_x, valid_y), + epochs=EPOCHS, + callbacks=[ + keras.callbacks.EarlyStopping( + monitor="val_loss", + patience=EPOCHS, + restore_best_weights=True, + ) + ], +) + +""" +### Plot Training History +""" + +epochs_range = range(EPOCHS) + +plt.figure(figsize=(14, 5)) +plt.subplot(1, 2, 1) +plt.plot( + epochs_range, + history_model1d.history["accuracy"], + label="Training Accuracy,1D model with non-trainable STFT", +) +plt.plot( + epochs_range, + history_model1d.history["val_accuracy"], + label="Validation Accuracy, 1D model with non-trainable STFT", +) +plt.plot( + epochs_range, + history_model2d.history["accuracy"], + label="Training Accuracy, 2D model with trainable STFT", +) +plt.plot( + epochs_range, + history_model2d.history["val_accuracy"], + label="Validation Accuracy, 2D model with trainable STFT", +) +plt.legend(loc="lower right") +plt.title("Training and Validation Accuracy") + +plt.subplot(1, 2, 2) +plt.plot( + epochs_range, + history_model1d.history["loss"], + label="Training Loss,1D model with non-trainable STFT", +) +plt.plot( + epochs_range, + history_model1d.history["val_loss"], + label="Validation Loss, 1D model with non-trainable STFT", +) +plt.plot( + epochs_range, + history_model2d.history["loss"], + label="Training Loss, 2D model with trainable STFT", +) +plt.plot( + epochs_range, + history_model2d.history["val_loss"], + label="Validation Loss, 2D model with trainable STFT", +) +plt.legend(loc="upper right") +plt.title("Training and Validation Loss") +plt.show() + +""" +### Evaluate on Test Data + +Running the models on the test set. +""" + +_, test_acc = model1d.evaluate(test_x, test_y) +print(f"1D model wit non-trainable STFT -> Test Accuracy: {test_acc * 100:.2f}%") + +_, test_acc = model2d.evaluate(test_x, test_y) +print(f"2D model with trainable STFT -> Test Accuracy: {test_acc * 100:.2f}%") diff --git a/knowledge_base/audio/transformer_asr.py b/knowledge_base/audio/transformer_asr.py new file mode 100644 index 0000000000000000000000000000000000000000..b661c73c83abdf1b0c5d734b7e386405511d315c --- /dev/null +++ b/knowledge_base/audio/transformer_asr.py @@ -0,0 +1,542 @@ +""" +Title: Automatic Speech Recognition with Transformer +Author: [Apoorv Nandan](https://twitter.com/NandanApoorv) +Date created: 2021/01/13 +Last modified: 2021/01/13 +Description: Training a sequence-to-sequence Transformer for automatic speech recognition. 
+Accelerator: GPU +""" + +""" +## Introduction + +Automatic speech recognition (ASR) consists of transcribing audio speech segments into text. +ASR can be treated as a sequence-to-sequence problem, where the +audio can be represented as a sequence of feature vectors +and the text as a sequence of characters, words, or subword tokens. + +For this demonstration, we will use the LJSpeech dataset from the +[LibriVox](https://librivox.org/) project. It consists of short +audio clips of a single speaker reading passages from 7 non-fiction books. +Our model will be similar to the original Transformer (both encoder and decoder) +as proposed in the paper, "Attention is All You Need". + + +**References:** + +- [Attention is All You Need](https://papers.nips.cc/paper/2017/file/3f5ee243547dee91fbd053c1c4a845aa-Paper.pdf) +- [Very Deep Self-Attention Networks for End-to-End Speech Recognition](https://arxiv.org/abs/1904.13377) +- [Speech Transformers](https://ieeexplore.ieee.org/document/8462506) +- [LJSpeech Dataset](https://keithito.com/LJ-Speech-Dataset/) +""" + +import re +import os + +os.environ["KERAS_BACKEND"] = "tensorflow" + +from glob import glob +import tensorflow as tf +import keras +from keras import layers + + +""" +## Define the Transformer Input Layer + +When processing past target tokens for the decoder, we compute the sum of +position embeddings and token embeddings. + +When processing audio features, we apply convolutional layers to downsample +them (via convolution strides) and process local relationships. +""" + + +class TokenEmbedding(layers.Layer): + def __init__(self, num_vocab=1000, maxlen=100, num_hid=64): + super().__init__() + self.emb = keras.layers.Embedding(num_vocab, num_hid) + self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=num_hid) + + def call(self, x): + maxlen = tf.shape(x)[-1] + x = self.emb(x) + positions = tf.range(start=0, limit=maxlen, delta=1) + positions = self.pos_emb(positions) + return x + positions + + +class SpeechFeatureEmbedding(layers.Layer): + def __init__(self, num_hid=64, maxlen=100): + super().__init__() + self.conv1 = keras.layers.Conv1D( + num_hid, 11, strides=2, padding="same", activation="relu" + ) + self.conv2 = keras.layers.Conv1D( + num_hid, 11, strides=2, padding="same", activation="relu" + ) + self.conv3 = keras.layers.Conv1D( + num_hid, 11, strides=2, padding="same", activation="relu" + ) + + def call(self, x): + x = self.conv1(x) + x = self.conv2(x) + return self.conv3(x) + + +""" +## Transformer Encoder Layer +""" + + +class TransformerEncoder(layers.Layer): + def __init__(self, embed_dim, num_heads, feed_forward_dim, rate=0.1): + super().__init__() + self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim) + self.ffn = keras.Sequential( + [ + layers.Dense(feed_forward_dim, activation="relu"), + layers.Dense(embed_dim), + ] + ) + self.layernorm1 = layers.LayerNormalization(epsilon=1e-6) + self.layernorm2 = layers.LayerNormalization(epsilon=1e-6) + self.dropout1 = layers.Dropout(rate) + self.dropout2 = layers.Dropout(rate) + + def call(self, inputs, training=False): + attn_output = self.att(inputs, inputs) + attn_output = self.dropout1(attn_output, training=training) + out1 = self.layernorm1(inputs + attn_output) + ffn_output = self.ffn(out1) + ffn_output = self.dropout2(ffn_output, training=training) + return self.layernorm2(out1 + ffn_output) + + +""" +## Transformer Decoder Layer +""" + + +class TransformerDecoder(layers.Layer): + def __init__(self, embed_dim, num_heads, feed_forward_dim, 
dropout_rate=0.1): + super().__init__() + self.layernorm1 = layers.LayerNormalization(epsilon=1e-6) + self.layernorm2 = layers.LayerNormalization(epsilon=1e-6) + self.layernorm3 = layers.LayerNormalization(epsilon=1e-6) + self.self_att = layers.MultiHeadAttention( + num_heads=num_heads, key_dim=embed_dim + ) + self.enc_att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim) + self.self_dropout = layers.Dropout(0.5) + self.enc_dropout = layers.Dropout(0.1) + self.ffn_dropout = layers.Dropout(0.1) + self.ffn = keras.Sequential( + [ + layers.Dense(feed_forward_dim, activation="relu"), + layers.Dense(embed_dim), + ] + ) + + def causal_attention_mask(self, batch_size, n_dest, n_src, dtype): + """Masks the upper half of the dot product matrix in self attention. + + This prevents flow of information from future tokens to current token. + 1's in the lower triangle, counting from the lower right corner. + """ + i = tf.range(n_dest)[:, None] + j = tf.range(n_src) + m = i >= j - n_src + n_dest + mask = tf.cast(m, dtype) + mask = tf.reshape(mask, [1, n_dest, n_src]) + mult = tf.concat( + [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)], 0 + ) + return tf.tile(mask, mult) + + def call(self, enc_out, target): + input_shape = tf.shape(target) + batch_size = input_shape[0] + seq_len = input_shape[1] + causal_mask = self.causal_attention_mask(batch_size, seq_len, seq_len, tf.bool) + target_att = self.self_att(target, target, attention_mask=causal_mask) + target_norm = self.layernorm1(target + self.self_dropout(target_att)) + enc_out = self.enc_att(target_norm, enc_out) + enc_out_norm = self.layernorm2(self.enc_dropout(enc_out) + target_norm) + ffn_out = self.ffn(enc_out_norm) + ffn_out_norm = self.layernorm3(enc_out_norm + self.ffn_dropout(ffn_out)) + return ffn_out_norm + + +""" +## Complete the Transformer model + +Our model takes audio spectrograms as inputs and predicts a sequence of characters. +During training, we give the decoder the target character sequence shifted to the left +as input. During inference, the decoder uses its own past predictions to predict the +next token. 
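+
+Concretely, the decoder input and the prediction target used by `train_step` below
+are two offset views of the same padded target sequence, for example:
+
+```
+target:      <  h  e  l  l  o  >  pad  pad
+dec_input:   <  h  e  l  l  o  >  pad          # target[:, :-1]
+dec_target:  h  e  l  l  o  >  pad  pad        # target[:, 1:]
+```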
+""" + + +class Transformer(keras.Model): + def __init__( + self, + num_hid=64, + num_head=2, + num_feed_forward=128, + source_maxlen=100, + target_maxlen=100, + num_layers_enc=4, + num_layers_dec=1, + num_classes=10, + ): + super().__init__() + self.loss_metric = keras.metrics.Mean(name="loss") + self.num_layers_enc = num_layers_enc + self.num_layers_dec = num_layers_dec + self.target_maxlen = target_maxlen + self.num_classes = num_classes + + self.enc_input = SpeechFeatureEmbedding(num_hid=num_hid, maxlen=source_maxlen) + self.dec_input = TokenEmbedding( + num_vocab=num_classes, maxlen=target_maxlen, num_hid=num_hid + ) + + self.encoder = keras.Sequential( + [self.enc_input] + + [ + TransformerEncoder(num_hid, num_head, num_feed_forward) + for _ in range(num_layers_enc) + ] + ) + + for i in range(num_layers_dec): + setattr( + self, + f"dec_layer_{i}", + TransformerDecoder(num_hid, num_head, num_feed_forward), + ) + + self.classifier = layers.Dense(num_classes) + + def decode(self, enc_out, target): + y = self.dec_input(target) + for i in range(self.num_layers_dec): + y = getattr(self, f"dec_layer_{i}")(enc_out, y) + return y + + def call(self, inputs): + source = inputs[0] + target = inputs[1] + x = self.encoder(source) + y = self.decode(x, target) + return self.classifier(y) + + @property + def metrics(self): + return [self.loss_metric] + + def train_step(self, batch): + """Processes one batch inside model.fit().""" + source = batch["source"] + target = batch["target"] + dec_input = target[:, :-1] + dec_target = target[:, 1:] + with tf.GradientTape() as tape: + preds = self([source, dec_input]) + one_hot = tf.one_hot(dec_target, depth=self.num_classes) + mask = tf.math.logical_not(tf.math.equal(dec_target, 0)) + loss = self.compute_loss(None, one_hot, preds, sample_weight=mask) + trainable_vars = self.trainable_variables + gradients = tape.gradient(loss, trainable_vars) + self.optimizer.apply_gradients(zip(gradients, trainable_vars)) + self.loss_metric.update_state(loss) + return {"loss": self.loss_metric.result()} + + def test_step(self, batch): + source = batch["source"] + target = batch["target"] + dec_input = target[:, :-1] + dec_target = target[:, 1:] + preds = self([source, dec_input]) + one_hot = tf.one_hot(dec_target, depth=self.num_classes) + mask = tf.math.logical_not(tf.math.equal(dec_target, 0)) + loss = self.compute_loss(None, one_hot, preds, sample_weight=mask) + self.loss_metric.update_state(loss) + return {"loss": self.loss_metric.result()} + + def generate(self, source, target_start_token_idx): + """Performs inference over one batch of inputs using greedy decoding.""" + bs = tf.shape(source)[0] + enc = self.encoder(source) + dec_input = tf.ones((bs, 1), dtype=tf.int32) * target_start_token_idx + dec_logits = [] + for i in range(self.target_maxlen - 1): + dec_out = self.decode(enc, dec_input) + logits = self.classifier(dec_out) + logits = tf.argmax(logits, axis=-1, output_type=tf.int32) + last_logit = tf.expand_dims(logits[:, -1], axis=-1) + dec_logits.append(last_logit) + dec_input = tf.concat([dec_input, last_logit], axis=-1) + return dec_input + + +""" +## Download the dataset + +Note: This requires ~3.6 GB of disk space and +takes ~5 minutes for the extraction of files. 
+""" + +pattern_wav_name = re.compile(r"([^/\\\.]+)") + +keras.utils.get_file( + os.path.join(os.getcwd(), "data.tar.gz"), + "https://data.keithito.com/data/speech/LJSpeech-1.1.tar.bz2", + extract=True, + archive_format="tar", + cache_dir=".", +) + + +saveto = "./datasets/LJSpeech-1.1" +wavs = glob("{}/**/*.wav".format(saveto), recursive=True) + +id_to_text = {} +with open(os.path.join(saveto, "metadata.csv"), encoding="utf-8") as f: + for line in f: + id = line.strip().split("|")[0] + text = line.strip().split("|")[2] + id_to_text[id] = text + + +def get_data(wavs, id_to_text, maxlen=50): + """returns mapping of audio paths and transcription texts""" + data = [] + for w in wavs: + id = pattern_wav_name.split(w)[-4] + if len(id_to_text[id]) < maxlen: + data.append({"audio": w, "text": id_to_text[id]}) + return data + + +""" +## Preprocess the dataset +""" + + +class VectorizeChar: + def __init__(self, max_len=50): + self.vocab = ( + ["-", "#", "<", ">"] + + [chr(i + 96) for i in range(1, 27)] + + [" ", ".", ",", "?"] + ) + self.max_len = max_len + self.char_to_idx = {} + for i, ch in enumerate(self.vocab): + self.char_to_idx[ch] = i + + def __call__(self, text): + text = text.lower() + text = text[: self.max_len - 2] + text = "<" + text + ">" + pad_len = self.max_len - len(text) + return [self.char_to_idx.get(ch, 1) for ch in text] + [0] * pad_len + + def get_vocabulary(self): + return self.vocab + + +max_target_len = 200 # all transcripts in out data are < 200 characters +data = get_data(wavs, id_to_text, max_target_len) +vectorizer = VectorizeChar(max_target_len) +print("vocab size", len(vectorizer.get_vocabulary())) + + +def create_text_ds(data): + texts = [_["text"] for _ in data] + text_ds = [vectorizer(t) for t in texts] + text_ds = tf.data.Dataset.from_tensor_slices(text_ds) + return text_ds + + +def path_to_audio(path): + # spectrogram using stft + audio = tf.io.read_file(path) + audio, _ = tf.audio.decode_wav(audio, 1) + audio = tf.squeeze(audio, axis=-1) + stfts = tf.signal.stft(audio, frame_length=200, frame_step=80, fft_length=256) + x = tf.math.pow(tf.abs(stfts), 0.5) + # normalisation + means = tf.math.reduce_mean(x, 1, keepdims=True) + stddevs = tf.math.reduce_std(x, 1, keepdims=True) + x = (x - means) / stddevs + audio_len = tf.shape(x)[0] + # padding to 10 seconds + pad_len = 2754 + paddings = tf.constant([[0, pad_len], [0, 0]]) + x = tf.pad(x, paddings, "CONSTANT")[:pad_len, :] + return x + + +def create_audio_ds(data): + flist = [_["audio"] for _ in data] + audio_ds = tf.data.Dataset.from_tensor_slices(flist) + audio_ds = audio_ds.map(path_to_audio, num_parallel_calls=tf.data.AUTOTUNE) + return audio_ds + + +def create_tf_dataset(data, bs=4): + audio_ds = create_audio_ds(data) + text_ds = create_text_ds(data) + ds = tf.data.Dataset.zip((audio_ds, text_ds)) + ds = ds.map(lambda x, y: {"source": x, "target": y}) + ds = ds.batch(bs) + ds = ds.prefetch(tf.data.AUTOTUNE) + return ds + + +split = int(len(data) * 0.99) +train_data = data[:split] +test_data = data[split:] +ds = create_tf_dataset(train_data, bs=64) +val_ds = create_tf_dataset(test_data, bs=4) + +""" +## Callbacks to display predictions +""" + + +class DisplayOutputs(keras.callbacks.Callback): + def __init__( + self, batch, idx_to_token, target_start_token_idx=27, target_end_token_idx=28 + ): + """Displays a batch of outputs after every epoch + + Args: + batch: A test batch containing the keys "source" and "target" + idx_to_token: A List containing the vocabulary tokens corresponding to their indices + 
target_start_token_idx: A start token index in the target vocabulary + target_end_token_idx: An end token index in the target vocabulary + """ + self.batch = batch + self.target_start_token_idx = target_start_token_idx + self.target_end_token_idx = target_end_token_idx + self.idx_to_char = idx_to_token + + def on_epoch_end(self, epoch, logs=None): + if epoch % 5 != 0: + return + source = self.batch["source"] + target = self.batch["target"].numpy() + bs = tf.shape(source)[0] + preds = self.model.generate(source, self.target_start_token_idx) + preds = preds.numpy() + for i in range(bs): + target_text = "".join([self.idx_to_char[_] for _ in target[i, :]]) + prediction = "" + for idx in preds[i, :]: + prediction += self.idx_to_char[idx] + if idx == self.target_end_token_idx: + break + print(f"target: {target_text.replace('-','')}") + print(f"prediction: {prediction}\n") + + +""" +## Learning rate schedule +""" + + +class CustomSchedule(keras.optimizers.schedules.LearningRateSchedule): + def __init__( + self, + init_lr=0.00001, + lr_after_warmup=0.001, + final_lr=0.00001, + warmup_epochs=15, + decay_epochs=85, + steps_per_epoch=203, + ): + super().__init__() + self.init_lr = init_lr + self.lr_after_warmup = lr_after_warmup + self.final_lr = final_lr + self.warmup_epochs = warmup_epochs + self.decay_epochs = decay_epochs + self.steps_per_epoch = steps_per_epoch + + def calculate_lr(self, epoch): + """linear warm up - linear decay""" + warmup_lr = ( + self.init_lr + + ((self.lr_after_warmup - self.init_lr) / (self.warmup_epochs - 1)) * epoch + ) + decay_lr = tf.math.maximum( + self.final_lr, + self.lr_after_warmup + - (epoch - self.warmup_epochs) + * (self.lr_after_warmup - self.final_lr) + / self.decay_epochs, + ) + return tf.math.minimum(warmup_lr, decay_lr) + + def __call__(self, step): + epoch = step // self.steps_per_epoch + epoch = tf.cast(epoch, "float32") + return self.calculate_lr(epoch) + + +""" +## Create & train the end-to-end model +""" + +batch = next(iter(val_ds)) + +# The vocabulary to convert predicted indices into characters +idx_to_char = vectorizer.get_vocabulary() +display_cb = DisplayOutputs( + batch, idx_to_char, target_start_token_idx=2, target_end_token_idx=3 +) # set the arguments as per vocabulary index for '<' and '>' + +model = Transformer( + num_hid=200, + num_head=2, + num_feed_forward=400, + target_maxlen=max_target_len, + num_layers_enc=4, + num_layers_dec=1, + num_classes=34, +) +loss_fn = keras.losses.CategoricalCrossentropy( + from_logits=True, + label_smoothing=0.1, +) + +learning_rate = CustomSchedule( + init_lr=0.00001, + lr_after_warmup=0.001, + final_lr=0.00001, + warmup_epochs=15, + decay_epochs=85, + steps_per_epoch=len(ds), +) +optimizer = keras.optimizers.Adam(learning_rate) +model.compile(optimizer=optimizer, loss=loss_fn) + +history = model.fit(ds, validation_data=val_ds, callbacks=[display_cb], epochs=1) + +""" +In practice, you should train for around 100 epochs or more. 
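+
+Once the model is trained, transcription is a single call to the greedy `generate`
+method defined earlier (this is essentially what the `DisplayOutputs` callback prints
+every few epochs). A minimal sketch, reusing the validation `batch` and the
+`idx_to_char` vocabulary from above, where 2 and 3 are the indices of the `<` start
+and `>` end tokens:
+
+```python
+preds = model.generate(batch["source"], target_start_token_idx=2).numpy()
+for seq in preds[:3]:  # decode the first few predictions
+    text = ""
+    for idx in seq:
+        text += idx_to_char[idx]
+        if idx == 3:  # stop at the ">" end token
+            break
+    print(text)
+```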
+ +Some of the predicted text at or around epoch 35 may look as follows: +``` +target: +prediction: + +target: +prediction: +``` +""" diff --git a/knowledge_base/audio/uk_ireland_accent_recognition.py b/knowledge_base/audio/uk_ireland_accent_recognition.py new file mode 100644 index 0000000000000000000000000000000000000000..283b0a4edc671efd23a8c3eda42e82f3f1c33b1b --- /dev/null +++ b/knowledge_base/audio/uk_ireland_accent_recognition.py @@ -0,0 +1,696 @@ +""" +Title: English speaker accent recognition using Transfer Learning +Author: [Fadi Badine](https://twitter.com/fadibadine) +Date created: 2022/04/16 +Last modified: 2022/04/16 +Description: Training a model to classify UK & Ireland accents using feature extraction from Yamnet. +Accelerator: GPU +""" + +""" +## Introduction + +The following example shows how to use feature extraction in order to +train a model to classify the English accent spoken in an audio wave. + +Instead of training a model from scratch, transfer learning enables us to +take advantage of existing state-of-the-art deep learning models and use them as feature extractors. + +Our process: + +* Use a TF Hub pre-trained model (Yamnet) and apply it as part of the tf.data pipeline which transforms +the audio files into feature vectors. +* Train a dense model on the feature vectors. +* Use the trained model for inference on a new audio file. + +Note: + +* We need to install TensorFlow IO in order to resample audio files to 16 kHz as required by Yamnet model. +* In the test section, ffmpeg is used to convert the mp3 file to wav. + +You can install TensorFlow IO with the following command: +""" + +"""shell +pip install -U -q tensorflow_io +""" + +""" +## Configuration +""" + +SEED = 1337 +EPOCHS = 100 +BATCH_SIZE = 64 +VALIDATION_RATIO = 0.1 +MODEL_NAME = "uk_irish_accent_recognition" + +# Location where the dataset will be downloaded. +# By default (None), keras.utils.get_file will use ~/.keras/ as the CACHE_DIR +CACHE_DIR = None + +# The location of the dataset +URL_PATH = "https://www.openslr.org/resources/83/" + +# List of datasets compressed files that contain the audio files +zip_files = { + 0: "irish_english_male.zip", + 1: "midlands_english_female.zip", + 2: "midlands_english_male.zip", + 3: "northern_english_female.zip", + 4: "northern_english_male.zip", + 5: "scottish_english_female.zip", + 6: "scottish_english_male.zip", + 7: "southern_english_female.zip", + 8: "southern_english_male.zip", + 9: "welsh_english_female.zip", + 10: "welsh_english_male.zip", +} + +# We see that there are 2 compressed files for each accent (except Irish): +# - One for male speakers +# - One for female speakers +# However, we will be using a gender agnostic dataset. 
+ +# List of gender agnostic categories +gender_agnostic_categories = [ + "ir", # Irish + "mi", # Midlands + "no", # Northern + "sc", # Scottish + "so", # Southern + "we", # Welsh +] + +class_names = [ + "Irish", + "Midlands", + "Northern", + "Scottish", + "Southern", + "Welsh", + "Not a speech", +] + +""" +## Imports +""" + +import os +import io +import csv +import numpy as np +import pandas as pd +import tensorflow as tf +import tensorflow_hub as hub +import tensorflow_io as tfio +from tensorflow import keras +import matplotlib.pyplot as plt +import seaborn as sns +from scipy import stats +from IPython.display import Audio + + +# Set all random seeds in order to get reproducible results +keras.utils.set_random_seed(SEED) + +# Where to download the dataset +DATASET_DESTINATION = os.path.join(CACHE_DIR if CACHE_DIR else "~/.keras/", "datasets") + +""" +## Yamnet Model + +Yamnet is an audio event classifier trained on the AudioSet dataset to predict audio +events from the AudioSet ontology. It is available on TensorFlow Hub. + +Yamnet accepts a 1-D tensor of audio samples with a sample rate of 16 kHz. +As output, the model returns a 3-tuple: + +* Scores of shape `(N, 521)` representing the scores of the 521 classes. +* Embeddings of shape `(N, 1024)`. +* The log-mel spectrogram of the entire audio frame. + +We will use the embeddings, which are the features extracted from the audio samples, as the input to our dense model. + +For more detailed information about Yamnet, please refer to its [TensorFlow Hub](https://tfhub.dev/google/yamnet/1) page. +""" + +yamnet_model = hub.load("https://tfhub.dev/google/yamnet/1") + +""" +## Dataset + +The dataset used is the +[Crowdsourced high-quality UK and Ireland English Dialect speech data set](https://openslr.org/83/) +which consists of a total of 17,877 high-quality audio wav files. + +This dataset includes over 31 hours of recording from 120 volunteers who self-identify as +native speakers of Southern England, Midlands, Northern England, Wales, Scotland and Ireland. + +For more info, please refer to the above link or to the following paper: +[Open-source Multi-speaker Corpora of the English Accents in the British Isles](https://aclanthology.org/2020.lrec-1.804.pdf) +""" + +""" +## Download the data +""" + +# CSV file that contains information about the dataset. For each entry, we have: +# - ID +# - wav file name +# - transcript +line_index_file = keras.utils.get_file( + fname="line_index_file", origin=URL_PATH + "line_index_all.csv" +) + +# Download the list of compressed files that contain the audio wav files +for i in zip_files: + fname = zip_files[i].split(".")[0] + url = URL_PATH + zip_files[i] + + zip_file = keras.utils.get_file(fname=fname, origin=url, extract=True) + os.remove(zip_file) + +""" +## Load the data in a Dataframe + +Of the 3 columns (ID, filename and transcript), we are only interested in the filename column in order to read the audio file. +We will ignore the other two. +""" + +dataframe = pd.read_csv( + line_index_file, names=["id", "filename", "transcript"], usecols=["filename"] +) +dataframe.head() + +""" +Let's now preprocess the dataset by: + +* Adjusting the filename (removing a leading space & adding ".wav" extension to the +filename). +* Creating a label using the first 2 characters of the filename which indicate the +accent. +* Shuffling the samples. 
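+
+For example, since the labels simply follow the order of `gender_agnostic_categories`:
+
+```python
+print(gender_agnostic_categories.index("ir"))  # 0 -> Irish
+print(gender_agnostic_categories.index("we"))  # 5 -> Welsh
+```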
+""" + + +# The purpose of this function is to preprocess the dataframe by applying the following: +# - Cleaning the filename from a leading space +# - Generating a label column that is gender agnostic i.e. +# welsh english male and welsh english female for example are both labeled as +# welsh english +# - Add extension .wav to the filename +# - Shuffle samples +def preprocess_dataframe(dataframe): + # Remove leading space in filename column + dataframe["filename"] = dataframe.apply(lambda row: row["filename"].strip(), axis=1) + + # Create gender agnostic labels based on the filename first 2 letters + dataframe["label"] = dataframe.apply( + lambda row: gender_agnostic_categories.index(row["filename"][:2]), axis=1 + ) + + # Add the file path to the name + dataframe["filename"] = dataframe.apply( + lambda row: os.path.join(DATASET_DESTINATION, row["filename"] + ".wav"), axis=1 + ) + + # Shuffle the samples + dataframe = dataframe.sample(frac=1, random_state=SEED).reset_index(drop=True) + + return dataframe + + +dataframe = preprocess_dataframe(dataframe) +dataframe.head() + +""" +## Prepare training & validation sets + +Let's split the samples creating training and validation sets. +""" + +split = int(len(dataframe) * (1 - VALIDATION_RATIO)) +train_df = dataframe[:split] +valid_df = dataframe[split:] + +print( + f"We have {train_df.shape[0]} training samples & {valid_df.shape[0]} validation ones" +) + +""" +## Prepare a TensorFlow Dataset + +Next, we need to create a `tf.data.Dataset`. +This is done by creating a `dataframe_to_dataset` function that does the following: + +* Create a dataset using filenames and labels. +* Get the Yamnet embeddings by calling another function `filepath_to_embeddings`. +* Apply caching, reshuffling and setting batch size. + +The `filepath_to_embeddings` does the following: + +* Load audio file. +* Resample audio to 16 kHz. +* Generate scores and embeddings from Yamnet model. +* Since Yamnet generates multiple samples for each audio file, +this function also duplicates the label for all the generated samples +that have `score=0` (speech) whereas sets the label for the others as +'other' indicating that this audio segment is not a speech and we won't label it as one of the accents. + +The below `load_16k_audio_file` is copied from the following tutorial +[Transfer learning with YAMNet for environmental sound classification](https://www.tensorflow.org/tutorials/audio/transfer_learning_audio) +""" + + +@tf.function +def load_16k_audio_wav(filename): + # Read file content + file_content = tf.io.read_file(filename) + + # Decode audio wave + audio_wav, sample_rate = tf.audio.decode_wav(file_content, desired_channels=1) + audio_wav = tf.squeeze(audio_wav, axis=-1) + sample_rate = tf.cast(sample_rate, dtype=tf.int64) + + # Resample to 16k + audio_wav = tfio.audio.resample(audio_wav, rate_in=sample_rate, rate_out=16000) + + return audio_wav + + +def filepath_to_embeddings(filename, label): + # Load 16k audio wave + audio_wav = load_16k_audio_wav(filename) + + # Get audio embeddings & scores. 
+ # The embeddings are the audio features extracted using transfer learning + # while scores will be used to identify time slots that are not speech + # which will then be gathered into a specific new category 'other' + scores, embeddings, _ = yamnet_model(audio_wav) + + # Number of embeddings in order to know how many times to repeat the label + embeddings_num = tf.shape(embeddings)[0] + labels = tf.repeat(label, embeddings_num) + + # Change labels for time-slots that are not speech into a new category 'other' + labels = tf.where(tf.argmax(scores, axis=1) == 0, label, len(class_names) - 1) + + # Using one-hot in order to use AUC + return (embeddings, tf.one_hot(labels, len(class_names))) + + +def dataframe_to_dataset(dataframe, batch_size=64): + dataset = tf.data.Dataset.from_tensor_slices( + (dataframe["filename"], dataframe["label"]) + ) + + dataset = dataset.map( + lambda x, y: filepath_to_embeddings(x, y), + num_parallel_calls=tf.data.experimental.AUTOTUNE, + ).unbatch() + + return dataset.cache().batch(batch_size).prefetch(tf.data.AUTOTUNE) + + +train_ds = dataframe_to_dataset(train_df) +valid_ds = dataframe_to_dataset(valid_df) + +""" +## Build the model + +The model that we use consists of: + +* An input layer which is the embedding output of the Yamnet classifier. +* 4 dense hidden layers and 4 dropout layers. +* An output dense layer. + +The model's hyperparameters were selected using +[KerasTuner](https://keras.io/keras_tuner/). +""" + +keras.backend.clear_session() + + +def build_and_compile_model(): + inputs = keras.layers.Input(shape=(1024), name="embedding") + + x = keras.layers.Dense(256, activation="relu", name="dense_1")(inputs) + x = keras.layers.Dropout(0.15, name="dropout_1")(x) + + x = keras.layers.Dense(384, activation="relu", name="dense_2")(x) + x = keras.layers.Dropout(0.2, name="dropout_2")(x) + + x = keras.layers.Dense(192, activation="relu", name="dense_3")(x) + x = keras.layers.Dropout(0.25, name="dropout_3")(x) + + x = keras.layers.Dense(384, activation="relu", name="dense_4")(x) + x = keras.layers.Dropout(0.2, name="dropout_4")(x) + + outputs = keras.layers.Dense(len(class_names), activation="softmax", name="ouput")( + x + ) + + model = keras.Model(inputs=inputs, outputs=outputs, name="accent_recognition") + + model.compile( + optimizer=keras.optimizers.Adam(learning_rate=1.9644e-5), + loss=keras.losses.CategoricalCrossentropy(), + metrics=["accuracy", keras.metrics.AUC(name="auc")], + ) + + return model + + +model = build_and_compile_model() +model.summary() + +""" +## Class weights calculation + +Since the dataset is quite unbalanced, we will use `class_weight` argument during training. + +Getting the class weights is a little tricky because even though we know the number of +audio files for each class, it does not represent the number of samples for that class +since Yamnet transforms each audio file into multiple audio samples of 0.96 seconds each. +So every audio file will be split into a number of samples that is proportional to its length. + +Therefore, to get those weights, we have to calculate the number of samples for each class +after preprocessing through Yamnet. 
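+
+Each class weight is then the ratio of the total number of samples to the number of
+samples in that class, so under-represented classes receive proportionally larger
+weights. With made-up counts:
+
+```python
+counts = [8000, 2000]  # made-up per-class sample counts, for illustration only
+total = sum(counts)
+print({i: total / c for i, c in enumerate(counts)})  # {0: 1.25, 1: 5.0}
+```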
+""" + +class_counts = tf.zeros(shape=(len(class_names),), dtype=tf.int32) + +for x, y in iter(train_ds): + class_counts = class_counts + tf.math.bincount( + tf.cast(tf.math.argmax(y, axis=1), tf.int32), minlength=len(class_names) + ) + +class_weight = { + i: tf.math.reduce_sum(class_counts).numpy() / class_counts[i].numpy() + for i in range(len(class_counts)) +} + +print(class_weight) + +""" +## Callbacks + +We use Keras callbacks in order to: + +* Stop whenever the validation AUC stops improving. +* Save the best model. +* Call TensorBoard in order to later view the training and validation logs. +""" + +early_stopping_cb = keras.callbacks.EarlyStopping( + monitor="val_auc", patience=10, restore_best_weights=True +) + +model_checkpoint_cb = keras.callbacks.ModelCheckpoint( + MODEL_NAME + ".h5", monitor="val_auc", save_best_only=True +) + +tensorboard_cb = keras.callbacks.TensorBoard( + os.path.join(os.curdir, "logs", model.name) +) + +callbacks = [early_stopping_cb, model_checkpoint_cb, tensorboard_cb] + +""" +## Training +""" + +history = model.fit( + train_ds, + epochs=EPOCHS, + validation_data=valid_ds, + class_weight=class_weight, + callbacks=callbacks, + verbose=2, +) + +""" +## Results + +Let's plot the training and validation AUC and accuracy. +""" + +fig, axs = plt.subplots(nrows=1, ncols=2, figsize=(14, 5)) + +axs[0].plot(range(EPOCHS), history.history["accuracy"], label="Training") +axs[0].plot(range(EPOCHS), history.history["val_accuracy"], label="Validation") +axs[0].set_xlabel("Epochs") +axs[0].set_title("Training & Validation Accuracy") +axs[0].legend() +axs[0].grid(True) + +axs[1].plot(range(EPOCHS), history.history["auc"], label="Training") +axs[1].plot(range(EPOCHS), history.history["val_auc"], label="Validation") +axs[1].set_xlabel("Epochs") +axs[1].set_title("Training & Validation AUC") +axs[1].legend() +axs[1].grid(True) + +plt.show() + +""" +## Evaluation +""" + +train_loss, train_acc, train_auc = model.evaluate(train_ds) +valid_loss, valid_acc, valid_auc = model.evaluate(valid_ds) + +""" +Let's try to compare our model's performance to Yamnet's using one of Yamnet metrics (d-prime) +Yamnet achieved a d-prime value of 2.318. +Let's check our model's performance. +""" + + +# The following function calculates the d-prime score from the AUC +def d_prime(auc): + standard_normal = stats.norm() + d_prime = standard_normal.ppf(auc) * np.sqrt(2.0) + return d_prime + + +print( + "train d-prime: {0:.3f}, validation d-prime: {1:.3f}".format( + d_prime(train_auc), d_prime(valid_auc) + ) +) + +""" +We can see that the model achieves the following results: + +Results | Training | Validation +-----------|-----------|------------ +Accuracy | 54% | 51% +AUC | 0.91 | 0.89 +d-prime | 1.882 | 1.740 + +""" + +""" +## Confusion Matrix + +Let's now plot the confusion matrix for the validation dataset. + +The confusion matrix lets us see, for every class, not only how many samples were correctly classified, +but also which other classes were the samples confused with. + +It allows us to calculate the precision and recall for every class. 
+""" + +# Create x and y tensors +x_valid = None +y_valid = None + +for x, y in iter(valid_ds): + if x_valid is None: + x_valid = x.numpy() + y_valid = y.numpy() + else: + x_valid = np.concatenate((x_valid, x.numpy()), axis=0) + y_valid = np.concatenate((y_valid, y.numpy()), axis=0) + +# Generate predictions +y_pred = model.predict(x_valid) + +# Calculate confusion matrix +confusion_mtx = tf.math.confusion_matrix( + np.argmax(y_valid, axis=1), np.argmax(y_pred, axis=1) +) + +# Plot the confusion matrix +plt.figure(figsize=(10, 8)) +sns.heatmap( + confusion_mtx, xticklabels=class_names, yticklabels=class_names, annot=True, fmt="g" +) +plt.xlabel("Prediction") +plt.ylabel("Label") +plt.title("Validation Confusion Matrix") +plt.show() + +""" +## Precision & recall + +For every class: + +* Recall is the ratio of correctly classified samples i.e. it shows how many samples +of this specific class, the model is able to detect. +It is the ratio of diagonal elements to the sum of all elements in the row. +* Precision shows the accuracy of the classifier. It is the ratio of correctly predicted +samples among the ones classified as belonging to this class. +It is the ratio of diagonal elements to the sum of all elements in the column. +""" + +for i, label in enumerate(class_names): + precision = confusion_mtx[i, i] / np.sum(confusion_mtx[:, i]) + recall = confusion_mtx[i, i] / np.sum(confusion_mtx[i, :]) + print( + "{0:15} Precision:{1:.2f}%; Recall:{2:.2f}%".format( + label, precision * 100, recall * 100 + ) + ) + +""" +## Run inference on test data + +Let's now run a test on a single audio file. +Let's check this example from [The Scottish Voice](https://www.thescottishvoice.org.uk/home/) + +We will: + +* Download the mp3 file. +* Convert it to a 16k wav file. +* Run the model on the wav file. +* Plot the results. +""" + +filename = "audio-sample-Stuart" +url = "https://www.thescottishvoice.org.uk/files/cm/files/" + +if os.path.exists(filename + ".wav") == False: + print(f"Downloading {filename}.mp3 from {url}") + command = f"wget {url}{filename}.mp3" + os.system(command) + + print(f"Converting mp3 to wav and resampling to 16 kHZ") + command = ( + f"ffmpeg -hide_banner -loglevel panic -y -i {filename}.mp3 -acodec " + f"pcm_s16le -ac 1 -ar 16000 {filename}.wav" + ) + os.system(command) + +filename = filename + ".wav" + + +""" +The below function `yamnet_class_names_from_csv` was copied and very slightly changed +from this [Yamnet Notebook](https://colab.research.google.com/github/tensorflow/hub/blob/master/examples/colab/yamnet.ipynb). +""" + + +def yamnet_class_names_from_csv(yamnet_class_map_csv_text): + """Returns list of class names corresponding to score vector.""" + yamnet_class_map_csv = io.StringIO(yamnet_class_map_csv_text) + yamnet_class_names = [ + name for (class_index, mid, name) in csv.reader(yamnet_class_map_csv) + ] + yamnet_class_names = yamnet_class_names[1:] # Skip CSV header + return yamnet_class_names + + +yamnet_class_map_path = yamnet_model.class_map_path().numpy() +yamnet_class_names = yamnet_class_names_from_csv( + tf.io.read_file(yamnet_class_map_path).numpy().decode("utf-8") +) + + +def calculate_number_of_non_speech(scores): + number_of_non_speech = tf.math.reduce_sum( + tf.where(tf.math.argmax(scores, axis=1, output_type=tf.int32) != 0, 1, 0) + ) + + return number_of_non_speech + + +def filename_to_predictions(filename): + # Load 16k audio wave + audio_wav = load_16k_audio_wav(filename) + + # Get audio embeddings & scores. 
+ scores, embeddings, mel_spectrogram = yamnet_model(audio_wav) + + print( + "Out of {} samples, {} are not speech".format( + scores.shape[0], calculate_number_of_non_speech(scores) + ) + ) + + # Predict the output of the accent recognition model with embeddings as input + predictions = model.predict(embeddings) + + return audio_wav, predictions, mel_spectrogram + + +""" +Let's run the model on the audio file: +""" + +audio_wav, predictions, mel_spectrogram = filename_to_predictions(filename) + +infered_class = class_names[predictions.mean(axis=0).argmax()] +print(f"The main accent is: {infered_class} English") + +""" +Listen to the audio +""" + +Audio(audio_wav, rate=16000) + +""" +The below function was copied from this [Yamnet notebook](tinyurl.com/4a8xn7at) and adjusted to our need. + +This function plots the following: + +* Audio waveform +* Mel spectrogram +* Predictions for every time step +""" + +plt.figure(figsize=(10, 6)) + +# Plot the waveform. +plt.subplot(3, 1, 1) +plt.plot(audio_wav) +plt.xlim([0, len(audio_wav)]) + +# Plot the log-mel spectrogram (returned by the model). +plt.subplot(3, 1, 2) +plt.imshow( + mel_spectrogram.numpy().T, aspect="auto", interpolation="nearest", origin="lower" +) + +# Plot and label the model output scores for the top-scoring classes. +mean_predictions = np.mean(predictions, axis=0) + +top_class_indices = np.argsort(mean_predictions)[::-1] +plt.subplot(3, 1, 3) +plt.imshow( + predictions[:, top_class_indices].T, + aspect="auto", + interpolation="nearest", + cmap="gray_r", +) + +# patch_padding = (PATCH_WINDOW_SECONDS / 2) / PATCH_HOP_SECONDS +# values from the model documentation +patch_padding = (0.025 / 2) / 0.01 +plt.xlim([-patch_padding - 0.5, predictions.shape[0] + patch_padding - 0.5]) +# Label the top_N classes. +yticks = range(0, len(class_names), 1) +plt.yticks(yticks, [class_names[top_class_indices[x]] for x in yticks]) +_ = plt.ylim(-0.5 + np.array([len(class_names), 0])) diff --git a/knowledge_base/audio/vocal_track_separation.py b/knowledge_base/audio/vocal_track_separation.py new file mode 100644 index 0000000000000000000000000000000000000000..25574e0ab843f29024cea95439799b0764ee32e0 --- /dev/null +++ b/knowledge_base/audio/vocal_track_separation.py @@ -0,0 +1,673 @@ +""" +Title: Vocal Track Separation with Encoder-Decoder Architecture +Author: [Joaquin Jimenez](https://github.com/johacks/) +Date created: 2024/12/10 +Last modified: 2024/12/10 +Description: Train a model to separate vocal tracks from music mixtures. +Accelerator: GPU +""" + +""" +## Introduction + +In this tutorial, we build a vocal track separation model using an encoder-decoder +architecture in Keras 3. + +We train the model on the [MUSDB18 dataset](https://doi.org/10.5281/zenodo.1117372), +which provides music mixtures and isolated tracks for drums, bass, other, and vocals. + +Key concepts covered: + +- Audio data preprocessing using the Short-Time Fourier Transform (STFT). +- Audio data augmentation techniques. +- Implementing custom encoders and decoders specialized for audio data. +- Defining appropriate loss functions and metrics for audio source separation tasks. + +The model architecture is derived from the TFC_TDF_Net model described in: + +W. Choi, M. Kim, J. Chung, D. Lee, and S. Jung, โ€œInvestigating U-Nets with various +intermediate blocks for spectrogram-based singing voice separation,โ€ in the 21st +International Society for Music Information Retrieval Conference, 2020. 
+ +For reference code, see: +[GitHub: ws-choi/ISMIR2020_U_Nets_SVS](https://github.com/ws-choi/ISMIR2020_U_Nets_SVS). + +The data processing and model training routines are partly derived from: +[ZFTurbo/Music-Source-Separation-Training](https://github.com/ZFTurbo/Music-Source-Separation-Training/tree/main). +""" + +""" +## Setup + +Import and install all the required dependencies. +""" + +"""shell +pip install -qq audiomentations soundfile ffmpeg-binaries +pip install -qq "keras==3.7.0" +sudo -n apt-get install -y graphviz >/dev/null 2>&1 # Required for plotting the model +""" + +import glob +import os + +os.environ["KERAS_BACKEND"] = "jax" # or "tensorflow" or "torch" + +import random +import subprocess +import tempfile +import typing +from os import path + +import audiomentations as aug +import ffmpeg +import keras +import numpy as np +import soundfile as sf +from IPython import display +from keras import callbacks, layers, ops, saving +from matplotlib import pyplot as plt + +""" +## Configuration + +The following constants define configuration parameters for audio processing +and model training, including dataset paths, audio chunk sizes, Short-Time Fourier +Transform (STFT) parameters, and training hyperparameters. +""" + +# MUSDB18 dataset configuration +MUSDB_STREAMS = {"mixture": 0, "drums": 1, "bass": 2, "other": 3, "vocals": 4} +TARGET_INSTRUMENTS = {track: MUSDB_STREAMS[track] for track in ("vocals",)} +N_INSTRUMENTS = len(TARGET_INSTRUMENTS) +SOURCE_INSTRUMENTS = tuple(k for k in MUSDB_STREAMS if k != "mixture") + +# Audio preprocessing parameters for Short-Time Fourier Transform (STFT) +N_SUBBANDS = 4 # Number of subbands into which frequencies are split +CHUNK_SIZE = 65024 # Number of amplitude samples per audio chunk (~4 seconds) +STFT_N_FFT = 2048 # FFT points used in STFT +STFT_HOP_LENGTH = 512 # Hop length for STFT + +# Training hyperparameters +N_CHANNELS = 64 # Base channel count for the model +BATCH_SIZE = 3 +ACCUMULATION_STEPS = 2 +EFFECTIVE_BATCH_SIZE = BATCH_SIZE * (ACCUMULATION_STEPS or 1) + +# Paths +TMP_DIR = path.expanduser("~/.keras/tmp") +DATASET_DIR = path.expanduser("~/.keras/datasets") +MODEL_PATH = path.join(TMP_DIR, f"model_{keras.backend.backend()}.keras") +CSV_LOG_PATH = path.join(TMP_DIR, f"training_{keras.backend.backend()}.csv") +os.makedirs(DATASET_DIR, exist_ok=True) +os.makedirs(TMP_DIR, exist_ok=True) + +# Set random seed for reproducibility +keras.utils.set_random_seed(21) + +""" +## MUSDB18 Dataset + +The MUSDB18 dataset is a standard benchmark for music source separation, containing +150 full-length music tracks along with isolated drums, bass, other, and vocals. +The dataset is stored in .mp4 format, and each .mp4 file includes multiple audio +streams (mixture and individual tracks). + +### Download and Conversion + +The following utility function downloads MUSDB18 and converts its .mp4 files to +.wav files for each instrument track, resampled to 16 kHz. +""" + + +def download_musdb18(out_dir=None): + """Download and extract the MUSDB18 dataset, then convert .mp4 files to .wav files. + + MUSDB18 reference: + Rafii, Z., Liutkus, A., Stรถter, F.-R., Mimilakis, S. I., & Bittner, R. (2017). + MUSDB18 - a corpus for music separation (1.0.0) [Data set]. Zenodo. 
+ """ + ffmpeg.init() + from ffmpeg import FFMPEG_PATH + + # Create output directories + os.makedirs((base := out_dir or tempfile.mkdtemp()), exist_ok=True) + if path.exists((out_dir := path.join(base, "musdb18_wav"))): + print("MUSDB18 dataset already downloaded") + return out_dir + + # Download and extract the dataset + download_dir = keras.utils.get_file( + fname="musdb18", + origin="https://zenodo.org/records/1117372/files/musdb18.zip", + extract=True, + ) + + # ffmpeg command template: input, stream index, output + ffmpeg_args = str(FFMPEG_PATH) + " -v error -i {} -map 0:{} -vn -ar 16000 {}" + + # Convert each mp4 file to multiple .wav files for each track + for split in ("train", "test"): + songs = os.listdir(path.join(download_dir, split)) + for i, song in enumerate(songs): + if i % 10 == 0: + print(f"{split.capitalize()}: {i}/{len(songs)} songs processed") + + mp4_path_orig = path.join(download_dir, split, song) + mp4_path = path.join(tempfile.mkdtemp(), split, song.replace(" ", "_")) + os.makedirs(path.dirname(mp4_path), exist_ok=True) + os.rename(mp4_path_orig, mp4_path) + + wav_dir = path.join(out_dir, split, path.basename(mp4_path).split(".")[0]) + os.makedirs(wav_dir, exist_ok=True) + + for track in SOURCE_INSTRUMENTS: + out_path = path.join(wav_dir, f"{track}.wav") + stream_index = MUSDB_STREAMS[track] + args = ffmpeg_args.format(mp4_path, stream_index, out_path).split() + assert subprocess.run(args).returncode == 0, "ffmpeg conversion failed" + return out_dir + + +# Download and prepare the MUSDB18 dataset +songs = download_musdb18(out_dir=DATASET_DIR) + +""" +### Custom Dataset + +We define a custom dataset class to generate random audio chunks and their corresponding +labels. The dataset does the following: + +1. Selects a random chunk from a random song and instrument. +2. Applies optional data augmentations. +3. Combines isolated tracks to form new synthetic mixtures. +4. Prepares features (mixtures) and labels (vocals) for training. + +This approach allows creating an effectively infinite variety of training examples +through randomization and augmentation. 
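+
+Before defining the class, it is worth noting the shapes we expect: with the constants
+above, each batch holds `BATCH_SIZE` mixed waveforms of `CHUNK_SIZE` samples and the
+matching isolated vocal tracks. A small illustrative check (to be run once the datasets
+are created further below):
+
+```python
+batch_x, batch_y = train_ds[0]  # the index is ignored; every batch is random
+print(batch_x.shape)  # (BATCH_SIZE, CHUNK_SIZE)                 -> (3, 65024)
+print(batch_y.shape)  # (BATCH_SIZE, N_INSTRUMENTS, CHUNK_SIZE)  -> (3, 1, 65024)
+```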
+""" + + +class Dataset(keras.utils.PyDataset): + def __init__( + self, + songs, + batch_size=BATCH_SIZE, + chunk_size=CHUNK_SIZE, + batches_per_epoch=1000 * ACCUMULATION_STEPS, + augmentation=True, + **kwargs, + ): + super().__init__(**kwargs) + self.augmentation = augmentation + self.vocals_augmentations = [ + aug.PitchShift(min_semitones=-5, max_semitones=5, p=0.1), + aug.SevenBandParametricEQ(-9, 9, p=0.25), + aug.TanhDistortion(0.1, 0.7, p=0.1), + ] + self.other_augmentations = [ + aug.PitchShift(p=0.1), + aug.AddGaussianNoise(p=0.1), + ] + self.songs = songs + self.sizes = {song: self.get_track_set_size(song) for song in self.songs} + self.batch_size = batch_size + self.chunk_size = chunk_size + self.batches_per_epoch = batches_per_epoch + + def get_track_set_size(self, song: str): + """Return the smallest track length in the given song directory.""" + sizes = [len(sf.read(p)[0]) for p in glob.glob(path.join(song, "*.wav"))] + if max(sizes) != min(sizes): + print(f"Warning: {song} has different track lengths") + return min(sizes) + + def random_chunk_of_instrument_type(self, instrument: str): + """Extract a random chunk for the specified instrument from a random song.""" + song, size = random.choice(list(self.sizes.items())) + track = path.join(song, f"{instrument}.wav") + + if self.chunk_size <= size: + start = np.random.randint(size - self.chunk_size + 1) + audio = sf.read(track, self.chunk_size, start, dtype="float32")[0] + audio_mono = np.mean(audio, axis=1) + else: + # If the track is shorter than chunk_size, pad the signal + audio_mono = np.mean(sf.read(track, dtype="float32")[0], axis=1) + audio_mono = np.pad(audio_mono, ((0, self.chunk_size - size),)) + + # If the chunk is almost silent, retry + if np.mean(np.abs(audio_mono)) < 0.01: + return self.random_chunk_of_instrument_type(instrument) + + return self.data_augmentation(audio_mono, instrument) + + def data_augmentation(self, audio: np.ndarray, instrument: str): + """Apply data augmentation to the audio chunk, if enabled.""" + + def coin_flip(x, probability: float, fn: typing.Callable): + return fn(x) if random.uniform(0, 1) < probability else x + + if self.augmentation: + augmentations = ( + self.vocals_augmentations + if instrument == "vocals" + else self.other_augmentations + ) + # Loudness augmentation + audio *= np.random.uniform(0.5, 1.5, (len(audio),)).astype("float32") + # Random reverse + audio = coin_flip(audio, 0.1, lambda x: np.flip(x)) + # Random polarity inversion + audio = coin_flip(audio, 0.5, lambda x: -x) + # Apply selected augmentations + for aug_ in augmentations: + aug_.randomize_parameters(audio, sample_rate=16000) + audio = aug_(audio, sample_rate=16000) + return audio + + def random_mix_of_tracks(self) -> dict: + """Create a random mix of instruments by summing their individual chunks.""" + tracks = {} + for instrument in SOURCE_INSTRUMENTS: + # Start with a single random chunk + mixup = [self.random_chunk_of_instrument_type(instrument)] + + # Randomly add more chunks of the same instrument (mixup augmentation) + if self.augmentation: + for p in (0.2, 0.02): + if random.uniform(0, 1) < p: + mixup.append(self.random_chunk_of_instrument_type(instrument)) + + tracks[instrument] = np.mean(mixup, axis=0, dtype="float32") + return tracks + + def __len__(self): + return self.batches_per_epoch + + def __getitem__(self, idx): + # Generate a batch of random mixtures + batch = [self.random_mix_of_tracks() for _ in range(self.batch_size)] + + # Features: sum of all tracks + batch_x = ops.sum( + 
np.array([list(track_set.values()) for track_set in batch]), axis=1 + ) + + # Labels: isolated target instruments (e.g., vocals) + batch_y = np.array( + [[track_set[t] for t in TARGET_INSTRUMENTS] for track_set in batch] + ) + + return batch_x, ops.convert_to_tensor(batch_y) + + +# Create train and validation datasets +train_ds = Dataset(glob.glob(path.join(songs, "train", "*"))) +val_ds = Dataset( + glob.glob(path.join(songs, "test", "*")), + batches_per_epoch=int(0.1 * train_ds.batches_per_epoch), + augmentation=False, +) + +""" +### Visualize a Sample + +Let's visualize a random mixed audio chunk and its corresponding isolated vocals. +This helps to understand the nature of the preprocessed input data. +""" + + +def visualize_audio_np(audio: np.ndarray, rate=16000, name="mixup"): + """Plot and display an audio waveform and also produce an Audio widget.""" + plt.figure(figsize=(10, 6)) + plt.plot(audio) + plt.title(f"Waveform: {name}") + plt.xlim(0, len(audio)) + plt.ylabel("Amplitude") + plt.show() + # plt.savefig(f"tmp/{name}.png") + + # Normalize and display audio + audio_norm = (audio - np.min(audio)) / (np.max(audio) - np.min(audio) + 1e-8) + audio_norm = (audio_norm * 2 - 1) * 0.6 + display.display(display.Audio(audio_norm, rate=rate)) + # sf.write(f"tmp/{name}.wav", audio_norm, rate) + + +sample_batch_x, sample_batch_y = val_ds[None] # Random batch +visualize_audio_np(ops.convert_to_numpy(sample_batch_x[0])) +visualize_audio_np(ops.convert_to_numpy(sample_batch_y[0, 0]), name="vocals") + +""" +## Model + +### Preprocessing + +The model operates on STFT representations rather than raw audio. We define a +preprocessing model to compute STFT and a corresponding inverse transform (iSTFT). +""" + + +def stft(inputs, fft_size=STFT_N_FFT, sequence_stride=STFT_HOP_LENGTH): + """Compute the STFT for the input audio and return the real and imaginary parts.""" + real_x, imag_x = ops.stft(inputs, fft_size, sequence_stride, fft_size) + real_x, imag_x = ops.expand_dims(real_x, -1), ops.expand_dims(imag_x, -1) + x = ops.concatenate((real_x, imag_x), axis=-1) + + # Drop last freq sample for convenience + return ops.split(x, [x.shape[2] - 1], axis=2)[0] + + +def inverse_stft(inputs, fft_size=STFT_N_FFT, sequence_stride=STFT_HOP_LENGTH): + """Compute the inverse STFT for the given STFT input.""" + x = inputs + + # Pad back dropped freq sample if using torch backend + if keras.backend.backend() == "torch": + x = ops.pad(x, ((0, 0), (0, 0), (0, 1), (0, 0))) + + real_x, imag_x = ops.split(x, 2, axis=-1) + real_x = ops.squeeze(real_x, axis=-1) + imag_x = ops.squeeze(imag_x, axis=-1) + + return ops.istft((real_x, imag_x), fft_size, sequence_stride, fft_size) + + +""" +### Model Architecture + +The model uses a custom encoder-decoder architecture with Time-Frequency Convolution +(TFC) and Time-Distributed Fully Connected (TDF) blocks. They are grouped into a +`TimeFrequencyTransformBlock`, i.e. "TFC_TDF" in the original paper by Choi et al. + +We then define an encoder-decoder network with multiple scales. Each encoder scale +applies TFC_TDF blocks followed by downsampling, while decoder scales apply TFC_TDF +blocks over the concatenation of upsampled features and associated encoder outputs. +""" + + +@saving.register_keras_serializable() +class TimeDistributedDenseBlock(layers.Layer): + """Time-Distributed Fully Connected layer block. + + Applies frequency-wise dense transformations across time frames with instance + normalization and GELU activation. 
+ """ + + def __init__(self, bottleneck_factor, fft_dim, **kwargs): + super().__init__(**kwargs) + self.fft_dim = fft_dim + self.hidden_dim = fft_dim // bottleneck_factor + + def build(self, *_): + self.group_norm_1 = layers.GroupNormalization(groups=-1) + self.group_norm_2 = layers.GroupNormalization(groups=-1) + self.dense_1 = layers.Dense(self.hidden_dim, use_bias=False) + self.dense_2 = layers.Dense(self.fft_dim, use_bias=False) + + def call(self, x): + # Apply normalization and dense layers frequency-wise + x = ops.gelu(self.group_norm_1(x)) + x = ops.swapaxes(x, -1, -2) + x = self.dense_1(x) + + x = ops.gelu(self.group_norm_2(ops.swapaxes(x, -1, -2))) + x = ops.swapaxes(x, -1, -2) + x = self.dense_2(x) + return ops.swapaxes(x, -1, -2) + + +@saving.register_keras_serializable() +class TimeFrequencyConvolution(layers.Layer): + """Time-Frequency Convolutional layer. + + Applies a 2D convolution over time-frequency representations and applies instance + normalization and GELU activation. + """ + + def __init__(self, channels, **kwargs): + super().__init__(**kwargs) + self.channels = channels + + def build(self, *_): + self.group_norm = layers.GroupNormalization(groups=-1) + self.conv = layers.Conv2D(self.channels, 3, padding="same", use_bias=False) + + def call(self, x): + return self.conv(ops.gelu(self.group_norm(x))) + + +@saving.register_keras_serializable() +class TimeFrequencyTransformBlock(layers.Layer): + """Implements TFC_TDF block for encoder-decoder architecture. + + Repeatedly apply Time-Frequency Convolution and Time-Distributed Dense blocks as + many times as specified by the `length` parameter. + """ + + def __init__( + self, channels, length, fft_dim, bottleneck_factor, in_channels=None, **kwargs + ): + super().__init__(**kwargs) + self.channels = channels + self.length = length + self.fft_dim = fft_dim + self.bottleneck_factor = bottleneck_factor + self.in_channels = in_channels or channels + + def build(self, *_): + self.blocks = [] + # Add blocks in a flat list to avoid nested structures + for i in range(self.length): + in_channels = self.channels if i > 0 else self.in_channels + self.blocks.append(TimeFrequencyConvolution(in_channels)) + self.blocks.append( + TimeDistributedDenseBlock(self.bottleneck_factor, self.fft_dim) + ) + self.blocks.append(TimeFrequencyConvolution(self.channels)) + # Residual connection + self.blocks.append(layers.Conv2D(self.channels, 1, 1, use_bias=False)) + + def call(self, inputs): + x = inputs + # Each block consists of 4 layers: + # 1. Time-Frequency Convolution + # 2. Time-Distributed Dense + # 3. Time-Frequency Convolution + # 4. 
Residual connection + for i in range(0, len(self.blocks), 4): + tfc_1 = self.blocks[i](x) + tdf = self.blocks[i + 1](x) + tfc_2 = self.blocks[i + 2](tfc_1 + tdf) + x = tfc_2 + self.blocks[i + 3](x) # Residual connection + return x + + +@saving.register_keras_serializable() +class Downscale(layers.Layer): + """Downscale time-frequency dimensions using a convolution.""" + + conv_cls = layers.Conv2D + + def __init__(self, channels, scale, **kwargs): + super().__init__(**kwargs) + self.channels = channels + self.scale = scale + + def build(self, *_): + self.conv = self.conv_cls(self.channels, self.scale, self.scale, use_bias=False) + self.norm = layers.GroupNormalization(groups=-1) + + def call(self, inputs): + return self.norm(ops.gelu(self.conv(inputs))) + + +@saving.register_keras_serializable() +class Upscale(Downscale): + """Upscale time-frequency dimensions using a transposed convolution.""" + + conv_cls = layers.Conv2DTranspose + + +def build_model( + inputs, + n_instruments=N_INSTRUMENTS, + n_subbands=N_SUBBANDS, + channels=N_CHANNELS, + fft_dim=(STFT_N_FFT // 2) // N_SUBBANDS, + n_scales=4, + scale=(2, 2), + block_size=2, + growth=128, + bottleneck_factor=2, + **kwargs, +): + """Build the TFC_TDF encoder-decoder model for source separation.""" + # Compute STFT + x = stft(inputs) + + # Split mixture into subbands as separate channels + mix = ops.reshape(x, (-1, x.shape[1], x.shape[2] // n_subbands, 2 * n_subbands)) + first_conv_out = layers.Conv2D(channels, 1, 1, use_bias=False)(mix) + x = first_conv_out + + # Encoder path + encoder_outs = [] + for _ in range(n_scales): + x = TimeFrequencyTransformBlock( + channels, block_size, fft_dim, bottleneck_factor + )(x) + encoder_outs.append(x) + fft_dim, channels = fft_dim // scale[0], channels + growth + x = Downscale(channels, scale)(x) + + # Bottleneck + x = TimeFrequencyTransformBlock(channels, block_size, fft_dim, bottleneck_factor)(x) + + # Decoder path + for _ in range(n_scales): + fft_dim, channels = fft_dim * scale[0], channels - growth + x = ops.concatenate([Upscale(channels, scale)(x), encoder_outs.pop()], axis=-1) + x = TimeFrequencyTransformBlock( + channels, block_size, fft_dim, bottleneck_factor, in_channels=x.shape[-1] + )(x) + + # Residual connection and final convolutions + x = ops.concatenate([mix, x * first_conv_out], axis=-1) + x = layers.Conv2D(channels, 1, 1, use_bias=False, activation="gelu")(x) + x = layers.Conv2D(n_instruments * n_subbands * 2, 1, 1, use_bias=False)(x) + + # Reshape back to instrument-wise STFT + x = ops.reshape(x, (-1, x.shape[1], x.shape[2] * n_subbands, n_instruments, 2)) + x = ops.transpose(x, (0, 3, 1, 2, 4)) + x = ops.reshape(x, (-1, n_instruments, x.shape[2], x.shape[3] * 2)) + + return keras.Model(inputs=inputs, outputs=x, **kwargs) + + +""" +## Loss and Metrics + +We define: + +- `spectral_loss`: Mean absolute error in STFT domain. +- `sdr`: Signal-to-Distortion Ratio, a common source separation metric. 
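+
+Concretely, for a target waveform `y` and an estimate `y_hat`, the SDR in decibels is
+`10 * log10(sum(y**2) / sum((y - y_hat)**2))`, so higher is better. A minimal NumPy
+sketch of the same quantity, independent of the Keras implementation below:
+
+```python
+import numpy as np
+
+def sdr_np(y_true, y_pred, eps=1e-8):
+    num = np.sum(np.square(y_true), axis=-1) + eps
+    den = np.sum(np.square(y_true - y_pred), axis=-1) + eps
+    return 10.0 * np.log10(num / den)
+
+y = np.random.randn(1, 16000).astype("float32")
+print(sdr_np(y, y))                 # near-perfect estimate -> very large SDR
+print(sdr_np(y, np.zeros_like(y)))  # all-zero estimate -> ~0 dB (error energy == target energy)
+```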
+""" + + +def prediction_to_wave(x, n_instruments=N_INSTRUMENTS): + """Convert STFT predictions back to waveform.""" + x = ops.reshape(x, (-1, x.shape[2], x.shape[3] // 2, 2)) + x = inverse_stft(x) + return ops.reshape(x, (-1, n_instruments, x.shape[1])) + + +def target_to_stft(y): + """Convert target waveforms to their STFT representations.""" + y = ops.reshape(y, (-1, CHUNK_SIZE)) + y_real, y_imag = ops.stft(y, STFT_N_FFT, STFT_HOP_LENGTH, STFT_N_FFT) + y_real, y_imag = y_real[..., :-1], y_imag[..., :-1] + y = ops.stack([y_real, y_imag], axis=-1) + return ops.reshape(y, (-1, N_INSTRUMENTS, y.shape[1], y.shape[2] * 2)) + + +@saving.register_keras_serializable() +def sdr(y_true, y_pred): + """Signal-to-Distortion Ratio metric.""" + y_pred = prediction_to_wave(y_pred) + # Add epsilon for numerical stability + num = ops.sum(ops.square(y_true), axis=-1) + 1e-8 + den = ops.sum(ops.square(y_true - y_pred), axis=-1) + 1e-8 + return 10 * ops.log10(num / den) + + +@saving.register_keras_serializable() +def spectral_loss(y_true, y_pred): + """Mean absolute error in the STFT domain.""" + y_true = target_to_stft(y_true) + return ops.mean(ops.absolute(y_true - y_pred)) + + +""" +## Training + +### Visualize Model Architecture +""" + +# Load or create the model +if path.exists(MODEL_PATH): + model = saving.load_model(MODEL_PATH) +else: + model = build_model(keras.Input(sample_batch_x.shape[1:]), name="tfc_tdf_net") + +# Display the model architecture +model.summary() +img = keras.utils.plot_model(model, path.join(TMP_DIR, "model.png"), show_shapes=True) +display.display(img) + +""" +### Compile and Train the Model +""" + +# Compile the model +optimizer = keras.optimizers.Adam(5e-05, gradient_accumulation_steps=ACCUMULATION_STEPS) +model.compile(optimizer=optimizer, loss=spectral_loss, metrics=[sdr]) + +# Define callbacks +cbs = [ + callbacks.ModelCheckpoint(MODEL_PATH, "val_sdr", save_best_only=True, mode="max"), + callbacks.ReduceLROnPlateau(factor=0.95, patience=2), + callbacks.CSVLogger(CSV_LOG_PATH), +] + +if not path.exists(MODEL_PATH): + model.fit(train_ds, validation_data=val_ds, epochs=10, callbacks=cbs, shuffle=False) +else: + # Demonstration of a single epoch of training when model already exists + model.fit(train_ds, validation_data=val_ds, epochs=1, shuffle=False, verbose=2) + +""" +## Evaluation + +Evaluate the model on the validation dataset and visualize predicted vocals. +""" + +model.evaluate(val_ds, verbose=2) +y_pred = model.predict(sample_batch_x, verbose=2) +y_pred = prediction_to_wave(y_pred) +visualize_audio_np(ops.convert_to_numpy(y_pred[0, 0]), name="vocals_pred") + +""" +## Conclusion + +We built and trained a vocal track separation model using an encoder-decoder +architecture with custom blocks applied to the MUSDB18 dataset. We demonstrated +STFT-based preprocessing, data augmentation, and a source separation metric (SDR). + +**Next steps:** + +- Train for more epochs and refine hyperparameters. +- Separate multiple instruments simultaneously. +- Enhance the model to handle instruments not present in the mixture. 
+""" diff --git a/knowledge_base/generative/adain.py b/knowledge_base/generative/adain.py new file mode 100644 index 0000000000000000000000000000000000000000..2a0ea5539afcff37f5eae38b5974a7a56515ecf3 --- /dev/null +++ b/knowledge_base/generative/adain.py @@ -0,0 +1,677 @@ +""" +Title: Neural Style Transfer with AdaIN +Author: [Aritra Roy Gosthipaty](https://twitter.com/arig23498), [Ritwik Raha](https://twitter.com/ritwik_raha) +Date created: 2021/11/08 +Last modified: 2021/11/08 +Description: Neural Style Transfer with Adaptive Instance Normalization. +Accelerator: GPU +""" + +""" +# Introduction + +[Neural Style Transfer](https://www.tensorflow.org/tutorials/generative/style_transfer) +is the process of transferring the style of one image onto the content +of another. This was first introduced in the seminal paper +["A Neural Algorithm of Artistic Style"](https://arxiv.org/abs/1508.06576) +by Gatys et al. A major limitation of the technique proposed in this +work is in its runtime, as the algorithm uses a slow iterative +optimization process. + +Follow-up papers that introduced +[Batch Normalization](https://arxiv.org/abs/1502.03167), +[Instance Normalization](https://arxiv.org/abs/1701.02096) and +[Conditional Instance Normalization](https://arxiv.org/abs/1610.07629) +allowed Style Transfer to be performed in new ways, no longer +requiring a slow iterative process. + +Following these papers, the authors Xun Huang and Serge +Belongie propose +[Adaptive Instance Normalization](https://arxiv.org/abs/1703.06868) (AdaIN), +which allows arbitrary style transfer in real time. + +In this example we implement Adaptive Instance Normalization +for Neural Style Transfer. We show in the below figure the output +of our AdaIN model trained for +only **30 epochs**. + +![Style transfer sample gallery](https://i.imgur.com/zDjDuea.png) + +You can also try out the model with your own images with this +[Hugging Face demo](https://huggingface.co/spaces/ariG23498/nst). +""" + +""" +# Setup + +We begin with importing the necessary packages. We also set the +seed for reproducibility. The global variables are hyperparameters +which we can change as we like. +""" + +import os +import numpy as np +import tensorflow as tf +from tensorflow import keras +import matplotlib.pyplot as plt +import tensorflow_datasets as tfds +from tensorflow.keras import layers + +# Defining the global variables. +IMAGE_SIZE = (224, 224) +BATCH_SIZE = 64 +# Training for single epoch for time constraint. +# Please use atleast 30 epochs to see good results. +EPOCHS = 1 +AUTOTUNE = tf.data.AUTOTUNE + +""" +## Style transfer sample gallery + +For Neural Style Transfer we need style images and content images. In +this example we will use the +[Best Artworks of All Time](https://www.kaggle.com/ikarus777/best-artworks-of-all-time) +as our style dataset and +[Pascal VOC](https://www.tensorflow.org/datasets/catalog/voc) +as our content dataset. + +This is a deviation from the original paper implementation by the +authors, where they use +[WIKI-Art](https://paperswithcode.com/dataset/wikiart) as style and +[MSCOCO](https://cocodataset.org/#home) as content datasets +respectively. We do this to create a minimal yet reproducible example. 
+ +## Downloading the dataset from Kaggle + +The [Best Artworks of All Time](https://www.kaggle.com/ikarus777/best-artworks-of-all-time) +dataset is hosted on Kaggle and one can easily download it in Colab by +following these steps: + +- Follow the instructions [here](https://github.com/Kaggle/kaggle-api) +in order to obtain your Kaggle API keys in case you don't have them. +- Use the following command to upload the Kaggle API keys. + +```python +from google.colab import files +files.upload() +``` + +- Use the following commands to move the API keys to the proper +directory and download the dataset. + +```shell +$ mkdir ~/.kaggle +$ cp kaggle.json ~/.kaggle/ +$ chmod 600 ~/.kaggle/kaggle.json +$ kaggle datasets download ikarus777/best-artworks-of-all-time +$ unzip -qq best-artworks-of-all-time.zip +$ rm -rf images +$ mv resized artwork +$ rm best-artworks-of-all-time.zip artists.csv +``` +""" + +""" +## `tf.data` pipeline + +In this section, we will build the `tf.data` pipeline for the project. +For the style dataset, we decode, convert and resize the images from +the folder. For the content images we are already presented with a +`tf.data` dataset as we use the `tfds` module. + +After we have our style and content data pipeline ready, we zip the +two together to obtain the data pipeline that our model will consume. +""" + + +def decode_and_resize(image_path): + """Decodes and resizes an image from the image file path. + + Args: + image_path: The image file path. + + Returns: + A resized image. + """ + image = tf.io.read_file(image_path) + image = tf.image.decode_jpeg(image, channels=3) + image = tf.image.convert_image_dtype(image, dtype="float32") + image = tf.image.resize(image, IMAGE_SIZE) + return image + + +def extract_image_from_voc(element): + """Extracts image from the PascalVOC dataset. + + Args: + element: A dictionary of data. + + Returns: + A resized image. + """ + image = element["image"] + image = tf.image.convert_image_dtype(image, dtype="float32") + image = tf.image.resize(image, IMAGE_SIZE) + return image + + +# Get the image file paths for the style images. +style_images = os.listdir("artwork/resized") +style_images = [os.path.join("artwork/resized", path) for path in style_images] + +# split the style images in train, val and test +total_style_images = len(style_images) +train_style = style_images[: int(0.8 * total_style_images)] +val_style = style_images[int(0.8 * total_style_images) : int(0.9 * total_style_images)] +test_style = style_images[int(0.9 * total_style_images) :] + +# Build the style and content tf.data datasets. +train_style_ds = ( + tf.data.Dataset.from_tensor_slices(train_style) + .map(decode_and_resize, num_parallel_calls=AUTOTUNE) + .repeat() +) +train_content_ds = tfds.load("voc", split="train").map(extract_image_from_voc).repeat() + +val_style_ds = ( + tf.data.Dataset.from_tensor_slices(val_style) + .map(decode_and_resize, num_parallel_calls=AUTOTUNE) + .repeat() +) +val_content_ds = ( + tfds.load("voc", split="validation").map(extract_image_from_voc).repeat() +) + +test_style_ds = ( + tf.data.Dataset.from_tensor_slices(test_style) + .map(decode_and_resize, num_parallel_calls=AUTOTUNE) + .repeat() +) +test_content_ds = ( + tfds.load("voc", split="test") + .map(extract_image_from_voc, num_parallel_calls=AUTOTUNE) + .repeat() +) + +# Zipping the style and content datasets. 
+train_ds = ( + tf.data.Dataset.zip((train_style_ds, train_content_ds)) + .shuffle(BATCH_SIZE * 2) + .batch(BATCH_SIZE) + .prefetch(AUTOTUNE) +) + +val_ds = ( + tf.data.Dataset.zip((val_style_ds, val_content_ds)) + .shuffle(BATCH_SIZE * 2) + .batch(BATCH_SIZE) + .prefetch(AUTOTUNE) +) + +test_ds = ( + tf.data.Dataset.zip((test_style_ds, test_content_ds)) + .shuffle(BATCH_SIZE * 2) + .batch(BATCH_SIZE) + .prefetch(AUTOTUNE) +) + +""" +## Visualizing the data + +It is always better to visualize the data before training. To ensure +the correctness of our preprocessing pipeline, we visualize 10 samples +from our dataset. +""" + +style, content = next(iter(train_ds)) +fig, axes = plt.subplots(nrows=10, ncols=2, figsize=(5, 30)) +[ax.axis("off") for ax in np.ravel(axes)] + +for axis, style_image, content_image in zip(axes, style[0:10], content[0:10]): + (ax_style, ax_content) = axis + ax_style.imshow(style_image) + ax_style.set_title("Style Image") + + ax_content.imshow(content_image) + ax_content.set_title("Content Image") + +""" +## Architecture + +The style transfer network takes a content image and a style image as +inputs and outputs the style transferred image. The authors of AdaIN +propose a simple encoder-decoder structure for achieving this. + +![AdaIN architecture](https://i.imgur.com/JbIfoyE.png) + +The content image (`C`) and the style image (`S`) are both fed to the +encoder networks. The output from these encoder networks (feature maps) +are then fed to the AdaIN layer. The AdaIN layer computes a combined +feature map. This feature map is then fed into a randomly initialized +decoder network that serves as the generator for the neural style +transferred image. + +![AdaIn equation](https://i.imgur.com/hqhcBQS.png) + +The style feature map (`fs`) and the content feature map (`fc`) are +fed to the AdaIN layer. This layer produced the combined feature map +`t`. The function `g` represents the decoder (generator) network. +""" + +""" +### Encoder + +The encoder is a part of the pretrained (pretrained on +[imagenet](https://www.image-net.org/)) VGG19 model. We slice the +model from the `block4-conv1` layer. The output layer is as suggested +by the authors in their paper. +""" + + +def get_encoder(): + vgg19 = keras.applications.VGG19( + include_top=False, + weights="imagenet", + input_shape=(*IMAGE_SIZE, 3), + ) + vgg19.trainable = False + mini_vgg19 = keras.Model(vgg19.input, vgg19.get_layer("block4_conv1").output) + + inputs = layers.Input([*IMAGE_SIZE, 3]) + mini_vgg19_out = mini_vgg19(inputs) + return keras.Model(inputs, mini_vgg19_out, name="mini_vgg19") + + +""" +### Adaptive Instance Normalization + +The AdaIN layer takes in the features +of the content and style image. The layer can be defined via the +following equation: + +![AdaIn formula](https://i.imgur.com/tWq3VKP.png) + +where `sigma` is the standard deviation and `mu` is the mean for the +concerned variable. In the above equation the mean and variance of the +content feature map `fc` is aligned with the mean and variance of the +style feature maps `fs`. + +It is important to note that the AdaIN layer proposed by the authors +uses no other parameters apart from mean and variance. The layer also +does not have any trainable parameters. This is why we use a +*Python function* instead of using a *Keras layer*. The function takes +style and content feature maps, computes the mean and standard deviation +of the images and returns the adaptive instance normalized feature map. 
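+
+A quick way to convince yourself of what the layer does: after applying AdaIN, the
+spatial mean and standard deviation of the output should match those of the style
+features. A small sanity check on random tensors, using the `get_mean_std` and
+`ada_in` functions defined just below (illustrative only):
+
+```python
+dummy_content = tf.random.normal((1, 28, 28, 512))
+dummy_style = tf.random.normal((1, 28, 28, 512)) * 3.0 + 2.0
+
+adain_out = ada_in(style=dummy_style, content=dummy_content)
+out_mean, out_std = get_mean_std(adain_out)
+style_mean, style_std = get_mean_std(dummy_style)
+
+# Both differences should be close to zero.
+print(tf.reduce_max(tf.abs(out_mean - style_mean)).numpy())
+print(tf.reduce_max(tf.abs(out_std - style_std)).numpy())
+```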
+""" + + +def get_mean_std(x, epsilon=1e-5): + axes = [1, 2] + + # Compute the mean and standard deviation of a tensor. + mean, variance = tf.nn.moments(x, axes=axes, keepdims=True) + standard_deviation = tf.sqrt(variance + epsilon) + return mean, standard_deviation + + +def ada_in(style, content): + """Computes the AdaIn feature map. + + Args: + style: The style feature map. + content: The content feature map. + + Returns: + The AdaIN feature map. + """ + content_mean, content_std = get_mean_std(content) + style_mean, style_std = get_mean_std(style) + t = style_std * (content - content_mean) / content_std + style_mean + return t + + +""" +### Decoder + +The authors specify that the decoder network must mirror the encoder +network. We have symmetrically inverted the encoder to build our +decoder. We have used `UpSampling2D` layers to increase the spatial +resolution of the feature maps. + +Note that the authors warn against using any normalization layer +in the decoder network, and do indeed go on to show that including +batch normalization or instance normalization hurts the performance +of the overall network. + +This is the only portion of the entire architecture that is trainable. +""" + + +def get_decoder(): + config = {"kernel_size": 3, "strides": 1, "padding": "same", "activation": "relu"} + decoder = keras.Sequential( + [ + layers.InputLayer((None, None, 512)), + layers.Conv2D(filters=512, **config), + layers.UpSampling2D(), + layers.Conv2D(filters=256, **config), + layers.Conv2D(filters=256, **config), + layers.Conv2D(filters=256, **config), + layers.Conv2D(filters=256, **config), + layers.UpSampling2D(), + layers.Conv2D(filters=128, **config), + layers.Conv2D(filters=128, **config), + layers.UpSampling2D(), + layers.Conv2D(filters=64, **config), + layers.Conv2D( + filters=3, + kernel_size=3, + strides=1, + padding="same", + activation="sigmoid", + ), + ] + ) + return decoder + + +""" +### Loss functions + +Here we build the loss functions for the neural style transfer model. +The authors propose to use a pretrained VGG-19 to compute the loss +function of the network. It is important to keep in mind that this +will be used for training only the decoder network. The total +loss (`Lt`) is a weighted combination of content loss (`Lc`) and style +loss (`Ls`). The `lambda` term is used to vary the amount of style +transferred. + +![The total loss](https://i.imgur.com/Q5y1jUM.png) + +### Content Loss + +This is the Euclidean distance between the content image features +and the features of the neural style transferred image. + +![The content loss](https://i.imgur.com/dZ0uD0N.png) + +Here the authors propose to use the output from the AdaIn layer `t` as +the content target rather than using features of the original image as +target. This is done to speed up convergence. + +### Style Loss + +Rather than using the more commonly used +[Gram Matrix](https://mathworld.wolfram.com/GramMatrix.html), +the authors propose to compute the difference between the statistical features +(mean and variance) which makes it conceptually cleaner. This can be +easily visualized via the following equation: + +![The style loss](https://i.imgur.com/Ctclhn3.png) + +where `theta` denotes the layers in VGG-19 used to compute the loss. 
+In this case this corresponds to: + +- `block1_conv1` +- `block1_conv2` +- `block1_conv3` +- `block1_conv4` + +""" + + +def get_loss_net(): + vgg19 = keras.applications.VGG19( + include_top=False, weights="imagenet", input_shape=(*IMAGE_SIZE, 3) + ) + vgg19.trainable = False + layer_names = ["block1_conv1", "block2_conv1", "block3_conv1", "block4_conv1"] + outputs = [vgg19.get_layer(name).output for name in layer_names] + mini_vgg19 = keras.Model(vgg19.input, outputs) + + inputs = layers.Input([*IMAGE_SIZE, 3]) + mini_vgg19_out = mini_vgg19(inputs) + return keras.Model(inputs, mini_vgg19_out, name="loss_net") + + +""" +## Neural Style Transfer + +This is the trainer module. We wrap the encoder and decoder inside +a `tf.keras.Model` subclass. This allows us to customize what happens +in the `model.fit()` loop. +""" + + +class NeuralStyleTransfer(tf.keras.Model): + def __init__(self, encoder, decoder, loss_net, style_weight, **kwargs): + super().__init__(**kwargs) + self.encoder = encoder + self.decoder = decoder + self.loss_net = loss_net + self.style_weight = style_weight + + def compile(self, optimizer, loss_fn): + super().compile() + self.optimizer = optimizer + self.loss_fn = loss_fn + self.style_loss_tracker = keras.metrics.Mean(name="style_loss") + self.content_loss_tracker = keras.metrics.Mean(name="content_loss") + self.total_loss_tracker = keras.metrics.Mean(name="total_loss") + + def train_step(self, inputs): + style, content = inputs + + # Initialize the content and style loss. + loss_content = 0.0 + loss_style = 0.0 + + with tf.GradientTape() as tape: + # Encode the style and content image. + style_encoded = self.encoder(style) + content_encoded = self.encoder(content) + + # Compute the AdaIN target feature maps. + t = ada_in(style=style_encoded, content=content_encoded) + + # Generate the neural style transferred image. + reconstructed_image = self.decoder(t) + + # Compute the losses. + reconstructed_vgg_features = self.loss_net(reconstructed_image) + style_vgg_features = self.loss_net(style) + loss_content = self.loss_fn(t, reconstructed_vgg_features[-1]) + for inp, out in zip(style_vgg_features, reconstructed_vgg_features): + mean_inp, std_inp = get_mean_std(inp) + mean_out, std_out = get_mean_std(out) + loss_style += self.loss_fn(mean_inp, mean_out) + self.loss_fn( + std_inp, std_out + ) + loss_style = self.style_weight * loss_style + total_loss = loss_content + loss_style + + # Compute gradients and optimize the decoder. + trainable_vars = self.decoder.trainable_variables + gradients = tape.gradient(total_loss, trainable_vars) + self.optimizer.apply_gradients(zip(gradients, trainable_vars)) + + # Update the trackers. + self.style_loss_tracker.update_state(loss_style) + self.content_loss_tracker.update_state(loss_content) + self.total_loss_tracker.update_state(total_loss) + return { + "style_loss": self.style_loss_tracker.result(), + "content_loss": self.content_loss_tracker.result(), + "total_loss": self.total_loss_tracker.result(), + } + + def test_step(self, inputs): + style, content = inputs + + # Initialize the content and style loss. + loss_content = 0.0 + loss_style = 0.0 + + # Encode the style and content image. + style_encoded = self.encoder(style) + content_encoded = self.encoder(content) + + # Compute the AdaIN target feature maps. + t = ada_in(style=style_encoded, content=content_encoded) + + # Generate the neural style transferred image. + reconstructed_image = self.decoder(t) + + # Compute the losses. 
+ recons_vgg_features = self.loss_net(reconstructed_image) + style_vgg_features = self.loss_net(style) + loss_content = self.loss_fn(t, recons_vgg_features[-1]) + for inp, out in zip(style_vgg_features, recons_vgg_features): + mean_inp, std_inp = get_mean_std(inp) + mean_out, std_out = get_mean_std(out) + loss_style += self.loss_fn(mean_inp, mean_out) + self.loss_fn( + std_inp, std_out + ) + loss_style = self.style_weight * loss_style + total_loss = loss_content + loss_style + + # Update the trackers. + self.style_loss_tracker.update_state(loss_style) + self.content_loss_tracker.update_state(loss_content) + self.total_loss_tracker.update_state(total_loss) + return { + "style_loss": self.style_loss_tracker.result(), + "content_loss": self.content_loss_tracker.result(), + "total_loss": self.total_loss_tracker.result(), + } + + @property + def metrics(self): + return [ + self.style_loss_tracker, + self.content_loss_tracker, + self.total_loss_tracker, + ] + + +""" +## Train Monitor callback + +This callback is used to visualize the style transfer output of +the model at the end of each epoch. The objective of style transfer cannot be +quantified properly, and is to be subjectively evaluated by an audience. +For this reason, visualization is a key aspect of evaluating the model. +""" + +test_style, test_content = next(iter(test_ds)) + + +class TrainMonitor(tf.keras.callbacks.Callback): + def on_epoch_end(self, epoch, logs=None): + # Encode the style and content image. + test_style_encoded = self.model.encoder(test_style) + test_content_encoded = self.model.encoder(test_content) + + # Compute the AdaIN features. + test_t = ada_in(style=test_style_encoded, content=test_content_encoded) + test_reconstructed_image = self.model.decoder(test_t) + + # Plot the Style, Content and the NST image. + fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(20, 5)) + ax[0].imshow(tf.keras.utils.array_to_img(test_style[0])) + ax[0].set_title(f"Style: {epoch:03d}") + + ax[1].imshow(tf.keras.utils.array_to_img(test_content[0])) + ax[1].set_title(f"Content: {epoch:03d}") + + ax[2].imshow(tf.keras.utils.array_to_img(test_reconstructed_image[0])) + ax[2].set_title(f"NST: {epoch:03d}") + + plt.show() + plt.close() + + +""" +## Train the model + +In this section, we define the optimizer, the loss function, and the +trainer module. We compile the trainer module with the optimizer and +the loss function and then train it. + +*Note*: We train the model for a single epoch for time constraints, +but we will need to train is for atleast 30 epochs to see good results. +""" + +optimizer = keras.optimizers.Adam(learning_rate=1e-5) +loss_fn = keras.losses.MeanSquaredError() + +encoder = get_encoder() +loss_net = get_loss_net() +decoder = get_decoder() + +model = NeuralStyleTransfer( + encoder=encoder, decoder=decoder, loss_net=loss_net, style_weight=4.0 +) + +model.compile(optimizer=optimizer, loss_fn=loss_fn) + +history = model.fit( + train_ds, + epochs=EPOCHS, + steps_per_epoch=50, + validation_data=val_ds, + validation_steps=50, + callbacks=[TrainMonitor()], +) + +""" +## Inference + +After we train the model, we now need to run inference with it. We will +pass arbitrary content and style images from the test dataset and take a look at +the output images. + +*NOTE*: To try out the model on your own images, you can use this +[Hugging Face demo](https://huggingface.co/spaces/ariG23498/nst). 
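+
+If you would rather stylize a specific pair of images from disk than samples from
+`test_ds`, something along these lines should work. The file names are placeholders,
+and we simply reuse `decode_and_resize` from the data pipeline above to preprocess
+both images:
+
+```python
+content_image = decode_and_resize("my_content.jpg")[None, ...]  # add a batch dimension
+style_image = decode_and_resize("my_style.jpg")[None, ...]
+
+t = ada_in(style=model.encoder(style_image), content=model.encoder(content_image))
+stylized = model.decoder(t)
+
+plt.imshow(keras.utils.array_to_img(stylized[0]))
+plt.axis("off")
+plt.show()
+```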
+""" + +for style, content in test_ds.take(1): + style_encoded = model.encoder(style) + content_encoded = model.encoder(content) + t = ada_in(style=style_encoded, content=content_encoded) + reconstructed_image = model.decoder(t) + fig, axes = plt.subplots(nrows=10, ncols=3, figsize=(10, 30)) + [ax.axis("off") for ax in np.ravel(axes)] + + for axis, style_image, content_image, reconstructed_image in zip( + axes, style[0:10], content[0:10], reconstructed_image[0:10] + ): + (ax_style, ax_content, ax_reconstructed) = axis + ax_style.imshow(style_image) + ax_style.set_title("Style Image") + ax_content.imshow(content_image) + ax_content.set_title("Content Image") + ax_reconstructed.imshow(reconstructed_image) + ax_reconstructed.set_title("NST Image") + +""" +## Conclusion + +Adaptive Instance Normalization allows arbitrary style transfer in +real time. It is also important to note that the novel proposition of +the authors is to achieve this only by aligning the statistical +features (mean and standard deviation) of the style and the content +images. + +*Note*: AdaIN also serves as the base for +[Style-GANs](https://arxiv.org/abs/1812.04948). + +## Reference + +- [TF implementation](https://github.com/ftokarev/tf-adain) + +## Acknowledgement + +We thank [Luke Wood](https://lukewood.xyz) for his +detailed review. +""" diff --git a/knowledge_base/generative/conditional_gan.py b/knowledge_base/generative/conditional_gan.py new file mode 100644 index 0000000000000000000000000000000000000000..810e28d0fe04513f9636fb4eec2adc5fc66d6f22 --- /dev/null +++ b/knowledge_base/generative/conditional_gan.py @@ -0,0 +1,336 @@ +""" +Title: Conditional GAN +Author: [Sayak Paul](https://twitter.com/RisingSayak) +Date created: 2021/07/13 +Last modified: 2024/01/02 +Description: Training a GAN conditioned on class labels to generate handwritten digits. +Accelerator: GPU +""" + +""" +Generative Adversarial Networks (GANs) let us generate novel image data, video data, +or audio data from a random input. Typically, the random input is sampled +from a normal distribution, before going through a series of transformations that turn +it into something plausible (image, video, audio, etc.). + +However, a simple [DCGAN](https://arxiv.org/abs/1511.06434) doesn't let us control +the appearance (e.g. class) of the samples we're generating. For instance, +with a GAN that generates MNIST handwritten digits, a simple DCGAN wouldn't let us +choose the class of digits we're generating. +To be able to control what we generate, we need to _condition_ the GAN output +on a semantic input, such as the class of an image. + +In this example, we'll build a **Conditional GAN** that can generate MNIST handwritten +digits conditioned on a given class. Such a model can have various useful applications: + +* let's say you are dealing with an +[imbalanced image dataset](https://developers.google.com/machine-learning/data-prep/construct/sampling-splitting/imbalanced-data), +and you'd like to gather more examples for the skewed class to balance the dataset. +Data collection can be a costly process on its own. You could instead train a Conditional GAN and use +it to generate novel images for the class that needs balancing. +* Since the generator learns to associate the generated samples with the class labels, +its representations can also be used for [other downstream tasks](https://arxiv.org/abs/1809.11096). 
+ +Following are the references used for developing this example: + +* [Conditional Generative Adversarial Nets](https://arxiv.org/abs/1411.1784) +* [Lecture on Conditional Generation from Coursera](https://www.coursera.org/lecture/build-basic-generative-adversarial-networks-gans/conditional-generation-inputs-2OPrG) + +If you need a refresher on GANs, you can refer to the "Generative adversarial networks" +section of +[this resource](https://livebook.manning.com/book/deep-learning-with-python-second-edition/chapter-12/r-3/232). + +This example requires TensorFlow 2.5 or higher, as well as TensorFlow Docs, which can be +installed using the following command: +""" + +"""shell +pip install -q git+https://github.com/tensorflow/docs +""" + +""" +## Imports +""" + +import keras + +from keras import layers +from keras import ops +from tensorflow_docs.vis import embed +import tensorflow as tf +import numpy as np +import imageio + +""" +## Constants and hyperparameters +""" + +batch_size = 64 +num_channels = 1 +num_classes = 10 +image_size = 28 +latent_dim = 128 + +""" +## Loading the MNIST dataset and preprocessing it +""" + +# We'll use all the available examples from both the training and test +# sets. +(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data() +all_digits = np.concatenate([x_train, x_test]) +all_labels = np.concatenate([y_train, y_test]) + +# Scale the pixel values to [0, 1] range, add a channel dimension to +# the images, and one-hot encode the labels. +all_digits = all_digits.astype("float32") / 255.0 +all_digits = np.reshape(all_digits, (-1, 28, 28, 1)) +all_labels = keras.utils.to_categorical(all_labels, 10) + +# Create tf.data.Dataset. +dataset = tf.data.Dataset.from_tensor_slices((all_digits, all_labels)) +dataset = dataset.shuffle(buffer_size=1024).batch(batch_size) + +print(f"Shape of training images: {all_digits.shape}") +print(f"Shape of training labels: {all_labels.shape}") + +""" +## Calculating the number of input channel for the generator and discriminator + +In a regular (unconditional) GAN, we start by sampling noise (of some fixed +dimension) from a normal distribution. In our case, we also need to account +for the class labels. We will have to add the number of classes to +the input channels of the generator (noise input) as well as the discriminator +(generated image input). +""" + +generator_in_channels = latent_dim + num_classes +discriminator_in_channels = num_channels + num_classes +print(generator_in_channels, discriminator_in_channels) + +""" +## Creating the discriminator and generator + +The model definitions (`discriminator`, `generator`, and `ConditionalGAN`) have been +adapted from [this example](https://keras.io/guides/customizing_what_happens_in_fit/). +""" + +# Create the discriminator. +discriminator = keras.Sequential( + [ + keras.layers.InputLayer((28, 28, discriminator_in_channels)), + layers.Conv2D(64, (3, 3), strides=(2, 2), padding="same"), + layers.LeakyReLU(negative_slope=0.2), + layers.Conv2D(128, (3, 3), strides=(2, 2), padding="same"), + layers.LeakyReLU(negative_slope=0.2), + layers.GlobalMaxPooling2D(), + layers.Dense(1), + ], + name="discriminator", +) + +# Create the generator. +generator = keras.Sequential( + [ + keras.layers.InputLayer((generator_in_channels,)), + # We want to generate 128 + num_classes coefficients to reshape into a + # 7x7x(128 + num_classes) map. 
+ layers.Dense(7 * 7 * generator_in_channels), + layers.LeakyReLU(negative_slope=0.2), + layers.Reshape((7, 7, generator_in_channels)), + layers.Conv2DTranspose(128, (4, 4), strides=(2, 2), padding="same"), + layers.LeakyReLU(negative_slope=0.2), + layers.Conv2DTranspose(128, (4, 4), strides=(2, 2), padding="same"), + layers.LeakyReLU(negative_slope=0.2), + layers.Conv2D(1, (7, 7), padding="same", activation="sigmoid"), + ], + name="generator", +) + +""" +## Creating a `ConditionalGAN` model +""" + + +class ConditionalGAN(keras.Model): + def __init__(self, discriminator, generator, latent_dim): + super().__init__() + self.discriminator = discriminator + self.generator = generator + self.latent_dim = latent_dim + self.seed_generator = keras.random.SeedGenerator(1337) + self.gen_loss_tracker = keras.metrics.Mean(name="generator_loss") + self.disc_loss_tracker = keras.metrics.Mean(name="discriminator_loss") + + @property + def metrics(self): + return [self.gen_loss_tracker, self.disc_loss_tracker] + + def compile(self, d_optimizer, g_optimizer, loss_fn): + super().compile() + self.d_optimizer = d_optimizer + self.g_optimizer = g_optimizer + self.loss_fn = loss_fn + + def train_step(self, data): + # Unpack the data. + real_images, one_hot_labels = data + + # Add dummy dimensions to the labels so that they can be concatenated with + # the images. This is for the discriminator. + image_one_hot_labels = one_hot_labels[:, :, None, None] + image_one_hot_labels = ops.repeat( + image_one_hot_labels, repeats=[image_size * image_size] + ) + image_one_hot_labels = ops.reshape( + image_one_hot_labels, (-1, image_size, image_size, num_classes) + ) + + # Sample random points in the latent space and concatenate the labels. + # This is for the generator. + batch_size = ops.shape(real_images)[0] + random_latent_vectors = keras.random.normal( + shape=(batch_size, self.latent_dim), seed=self.seed_generator + ) + random_vector_labels = ops.concatenate( + [random_latent_vectors, one_hot_labels], axis=1 + ) + + # Decode the noise (guided by labels) to fake images. + generated_images = self.generator(random_vector_labels) + + # Combine them with real images. Note that we are concatenating the labels + # with these images here. + fake_image_and_labels = ops.concatenate( + [generated_images, image_one_hot_labels], -1 + ) + real_image_and_labels = ops.concatenate([real_images, image_one_hot_labels], -1) + combined_images = ops.concatenate( + [fake_image_and_labels, real_image_and_labels], axis=0 + ) + + # Assemble labels discriminating real from fake images. + labels = ops.concatenate( + [ops.ones((batch_size, 1)), ops.zeros((batch_size, 1))], axis=0 + ) + + # Train the discriminator. + with tf.GradientTape() as tape: + predictions = self.discriminator(combined_images) + d_loss = self.loss_fn(labels, predictions) + grads = tape.gradient(d_loss, self.discriminator.trainable_weights) + self.d_optimizer.apply_gradients( + zip(grads, self.discriminator.trainable_weights) + ) + + # Sample random points in the latent space. + random_latent_vectors = keras.random.normal( + shape=(batch_size, self.latent_dim), seed=self.seed_generator + ) + random_vector_labels = ops.concatenate( + [random_latent_vectors, one_hot_labels], axis=1 + ) + + # Assemble labels that say "all real images". + misleading_labels = ops.zeros((batch_size, 1)) + + # Train the generator (note that we should *not* update the weights + # of the discriminator)! 
+ with tf.GradientTape() as tape: + fake_images = self.generator(random_vector_labels) + fake_image_and_labels = ops.concatenate( + [fake_images, image_one_hot_labels], -1 + ) + predictions = self.discriminator(fake_image_and_labels) + g_loss = self.loss_fn(misleading_labels, predictions) + grads = tape.gradient(g_loss, self.generator.trainable_weights) + self.g_optimizer.apply_gradients(zip(grads, self.generator.trainable_weights)) + + # Monitor loss. + self.gen_loss_tracker.update_state(g_loss) + self.disc_loss_tracker.update_state(d_loss) + return { + "g_loss": self.gen_loss_tracker.result(), + "d_loss": self.disc_loss_tracker.result(), + } + + +""" +## Training the Conditional GAN +""" + +cond_gan = ConditionalGAN( + discriminator=discriminator, generator=generator, latent_dim=latent_dim +) +cond_gan.compile( + d_optimizer=keras.optimizers.Adam(learning_rate=0.0003), + g_optimizer=keras.optimizers.Adam(learning_rate=0.0003), + loss_fn=keras.losses.BinaryCrossentropy(from_logits=True), +) + +cond_gan.fit(dataset, epochs=20) + +""" +## Interpolating between classes with the trained generator +""" + +# We first extract the trained generator from our Conditional GAN. +trained_gen = cond_gan.generator + +# Choose the number of intermediate images that would be generated in +# between the interpolation + 2 (start and last images). +num_interpolation = 9 # @param {type:"integer"} + +# Sample noise for the interpolation. +interpolation_noise = keras.random.normal(shape=(1, latent_dim)) +interpolation_noise = ops.repeat(interpolation_noise, repeats=num_interpolation) +interpolation_noise = ops.reshape(interpolation_noise, (num_interpolation, latent_dim)) + + +def interpolate_class(first_number, second_number): + # Convert the start and end labels to one-hot encoded vectors. + first_label = keras.utils.to_categorical([first_number], num_classes) + second_label = keras.utils.to_categorical([second_number], num_classes) + first_label = ops.cast(first_label, "float32") + second_label = ops.cast(second_label, "float32") + + # Calculate the interpolation vector between the two labels. + percent_second_label = ops.linspace(0, 1, num_interpolation)[:, None] + percent_second_label = ops.cast(percent_second_label, "float32") + interpolation_labels = ( + first_label * (1 - percent_second_label) + second_label * percent_second_label + ) + + # Combine the noise and the labels and run inference with the generator. + noise_and_labels = ops.concatenate([interpolation_noise, interpolation_labels], 1) + fake = trained_gen.predict(noise_and_labels) + return fake + + +start_class = 2 # @param {type:"slider", min:0, max:9, step:1} +end_class = 6 # @param {type:"slider", min:0, max:9, step:1} + +fake_images = interpolate_class(start_class, end_class) + +""" +Here, we first sample noise from a normal distribution and then we repeat that for +`num_interpolation` times and reshape the result accordingly. +We then distribute it uniformly for `num_interpolation` +with the label identities being present in some proportion. +""" + +fake_images *= 255.0 +converted_images = fake_images.astype(np.uint8) +converted_images = ops.image.resize(converted_images, (96, 96)).numpy().astype(np.uint8) +imageio.mimsave("animation.gif", converted_images[:, :, :, 0], fps=1) +embed.embed_file("animation.gif") + +""" +We can further improve the performance of this model with recipes like +[WGAN-GP](https://keras.io/examples/generative/wgan_gp). 
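+
+Independently of such improvements, the trained generator can already be used to sample
+a specific class directly. A minimal sketch (the digit and the number of samples are
+arbitrary choices):
+
+```python
+digit = 3        # class to generate (0-9)
+num_samples = 8
+
+noise = keras.random.normal(shape=(num_samples, latent_dim))
+labels = ops.cast(keras.utils.to_categorical([digit] * num_samples, num_classes), "float32")
+generated = trained_gen.predict(ops.concatenate([noise, labels], axis=1))
+# `generated` has shape (num_samples, 28, 28, 1) with pixel values in [0, 1].
+```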
+Conditional generation is also widely used in many modern image generation architectures like +[VQ-GANs](https://arxiv.org/abs/2012.09841), [DALL-E](https://openai.com/blog/dall-e/), +etc. + +You can use the trained model hosted on [Hugging Face Hub](https://huggingface.co/keras-io/conditional-gan) and try the demo on [Hugging Face Spaces](https://huggingface.co/spaces/keras-io/conditional-GAN). +""" diff --git a/knowledge_base/generative/cyclegan.py b/knowledge_base/generative/cyclegan.py new file mode 100644 index 0000000000000000000000000000000000000000..bef76f5d9b62973527d458c1ae9de5d210880eac --- /dev/null +++ b/knowledge_base/generative/cyclegan.py @@ -0,0 +1,663 @@ +""" +Title: CycleGAN +Author: [A_K_Nain](https://twitter.com/A_K_Nain) +Date created: 2020/08/12 +Last modified: 2024/09/30 +Description: Implementation of CycleGAN. +Accelerator: GPU +""" + +""" +## CycleGAN + +CycleGAN is a model that aims to solve the image-to-image translation +problem. The goal of the image-to-image translation problem is to learn the +mapping between an input image and an output image using a training set of +aligned image pairs. However, obtaining paired examples isn't always feasible. +CycleGAN tries to learn this mapping without requiring paired input-output images, +using cycle-consistent adversarial networks. + +- [Paper](https://arxiv.org/abs/1703.10593) +- [Original implementation](https://github.com/junyanz/pytorch-CycleGAN-and-pix2pix) +""" + +""" +## Setup +""" + +import os +import numpy as np +import matplotlib.pyplot as plt +import tensorflow as tf +import keras +from keras import layers, ops +import tensorflow_datasets as tfds + +tfds.disable_progress_bar() +autotune = tf.data.AUTOTUNE + +os.environ["KERAS_BACKEND"] = "tensorflow" + +""" +## Prepare the dataset + +In this example, we will be using the +[horse to zebra](https://www.tensorflow.org/datasets/catalog/cycle_gan#cycle_ganhorse2zebra) +dataset. +""" + +# Load the horse-zebra dataset using tensorflow-datasets. +dataset, _ = tfds.load(name="cycle_gan/horse2zebra", with_info=True, as_supervised=True) +train_horses, train_zebras = dataset["trainA"], dataset["trainB"] +test_horses, test_zebras = dataset["testA"], dataset["testB"] + +# Define the standard image size. +orig_img_size = (286, 286) +# Size of the random crops to be used during training. +input_img_size = (256, 256, 3) +# Weights initializer for the layers. +kernel_init = keras.initializers.RandomNormal(mean=0.0, stddev=0.02) +# Gamma initializer for instance normalization. +gamma_init = keras.initializers.RandomNormal(mean=0.0, stddev=0.02) + +buffer_size = 256 +batch_size = 1 + + +def normalize_img(img): + img = ops.cast(img, dtype=tf.float32) + # Map values in the range [-1, 1] + return (img / 127.5) - 1.0 + + +def preprocess_train_image(img, label): + # Random flip + img = tf.image.random_flip_left_right(img) + # Resize to the original size first + img = ops.image.resize(img, [*orig_img_size]) + # Random crop to 256X256 + img = tf.image.random_crop(img, size=[*input_img_size]) + # Normalize the pixel values in the range [-1, 1] + img = normalize_img(img) + return img + + +def preprocess_test_image(img, label): + # Only resizing and normalization for the test images. 
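+    # (No random flip or crop here, so test-time preprocessing stays deterministic.)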
+ img = ops.image.resize(img, [input_img_size[0], input_img_size[1]]) + img = normalize_img(img) + return img + + +""" +## Create `Dataset` objects +""" + + +# Apply the preprocessing operations to the training data +train_horses = ( + train_horses.map(preprocess_train_image, num_parallel_calls=autotune) + .cache() + .shuffle(buffer_size) + .batch(batch_size) +) +train_zebras = ( + train_zebras.map(preprocess_train_image, num_parallel_calls=autotune) + .cache() + .shuffle(buffer_size) + .batch(batch_size) +) + +# Apply the preprocessing operations to the test data +test_horses = ( + test_horses.map(preprocess_test_image, num_parallel_calls=autotune) + .cache() + .shuffle(buffer_size) + .batch(batch_size) +) +test_zebras = ( + test_zebras.map(preprocess_test_image, num_parallel_calls=autotune) + .cache() + .shuffle(buffer_size) + .batch(batch_size) +) + + +""" +## Visualize some samples +""" + + +_, ax = plt.subplots(4, 2, figsize=(10, 15)) +for i, samples in enumerate(zip(train_horses.take(4), train_zebras.take(4))): + horse = (((samples[0][0] * 127.5) + 127.5).numpy()).astype(np.uint8) + zebra = (((samples[1][0] * 127.5) + 127.5).numpy()).astype(np.uint8) + ax[i, 0].imshow(horse) + ax[i, 1].imshow(zebra) +plt.show() + + +""" +## Building blocks used in the CycleGAN generators and discriminators +""" + + +class ReflectionPadding2D(layers.Layer): + """Implements Reflection Padding as a layer. + + Args: + padding(tuple): Amount of padding for the + spatial dimensions. + + Returns: + A padded tensor with the same type as the input tensor. + """ + + def __init__(self, padding=(1, 1), **kwargs): + self.padding = tuple(padding) + super().__init__(**kwargs) + + def call(self, input_tensor, mask=None): + padding_width, padding_height = self.padding + padding_tensor = [ + [0, 0], + [padding_height, padding_height], + [padding_width, padding_width], + [0, 0], + ] + return ops.pad(input_tensor, padding_tensor, mode="REFLECT") + + +def residual_block( + x, + activation, + kernel_initializer=kernel_init, + kernel_size=(3, 3), + strides=(1, 1), + padding="valid", + gamma_initializer=gamma_init, + use_bias=False, +): + dim = x.shape[-1] + input_tensor = x + + x = ReflectionPadding2D()(input_tensor) + x = layers.Conv2D( + dim, + kernel_size, + strides=strides, + kernel_initializer=kernel_initializer, + padding=padding, + use_bias=use_bias, + )(x) + x = keras.layers.GroupNormalization(groups=1, gamma_initializer=gamma_initializer)( + x + ) + x = activation(x) + + x = ReflectionPadding2D()(x) + x = layers.Conv2D( + dim, + kernel_size, + strides=strides, + kernel_initializer=kernel_initializer, + padding=padding, + use_bias=use_bias, + )(x) + x = keras.layers.GroupNormalization(groups=1, gamma_initializer=gamma_initializer)( + x + ) + x = layers.add([input_tensor, x]) + return x + + +def downsample( + x, + filters, + activation, + kernel_initializer=kernel_init, + kernel_size=(3, 3), + strides=(2, 2), + padding="same", + gamma_initializer=gamma_init, + use_bias=False, +): + x = layers.Conv2D( + filters, + kernel_size, + strides=strides, + kernel_initializer=kernel_initializer, + padding=padding, + use_bias=use_bias, + )(x) + x = keras.layers.GroupNormalization(groups=1, gamma_initializer=gamma_initializer)( + x + ) + if activation: + x = activation(x) + return x + + +def upsample( + x, + filters, + activation, + kernel_size=(3, 3), + strides=(2, 2), + padding="same", + kernel_initializer=kernel_init, + gamma_initializer=gamma_init, + use_bias=False, +): + x = layers.Conv2DTranspose( + filters, + kernel_size, 
+ strides=strides, + padding=padding, + kernel_initializer=kernel_initializer, + use_bias=use_bias, + )(x) + x = keras.layers.GroupNormalization(groups=1, gamma_initializer=gamma_initializer)( + x + ) + if activation: + x = activation(x) + return x + + +""" +## Build the generators + +The generator consists of downsampling blocks: nine residual blocks +and upsampling blocks. The structure of the generator is the following: + +``` +c7s1-64 ==> Conv block with `relu` activation, filter size of 7 +d128 ====| + |-> 2 downsampling blocks +d256 ====| +R256 ====| +R256 | +R256 | +R256 | +R256 |-> 9 residual blocks +R256 | +R256 | +R256 | +R256 ====| +u128 ====| + |-> 2 upsampling blocks +u64 ====| +c7s1-3 => Last conv block with `tanh` activation, filter size of 7. +``` +""" + + +def get_resnet_generator( + filters=64, + num_downsampling_blocks=2, + num_residual_blocks=9, + num_upsample_blocks=2, + gamma_initializer=gamma_init, + name=None, +): + img_input = layers.Input(shape=input_img_size, name=name + "_img_input") + x = ReflectionPadding2D(padding=(3, 3))(img_input) + x = layers.Conv2D(filters, (7, 7), kernel_initializer=kernel_init, use_bias=False)( + x + ) + x = keras.layers.GroupNormalization(groups=1, gamma_initializer=gamma_initializer)( + x + ) + x = layers.Activation("relu")(x) + + # Downsampling + for _ in range(num_downsampling_blocks): + filters *= 2 + x = downsample(x, filters=filters, activation=layers.Activation("relu")) + + # Residual blocks + for _ in range(num_residual_blocks): + x = residual_block(x, activation=layers.Activation("relu")) + + # Upsampling + for _ in range(num_upsample_blocks): + filters //= 2 + x = upsample(x, filters, activation=layers.Activation("relu")) + + # Final block + x = ReflectionPadding2D(padding=(3, 3))(x) + x = layers.Conv2D(3, (7, 7), padding="valid")(x) + x = layers.Activation("tanh")(x) + + model = keras.models.Model(img_input, x, name=name) + return model + + +""" +## Build the discriminators + +The discriminators implement the following architecture: +`C64->C128->C256->C512` +""" + + +def get_discriminator( + filters=64, kernel_initializer=kernel_init, num_downsampling=3, name=None +): + img_input = layers.Input(shape=input_img_size, name=name + "_img_input") + x = layers.Conv2D( + filters, + (4, 4), + strides=(2, 2), + padding="same", + kernel_initializer=kernel_initializer, + )(img_input) + x = layers.LeakyReLU(0.2)(x) + + num_filters = filters + for num_downsample_block in range(3): + num_filters *= 2 + if num_downsample_block < 2: + x = downsample( + x, + filters=num_filters, + activation=layers.LeakyReLU(0.2), + kernel_size=(4, 4), + strides=(2, 2), + ) + else: + x = downsample( + x, + filters=num_filters, + activation=layers.LeakyReLU(0.2), + kernel_size=(4, 4), + strides=(1, 1), + ) + + x = layers.Conv2D( + 1, (4, 4), strides=(1, 1), padding="same", kernel_initializer=kernel_initializer + )(x) + + model = keras.models.Model(inputs=img_input, outputs=x, name=name) + return model + + +# Get the generators +gen_G = get_resnet_generator(name="generator_G") +gen_F = get_resnet_generator(name="generator_F") + +# Get the discriminators +disc_X = get_discriminator(name="discriminator_X") +disc_Y = get_discriminator(name="discriminator_Y") + + +""" +## Build the CycleGAN model + +We will override the `train_step()` method of the `Model` class +for training via `fit()`. 
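+
+Schematically, the total loss assembled for `gen_G` in `train_step()` below combines an
+adversarial term, a cycle-consistency term, and an identity term (`mae` is shorthand here
+for the `MeanAbsoluteError` losses used in the implementation; `gen_F` is handled
+symmetrically):
+
+```
+total_loss_G = (
+    gen_loss_fn(disc_Y(gen_G(real_x)))                             # adversarial
+    + lambda_cycle * mae(real_y, gen_G(gen_F(real_y)))             # cycle-consistency
+    + lambda_cycle * lambda_identity * mae(real_y, gen_G(real_y))  # identity
+)
+```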
+""" + + +class CycleGan(keras.Model): + def __init__( + self, + generator_G, + generator_F, + discriminator_X, + discriminator_Y, + lambda_cycle=10.0, + lambda_identity=0.5, + ): + super().__init__() + self.gen_G = generator_G + self.gen_F = generator_F + self.disc_X = discriminator_X + self.disc_Y = discriminator_Y + self.lambda_cycle = lambda_cycle + self.lambda_identity = lambda_identity + + def call(self, inputs): + return ( + self.disc_X(inputs), + self.disc_Y(inputs), + self.gen_G(inputs), + self.gen_F(inputs), + ) + + def compile( + self, + gen_G_optimizer, + gen_F_optimizer, + disc_X_optimizer, + disc_Y_optimizer, + gen_loss_fn, + disc_loss_fn, + ): + super().compile() + self.gen_G_optimizer = gen_G_optimizer + self.gen_F_optimizer = gen_F_optimizer + self.disc_X_optimizer = disc_X_optimizer + self.disc_Y_optimizer = disc_Y_optimizer + self.generator_loss_fn = gen_loss_fn + self.discriminator_loss_fn = disc_loss_fn + self.cycle_loss_fn = keras.losses.MeanAbsoluteError() + self.identity_loss_fn = keras.losses.MeanAbsoluteError() + + def train_step(self, batch_data): + # x is Horse and y is zebra + real_x, real_y = batch_data + + # For CycleGAN, we need to calculate different + # kinds of losses for the generators and discriminators. + # We will perform the following steps here: + # + # 1. Pass real images through the generators and get the generated images + # 2. Pass the generated images back to the generators to check if we + # can predict the original image from the generated image. + # 3. Do an identity mapping of the real images using the generators. + # 4. Pass the generated images in 1) to the corresponding discriminators. + # 5. Calculate the generators total loss (adversarial + cycle + identity) + # 6. Calculate the discriminators loss + # 7. Update the weights of the generators + # 8. Update the weights of the discriminators + # 9. 
Return the losses in a dictionary + + with tf.GradientTape(persistent=True) as tape: + # Horse to fake zebra + fake_y = self.gen_G(real_x, training=True) + # Zebra to fake horse -> y2x + fake_x = self.gen_F(real_y, training=True) + + # Cycle (Horse to fake zebra to fake horse): x -> y -> x + cycled_x = self.gen_F(fake_y, training=True) + # Cycle (Zebra to fake horse to fake zebra) y -> x -> y + cycled_y = self.gen_G(fake_x, training=True) + + # Identity mapping + same_x = self.gen_F(real_x, training=True) + same_y = self.gen_G(real_y, training=True) + + # Discriminator output + disc_real_x = self.disc_X(real_x, training=True) + disc_fake_x = self.disc_X(fake_x, training=True) + + disc_real_y = self.disc_Y(real_y, training=True) + disc_fake_y = self.disc_Y(fake_y, training=True) + + # Generator adversarial loss + gen_G_loss = self.generator_loss_fn(disc_fake_y) + gen_F_loss = self.generator_loss_fn(disc_fake_x) + + # Generator cycle loss + cycle_loss_G = self.cycle_loss_fn(real_y, cycled_y) * self.lambda_cycle + cycle_loss_F = self.cycle_loss_fn(real_x, cycled_x) * self.lambda_cycle + + # Generator identity loss + id_loss_G = ( + self.identity_loss_fn(real_y, same_y) + * self.lambda_cycle + * self.lambda_identity + ) + id_loss_F = ( + self.identity_loss_fn(real_x, same_x) + * self.lambda_cycle + * self.lambda_identity + ) + + # Total generator loss + total_loss_G = gen_G_loss + cycle_loss_G + id_loss_G + total_loss_F = gen_F_loss + cycle_loss_F + id_loss_F + + # Discriminator loss + disc_X_loss = self.discriminator_loss_fn(disc_real_x, disc_fake_x) + disc_Y_loss = self.discriminator_loss_fn(disc_real_y, disc_fake_y) + + # Get the gradients for the generators + grads_G = tape.gradient(total_loss_G, self.gen_G.trainable_variables) + grads_F = tape.gradient(total_loss_F, self.gen_F.trainable_variables) + + # Get the gradients for the discriminators + disc_X_grads = tape.gradient(disc_X_loss, self.disc_X.trainable_variables) + disc_Y_grads = tape.gradient(disc_Y_loss, self.disc_Y.trainable_variables) + + # Update the weights of the generators + self.gen_G_optimizer.apply_gradients( + zip(grads_G, self.gen_G.trainable_variables) + ) + self.gen_F_optimizer.apply_gradients( + zip(grads_F, self.gen_F.trainable_variables) + ) + + # Update the weights of the discriminators + self.disc_X_optimizer.apply_gradients( + zip(disc_X_grads, self.disc_X.trainable_variables) + ) + self.disc_Y_optimizer.apply_gradients( + zip(disc_Y_grads, self.disc_Y.trainable_variables) + ) + + return { + "G_loss": total_loss_G, + "F_loss": total_loss_F, + "D_X_loss": disc_X_loss, + "D_Y_loss": disc_Y_loss, + } + + +""" +## Create a callback that periodically saves generated images +""" + + +class GANMonitor(keras.callbacks.Callback): + """A callback to generate and save images after each epoch""" + + def __init__(self, num_img=4): + self.num_img = num_img + + def on_epoch_end(self, epoch, logs=None): + _, ax = plt.subplots(4, 2, figsize=(12, 12)) + for i, img in enumerate(test_horses.take(self.num_img)): + prediction = self.model.gen_G(img)[0].numpy() + prediction = (prediction * 127.5 + 127.5).astype(np.uint8) + img = (img[0] * 127.5 + 127.5).numpy().astype(np.uint8) + + ax[i, 0].imshow(img) + ax[i, 1].imshow(prediction) + ax[i, 0].set_title("Input image") + ax[i, 1].set_title("Translated image") + ax[i, 0].axis("off") + ax[i, 1].axis("off") + + prediction = keras.utils.array_to_img(prediction) + prediction.save( + "generated_img_{i}_{epoch}.png".format(i=i, epoch=epoch + 1) + ) + plt.show() + plt.close() + + +""" +## Train 
the end-to-end model
+"""
+
+
+# Loss function for evaluating adversarial loss
+adv_loss_fn = keras.losses.MeanSquaredError()
+
+# Define the loss function for the generators
+
+
+def generator_loss_fn(fake):
+    fake_loss = adv_loss_fn(ops.ones_like(fake), fake)
+    return fake_loss
+
+
+# Define the loss function for the discriminators
+def discriminator_loss_fn(real, fake):
+    real_loss = adv_loss_fn(ops.ones_like(real), real)
+    fake_loss = adv_loss_fn(ops.zeros_like(fake), fake)
+    return (real_loss + fake_loss) * 0.5
+
+
+# Create the CycleGAN model
+cycle_gan_model = CycleGan(
+    generator_G=gen_G, generator_F=gen_F, discriminator_X=disc_X, discriminator_Y=disc_Y
+)
+
+# Compile the model
+cycle_gan_model.compile(
+    gen_G_optimizer=keras.optimizers.Adam(learning_rate=2e-4, beta_1=0.5),
+    gen_F_optimizer=keras.optimizers.Adam(learning_rate=2e-4, beta_1=0.5),
+    disc_X_optimizer=keras.optimizers.Adam(learning_rate=2e-4, beta_1=0.5),
+    disc_Y_optimizer=keras.optimizers.Adam(learning_rate=2e-4, beta_1=0.5),
+    gen_loss_fn=generator_loss_fn,
+    disc_loss_fn=discriminator_loss_fn,
+)
+# Callbacks
+plotter = GANMonitor()
+checkpoint_filepath = "./model_checkpoints/cyclegan_checkpoints.weights.h5"
+model_checkpoint_callback = keras.callbacks.ModelCheckpoint(
+    filepath=checkpoint_filepath, save_weights_only=True
+)
+
+# Each epoch takes around 7 minutes on a single P100-backed machine, so the full
+# 90 epochs below take several hours; reduce `epochs` for a quick test run.
+cycle_gan_model.fit(
+    tf.data.Dataset.zip((train_horses, train_zebras)),
+    epochs=90,
+    callbacks=[plotter, model_checkpoint_callback],
+)
+
+"""
+Test the performance of the model.
+"""
+
+
+# Once the weights are loaded, we will take a few samples from the test data and check the model's performance.
+
+
+# Load the checkpoints
+cycle_gan_model.load_weights(checkpoint_filepath)
+print("Weights loaded successfully")
+
+_, ax = plt.subplots(4, 2, figsize=(10, 15))
+for i, img in enumerate(test_horses.take(4)):
+    prediction = cycle_gan_model.gen_G(img, training=False)[0].numpy()
+    prediction = (prediction * 127.5 + 127.5).astype(np.uint8)
+    img = (img[0] * 127.5 + 127.5).numpy().astype(np.uint8)
+
+    ax[i, 0].imshow(img)
+    ax[i, 1].imshow(prediction)
+    ax[i, 0].set_title("Input image")
+    ax[i, 1].set_title("Translated image")
+    ax[i, 0].axis("off")
+    ax[i, 1].axis("off")
+
+    prediction = keras.utils.array_to_img(prediction)
+    prediction.save("predicted_img_{i}.png".format(i=i))
+plt.tight_layout()
+plt.show()
diff --git a/knowledge_base/generative/dcgan_overriding_train_step.py b/knowledge_base/generative/dcgan_overriding_train_step.py
new file mode 100644
index 0000000000000000000000000000000000000000..26779c76cf0fc5ab7e8fd48f9d80541e9cf43029
--- /dev/null
+++ b/knowledge_base/generative/dcgan_overriding_train_step.py
@@ -0,0 +1,233 @@
+"""
+Title: DCGAN to generate face images
+Author: [fchollet](https://twitter.com/fchollet)
+Date created: 2019/04/29
+Last modified: 2023/12/21
+Description: A simple DCGAN trained using `fit()` by overriding `train_step` on CelebA images.
+Accelerator: GPU
+"""
+
+"""
+## Setup
+"""
+
+import keras
+import tensorflow as tf
+
+from keras import layers
+from keras import ops
+import matplotlib.pyplot as plt
+import os
+import gdown
+from zipfile import ZipFile
+
+
+"""
+## Prepare CelebA data
+
+We'll use face images from the CelebA dataset, resized to 64x64.
+""" + +os.makedirs("celeba_gan") + +url = "https://drive.google.com/uc?id=1O7m1010EJjLE5QxLZiM9Fpjs7Oj6e684" +output = "celeba_gan/data.zip" +gdown.download(url, output, quiet=True) + +with ZipFile("celeba_gan/data.zip", "r") as zipobj: + zipobj.extractall("celeba_gan") + +""" +Create a dataset from our folder, and rescale the images to the [0-1] range: +""" + +dataset = keras.utils.image_dataset_from_directory( + "celeba_gan", label_mode=None, image_size=(64, 64), batch_size=32 +) +dataset = dataset.map(lambda x: x / 255.0) + + +""" +Let's display a sample image: +""" + + +for x in dataset: + plt.axis("off") + plt.imshow((x.numpy() * 255).astype("int32")[0]) + break + + +""" +## Create the discriminator + +It maps a 64x64 image to a binary classification score. +""" + +discriminator = keras.Sequential( + [ + keras.Input(shape=(64, 64, 3)), + layers.Conv2D(64, kernel_size=4, strides=2, padding="same"), + layers.LeakyReLU(negative_slope=0.2), + layers.Conv2D(128, kernel_size=4, strides=2, padding="same"), + layers.LeakyReLU(negative_slope=0.2), + layers.Conv2D(128, kernel_size=4, strides=2, padding="same"), + layers.LeakyReLU(negative_slope=0.2), + layers.Flatten(), + layers.Dropout(0.2), + layers.Dense(1, activation="sigmoid"), + ], + name="discriminator", +) +discriminator.summary() + +""" +## Create the generator + +It mirrors the discriminator, replacing `Conv2D` layers with `Conv2DTranspose` layers. +""" + +latent_dim = 128 + +generator = keras.Sequential( + [ + keras.Input(shape=(latent_dim,)), + layers.Dense(8 * 8 * 128), + layers.Reshape((8, 8, 128)), + layers.Conv2DTranspose(128, kernel_size=4, strides=2, padding="same"), + layers.LeakyReLU(negative_slope=0.2), + layers.Conv2DTranspose(256, kernel_size=4, strides=2, padding="same"), + layers.LeakyReLU(negative_slope=0.2), + layers.Conv2DTranspose(512, kernel_size=4, strides=2, padding="same"), + layers.LeakyReLU(negative_slope=0.2), + layers.Conv2D(3, kernel_size=5, padding="same", activation="sigmoid"), + ], + name="generator", +) +generator.summary() + +""" +## Override `train_step` +""" + + +class GAN(keras.Model): + def __init__(self, discriminator, generator, latent_dim): + super().__init__() + self.discriminator = discriminator + self.generator = generator + self.latent_dim = latent_dim + self.seed_generator = keras.random.SeedGenerator(1337) + + def compile(self, d_optimizer, g_optimizer, loss_fn): + super().compile() + self.d_optimizer = d_optimizer + self.g_optimizer = g_optimizer + self.loss_fn = loss_fn + self.d_loss_metric = keras.metrics.Mean(name="d_loss") + self.g_loss_metric = keras.metrics.Mean(name="g_loss") + + @property + def metrics(self): + return [self.d_loss_metric, self.g_loss_metric] + + def train_step(self, real_images): + # Sample random points in the latent space + batch_size = ops.shape(real_images)[0] + random_latent_vectors = keras.random.normal( + shape=(batch_size, self.latent_dim), seed=self.seed_generator + ) + + # Decode them to fake images + generated_images = self.generator(random_latent_vectors) + + # Combine them with real images + combined_images = ops.concatenate([generated_images, real_images], axis=0) + + # Assemble labels discriminating real from fake images + labels = ops.concatenate( + [ops.ones((batch_size, 1)), ops.zeros((batch_size, 1))], axis=0 + ) + # Add random noise to the labels - important trick! 
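+        # (Noisy targets act like label smoothing: they keep the discriminator from
+        # becoming overconfident, which tends to stabilize GAN training.)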
+ labels += 0.05 * tf.random.uniform(tf.shape(labels)) + + # Train the discriminator + with tf.GradientTape() as tape: + predictions = self.discriminator(combined_images) + d_loss = self.loss_fn(labels, predictions) + grads = tape.gradient(d_loss, self.discriminator.trainable_weights) + self.d_optimizer.apply_gradients( + zip(grads, self.discriminator.trainable_weights) + ) + + # Sample random points in the latent space + random_latent_vectors = keras.random.normal( + shape=(batch_size, self.latent_dim), seed=self.seed_generator + ) + + # Assemble labels that say "all real images" + misleading_labels = ops.zeros((batch_size, 1)) + + # Train the generator (note that we should *not* update the weights + # of the discriminator)! + with tf.GradientTape() as tape: + predictions = self.discriminator(self.generator(random_latent_vectors)) + g_loss = self.loss_fn(misleading_labels, predictions) + grads = tape.gradient(g_loss, self.generator.trainable_weights) + self.g_optimizer.apply_gradients(zip(grads, self.generator.trainable_weights)) + + # Update metrics + self.d_loss_metric.update_state(d_loss) + self.g_loss_metric.update_state(g_loss) + return { + "d_loss": self.d_loss_metric.result(), + "g_loss": self.g_loss_metric.result(), + } + + +""" +## Create a callback that periodically saves generated images +""" + + +class GANMonitor(keras.callbacks.Callback): + def __init__(self, num_img=3, latent_dim=128): + self.num_img = num_img + self.latent_dim = latent_dim + self.seed_generator = keras.random.SeedGenerator(42) + + def on_epoch_end(self, epoch, logs=None): + random_latent_vectors = keras.random.normal( + shape=(self.num_img, self.latent_dim), seed=self.seed_generator + ) + generated_images = self.model.generator(random_latent_vectors) + generated_images *= 255 + generated_images.numpy() + for i in range(self.num_img): + img = keras.utils.array_to_img(generated_images[i]) + img.save("generated_img_%03d_%d.png" % (epoch, i)) + + +""" +## Train the end-to-end model +""" + +epochs = 1 # In practice, use ~100 epochs + +gan = GAN(discriminator=discriminator, generator=generator, latent_dim=latent_dim) +gan.compile( + d_optimizer=keras.optimizers.Adam(learning_rate=0.0001), + g_optimizer=keras.optimizers.Adam(learning_rate=0.0001), + loss_fn=keras.losses.BinaryCrossentropy(), +) + +gan.fit( + dataset, epochs=epochs, callbacks=[GANMonitor(num_img=10, latent_dim=latent_dim)] +) + +""" +Some of the last generated images around epoch 30 +(results keep improving after that): + +![results](https://i.imgur.com/h5MtQZ7l.png) +""" diff --git a/knowledge_base/generative/ddim.py b/knowledge_base/generative/ddim.py new file mode 100644 index 0000000000000000000000000000000000000000..bd533ee4ff6ec03b588167ac5c82bdf89a4fe392 --- /dev/null +++ b/knowledge_base/generative/ddim.py @@ -0,0 +1,868 @@ +""" +Title: Denoising Diffusion Implicit Models +Author: [Andrรกs Bรฉres](https://www.linkedin.com/in/andras-beres-789190210) +Date created: 2022/06/24 +Last modified: 2022/06/24 +Description: Generating images of flowers with denoising diffusion implicit models. +Accelerator: GPU +""" + +""" +## Introduction + +### What are diffusion models? + +Recently, [denoising diffusion models](https://arxiv.org/abs/2006.11239), including +[score-based generative models](https://arxiv.org/abs/1907.05600), gained popularity as a +powerful class of generative models, that can [rival](https://arxiv.org/abs/2105.05233) +even [generative adversarial networks (GANs)](https://arxiv.org/abs/1406.2661) in image +synthesis quality. 
They tend to generate more diverse samples, while being stable to
+train and easy to scale. Recent large diffusion models, such as
+[DALL-E 2](https://openai.com/dall-e-2/) and [Imagen](https://imagen.research.google/),
+have shown incredible text-to-image generation capability. One of their drawbacks,
+however, is that they are slower to sample from, because they require multiple forward
+passes to generate an image.
+
+Diffusion refers to the process of turning a structured signal (an image) into noise
+step-by-step. By simulating diffusion, we can generate noisy images from our training
+images, and can train a neural network to try to denoise them. Using the trained network
+we can simulate the opposite of diffusion, reverse diffusion, which is the process of an
+image emerging from noise.
+
+![diffusion process gif](https://i.imgur.com/dipPOfa.gif)
+
+One-sentence summary: **diffusion models are trained to denoise noisy images, and can
+generate images by iteratively denoising pure noise.**
+
+### Goal of this example
+
+This code example intends to be a minimal but feature-complete (with a generation quality
+metric) implementation of diffusion models, with modest compute requirements and
+reasonable performance. My implementation choices and hyperparameter tuning were done
+with these goals in mind.
+
+Since the literature on diffusion models is currently
+[mathematically quite complex](https://arxiv.org/abs/2206.00364),
+with multiple theoretical frameworks
+([score matching](https://arxiv.org/abs/1907.05600),
+[differential equations](https://arxiv.org/abs/2011.13456),
+[Markov chains](https://arxiv.org/abs/2006.11239)) and sometimes even
+[conflicting notations (see Appendix C.2)](https://arxiv.org/abs/2010.02502),
+it can be daunting to understand them. My view of these models in this example will be
+that they learn to separate a noisy image into its image and Gaussian noise components.
+
+In this example, I made an effort to break down all long mathematical expressions into
+digestible pieces and gave all variables explanatory names. I also included numerous
+links to relevant literature to help interested readers dive deeper into the topic, in
+the hope that this code example will become a good starting point for practitioners
+learning about diffusion models.
+
+In the following sections, we will implement a continuous-time version of
+[Denoising Diffusion Implicit Models (DDIMs)](https://arxiv.org/abs/2010.02502)
+with deterministic sampling.
+""" + +""" +## Setup +""" + +import os + +os.environ["KERAS_BACKEND"] = "tensorflow" + +import math +import matplotlib.pyplot as plt +import tensorflow as tf +import tensorflow_datasets as tfds + +import keras +from keras import layers +from keras import ops + +""" +## Hyperparameters +""" + +# data +dataset_name = "oxford_flowers102" +dataset_repetitions = 5 +num_epochs = 1 # train for at least 50 epochs for good results +image_size = 64 +# KID = Kernel Inception Distance, see related section +kid_image_size = 75 +kid_diffusion_steps = 5 +plot_diffusion_steps = 20 + +# sampling +min_signal_rate = 0.02 +max_signal_rate = 0.95 + +# architecture +embedding_dims = 32 +embedding_max_frequency = 1000.0 +widths = [32, 64, 96, 128] +block_depth = 2 + +# optimization +batch_size = 64 +ema = 0.999 +learning_rate = 1e-3 +weight_decay = 1e-4 + +""" +## Data pipeline + +We will use the +[Oxford Flowers 102](https://www.tensorflow.org/datasets/catalog/oxford_flowers102) +dataset for +generating images of flowers, which is a diverse natural dataset containing around 8,000 +images. Unfortunately the official splits are imbalanced, as most of the images are +contained in the test split. We create new splits (80% train, 20% validation) using the +[Tensorflow Datasets slicing API](https://www.tensorflow.org/datasets/splits). We apply +center crops as preprocessing, and repeat the dataset multiple times (reason given in the +next section). +""" + + +def preprocess_image(data): + # center crop image + height = ops.shape(data["image"])[0] + width = ops.shape(data["image"])[1] + crop_size = ops.minimum(height, width) + image = tf.image.crop_to_bounding_box( + data["image"], + (height - crop_size) // 2, + (width - crop_size) // 2, + crop_size, + crop_size, + ) + + # resize and clip + # for image downsampling it is important to turn on antialiasing + image = tf.image.resize(image, size=[image_size, image_size], antialias=True) + return ops.clip(image / 255.0, 0.0, 1.0) + + +def prepare_dataset(split): + # the validation dataset is shuffled as well, because data order matters + # for the KID estimation + return ( + tfds.load(dataset_name, split=split, shuffle_files=True) + .map(preprocess_image, num_parallel_calls=tf.data.AUTOTUNE) + .cache() + .repeat(dataset_repetitions) + .shuffle(10 * batch_size) + .batch(batch_size, drop_remainder=True) + .prefetch(buffer_size=tf.data.AUTOTUNE) + ) + + +# load dataset +train_dataset = prepare_dataset("train[:80%]+validation[:80%]+test[:80%]") +val_dataset = prepare_dataset("train[80%:]+validation[80%:]+test[80%:]") + +""" +## Kernel inception distance + +[Kernel Inception Distance (KID)](https://arxiv.org/abs/1801.01401) is an image quality +metric which was proposed as a replacement for the popular +[Frechet Inception Distance (FID)](https://arxiv.org/abs/1706.08500). +I prefer KID to FID because it is simpler to +implement, can be estimated per-batch, and is computationally lighter. More details +[here](https://keras.io/examples/generative/gan_ada/#kernel-inception-distance). + +In this example, the images are evaluated at the minimal possible resolution of the +Inception network (75x75 instead of 299x299), and the metric is only measured on the +validation set for computational efficiency. We also limit the number of sampling steps +at evaluation to 5 for the same reason. 
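+
+Schematically, the per-batch estimate computed by the `KID` metric below uses a cubic
+polynomial kernel over InceptionV3 features (the within-set averages in the actual
+implementation additionally exclude the diagonal terms):
+
+```
+k(a, b) = (a @ b.T / feature_dims + 1) ** 3
+kid = mean(k(real, real)) + mean(k(fake, fake)) - 2 * mean(k(real, fake))
+```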
+ +Since the dataset is relatively small, we go over the train and validation splits +multiple times per epoch, because the KID estimation is noisy and compute-intensive, so +we want to evaluate only after many iterations, but for many iterations. + +""" + + +@keras.saving.register_keras_serializable() +class KID(keras.metrics.Metric): + def __init__(self, name, **kwargs): + super().__init__(name=name, **kwargs) + + # KID is estimated per batch and is averaged across batches + self.kid_tracker = keras.metrics.Mean(name="kid_tracker") + + # a pretrained InceptionV3 is used without its classification layer + # transform the pixel values to the 0-255 range, then use the same + # preprocessing as during pretraining + self.encoder = keras.Sequential( + [ + keras.Input(shape=(image_size, image_size, 3)), + layers.Rescaling(255.0), + layers.Resizing(height=kid_image_size, width=kid_image_size), + layers.Lambda(keras.applications.inception_v3.preprocess_input), + keras.applications.InceptionV3( + include_top=False, + input_shape=(kid_image_size, kid_image_size, 3), + weights="imagenet", + ), + layers.GlobalAveragePooling2D(), + ], + name="inception_encoder", + ) + + def polynomial_kernel(self, features_1, features_2): + feature_dimensions = ops.cast(ops.shape(features_1)[1], dtype="float32") + return ( + features_1 @ ops.transpose(features_2) / feature_dimensions + 1.0 + ) ** 3.0 + + def update_state(self, real_images, generated_images, sample_weight=None): + real_features = self.encoder(real_images, training=False) + generated_features = self.encoder(generated_images, training=False) + + # compute polynomial kernels using the two sets of features + kernel_real = self.polynomial_kernel(real_features, real_features) + kernel_generated = self.polynomial_kernel( + generated_features, generated_features + ) + kernel_cross = self.polynomial_kernel(real_features, generated_features) + + # estimate the squared maximum mean discrepancy using the average kernel values + batch_size = real_features.shape[0] + batch_size_f = ops.cast(batch_size, dtype="float32") + mean_kernel_real = ops.sum(kernel_real * (1.0 - ops.eye(batch_size))) / ( + batch_size_f * (batch_size_f - 1.0) + ) + mean_kernel_generated = ops.sum( + kernel_generated * (1.0 - ops.eye(batch_size)) + ) / (batch_size_f * (batch_size_f - 1.0)) + mean_kernel_cross = ops.mean(kernel_cross) + kid = mean_kernel_real + mean_kernel_generated - 2.0 * mean_kernel_cross + + # update the average KID estimate + self.kid_tracker.update_state(kid) + + def result(self): + return self.kid_tracker.result() + + def reset_state(self): + self.kid_tracker.reset_state() + + +""" +## Network architecture + +Here we specify the architecture of the neural network that we will use for denoising. We +build a [U-Net](https://arxiv.org/abs/1505.04597) with identical input and output +dimensions. U-Net is a popular semantic segmentation architecture, whose main idea is +that it progressively downsamples and then upsamples its input image, and adds skip +connections between layers having the same resolution. These help with gradient flow and +avoid introducing a representation bottleneck, unlike usual +[autoencoders](https://www.deeplearningbook.org/contents/autoencoders.html). Based on +this, one can view +[diffusion models as denoising autoencoders](https://benanne.github.io/2022/01/31/diffusion.html) +without a bottleneck. + +The network takes two inputs, the noisy images and the variances of their noise +components. 
The latter is required since denoising a signal requires different operations +at different levels of noise. We transform the noise variances using sinusoidal +embeddings, similarly to positional encodings used both in +[transformers](https://arxiv.org/abs/1706.03762) and +[NeRF](https://arxiv.org/abs/2003.08934). This helps the network to be +[highly sensitive](https://arxiv.org/abs/2006.10739) to the noise level, which is +crucial for good performance. We implement sinusoidal embeddings using a +[Lambda layer](https://keras.io/api/layers/core_layers/lambda/). + +Some other considerations: + +* We build the network using the +[Keras Functional API](https://keras.io/guides/functional_api/), and use +[closures](https://twitter.com/fchollet/status/1441927912836321280) to build blocks of +layers in a consistent style. +* [Diffusion models](https://arxiv.org/abs/2006.11239) embed the index of the timestep of +the diffusion process instead of the noise variance, while +[score-based models (Table 1)](https://arxiv.org/abs/2206.00364) +usually use some function of the noise level. I +prefer the latter so that we can change the sampling schedule at inference time, without +retraining the network. +* [Diffusion models](https://arxiv.org/abs/2006.11239) input the embedding to each +convolution block separately. We only input it at the start of the network for +simplicity, which in my experience barely decreases performance, because the skip and +residual connections help the information propagate through the network properly. +* In the literature it is common to use +[attention layers](https://keras.io/api/layers/attention_layers/multi_head_attention/) +at lower resolutions for better global coherence. I omitted it for simplicity. +* We disable the learnable center and scale parameters of the batch normalization layers, +since the following convolution layers make them redundant. +* We initialize the last convolution's kernel to all zeros as a good practice, making the +network predict only zeros after initialization, which is the mean of its targets. This +will improve behaviour at the start of training and make the mean squared error loss +start at exactly 1. 
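+
+As a quick sanity check of the last point: the training targets are standard-normal
+noises, so an all-zero prediction has an expected squared error of
+`E[(noise - 0)^2] = Var(noise) = 1` per component.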
+""" + + +@keras.saving.register_keras_serializable() +def sinusoidal_embedding(x): + embedding_min_frequency = 1.0 + frequencies = ops.exp( + ops.linspace( + ops.log(embedding_min_frequency), + ops.log(embedding_max_frequency), + embedding_dims // 2, + ) + ) + angular_speeds = ops.cast(2.0 * math.pi * frequencies, "float32") + embeddings = ops.concatenate( + [ops.sin(angular_speeds * x), ops.cos(angular_speeds * x)], axis=3 + ) + return embeddings + + +def ResidualBlock(width): + def apply(x): + input_width = x.shape[3] + if input_width == width: + residual = x + else: + residual = layers.Conv2D(width, kernel_size=1)(x) + x = layers.BatchNormalization(center=False, scale=False)(x) + x = layers.Conv2D(width, kernel_size=3, padding="same", activation="swish")(x) + x = layers.Conv2D(width, kernel_size=3, padding="same")(x) + x = layers.Add()([x, residual]) + return x + + return apply + + +def DownBlock(width, block_depth): + def apply(x): + x, skips = x + for _ in range(block_depth): + x = ResidualBlock(width)(x) + skips.append(x) + x = layers.AveragePooling2D(pool_size=2)(x) + return x + + return apply + + +def UpBlock(width, block_depth): + def apply(x): + x, skips = x + x = layers.UpSampling2D(size=2, interpolation="bilinear")(x) + for _ in range(block_depth): + x = layers.Concatenate()([x, skips.pop()]) + x = ResidualBlock(width)(x) + return x + + return apply + + +def get_network(image_size, widths, block_depth): + noisy_images = keras.Input(shape=(image_size, image_size, 3)) + noise_variances = keras.Input(shape=(1, 1, 1)) + + e = layers.Lambda(sinusoidal_embedding, output_shape=(1, 1, 32))(noise_variances) + e = layers.UpSampling2D(size=image_size, interpolation="nearest")(e) + + x = layers.Conv2D(widths[0], kernel_size=1)(noisy_images) + x = layers.Concatenate()([x, e]) + + skips = [] + for width in widths[:-1]: + x = DownBlock(width, block_depth)([x, skips]) + + for _ in range(block_depth): + x = ResidualBlock(widths[-1])(x) + + for width in reversed(widths[:-1]): + x = UpBlock(width, block_depth)([x, skips]) + + x = layers.Conv2D(3, kernel_size=1, kernel_initializer="zeros")(x) + + return keras.Model([noisy_images, noise_variances], x, name="residual_unet") + + +""" +This showcases the power of the Functional API. Note how we built a relatively complex +U-Net with skip connections, residual blocks, multiple inputs, and sinusoidal embeddings +in 80 lines of code! +""" + +""" +## Diffusion model + +### Diffusion schedule + +Let us say, that a diffusion process starts at time = 0, and ends at time = 1. This +variable will be called diffusion time, and can be either discrete (common in diffusion +models) or continuous (common in score-based models). I choose the latter, so that the +number of sampling steps can be changed at inference time. + +We need to have a function that tells us at each point in the diffusion process the noise +levels and signal levels of the noisy image corresponding to the actual diffusion time. +This will be called the diffusion schedule (see `diffusion_schedule()`). + +This schedule outputs two quantities: the `noise_rate` and the `signal_rate` +(corresponding to sqrt(1 - alpha) and sqrt(alpha) in the DDIM paper, respectively). We +generate the noisy image by weighting the random noise and the training image by their +corresponding rates and adding them together. 
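+
+Concretely, mirroring `train_step()` below, the mixing is a single line:
+
+```
+noisy_images = signal_rates * images + noise_rates * noises
+```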
+ +Since the (standard normal) random noises and the (normalized) images both have zero mean +and unit variance, the noise rate and signal rate can be interpreted as the standard +deviation of their components in the noisy image, while the squares of their rates can be +interpreted as their variance (or their power in the signal processing sense). The rates +will always be set so that their squared sum is 1, meaning that the noisy images will +always have unit variance, just like its unscaled components. + +We will use a simplified, continuous version of the +[cosine schedule (Section 3.2)](https://arxiv.org/abs/2102.09672), +that is quite commonly used in the literature. +This schedule is symmetric, slow towards the start and end of the diffusion process, and +it also has a nice geometric interpretation, using the +[trigonometric properties of the unit circle](https://en.wikipedia.org/wiki/Unit_circle#/media/File:Circle-trig6.svg): + +![diffusion schedule gif](https://i.imgur.com/JW9W0fA.gif) + +### Training process + +The training procedure (see `train_step()` and `denoise()`) of denoising diffusion models +is the following: we sample random diffusion times uniformly, and mix the training images +with random gaussian noises at rates corresponding to the diffusion times. Then, we train +the model to separate the noisy image to its two components. + +Usually, the neural network is trained to predict the unscaled noise component, from +which the predicted image component can be calculated using the signal and noise rates. +Pixelwise +[mean squared error](https://keras.io/api/losses/regression_losses/#mean_squared_error-function) should +be used theoretically, however I recommend using +[mean absolute error](https://keras.io/api/losses/regression_losses/#mean_absolute_error-function) +instead (similarly to +[this](https://github.com/lucidrains/denoising-diffusion-pytorch/blob/master/denoising_diffusion_pytorch/denoising_diffusion_pytorch.py#L371) +implementation), which produces better results on this dataset. + +### Sampling (reverse diffusion) + +When sampling (see `reverse_diffusion()`), at each step we take the previous estimate of +the noisy image and separate it into image and noise using our network. Then we recombine +these components using the signal and noise rate of the following step. + +Though a similar view is shown in +[Equation 12 of DDIMs](https://arxiv.org/abs/2010.02502), I believe the above explanation +of the sampling equation is not widely known. + +This example only implements the deterministic sampling procedure from DDIM, which +corresponds to *eta = 0* in the paper. One can also use stochastic sampling (in which +case the model becomes a +[Denoising Diffusion Probabilistic Model (DDPM)](https://arxiv.org/abs/2006.11239)), +where a part of the predicted noise is +replaced with the same or larger amount of random noise +([see Equation 16 and below](https://arxiv.org/abs/2010.02502)). + +Stochastic sampling can be used without retraining the network (since both models are +trained the same way), and it can improve sample quality, while on the other hand +requiring more sampling steps usually. 
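+
+In code, one deterministic sampling step (see `denoise()` and `reverse_diffusion()`
+below) boils down to:
+
+```
+pred_images = (noisy_images - noise_rates * pred_noises) / signal_rates
+next_noisy_images = next_signal_rates * pred_images + next_noise_rates * pred_noises
+```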
+""" + + +@keras.saving.register_keras_serializable() +class DiffusionModel(keras.Model): + def __init__(self, image_size, widths, block_depth): + super().__init__() + + self.normalizer = layers.Normalization() + self.network = get_network(image_size, widths, block_depth) + self.ema_network = keras.models.clone_model(self.network) + + def compile(self, **kwargs): + super().compile(**kwargs) + + self.noise_loss_tracker = keras.metrics.Mean(name="n_loss") + self.image_loss_tracker = keras.metrics.Mean(name="i_loss") + self.kid = KID(name="kid") + + @property + def metrics(self): + return [self.noise_loss_tracker, self.image_loss_tracker, self.kid] + + def denormalize(self, images): + # convert the pixel values back to 0-1 range + images = self.normalizer.mean + images * self.normalizer.variance**0.5 + return ops.clip(images, 0.0, 1.0) + + def diffusion_schedule(self, diffusion_times): + # diffusion times -> angles + start_angle = ops.cast(ops.arccos(max_signal_rate), "float32") + end_angle = ops.cast(ops.arccos(min_signal_rate), "float32") + + diffusion_angles = start_angle + diffusion_times * (end_angle - start_angle) + + # angles -> signal and noise rates + signal_rates = ops.cos(diffusion_angles) + noise_rates = ops.sin(diffusion_angles) + # note that their squared sum is always: sin^2(x) + cos^2(x) = 1 + + return noise_rates, signal_rates + + def denoise(self, noisy_images, noise_rates, signal_rates, training): + # the exponential moving average weights are used at evaluation + if training: + network = self.network + else: + network = self.ema_network + + # predict noise component and calculate the image component using it + pred_noises = network([noisy_images, noise_rates**2], training=training) + pred_images = (noisy_images - noise_rates * pred_noises) / signal_rates + + return pred_noises, pred_images + + def reverse_diffusion(self, initial_noise, diffusion_steps): + # reverse diffusion = sampling + num_images = initial_noise.shape[0] + step_size = 1.0 / diffusion_steps + + # important line: + # at the first sampling step, the "noisy image" is pure noise + # but its signal rate is assumed to be nonzero (min_signal_rate) + next_noisy_images = initial_noise + for step in range(diffusion_steps): + noisy_images = next_noisy_images + + # separate the current noisy image to its components + diffusion_times = ops.ones((num_images, 1, 1, 1)) - step * step_size + noise_rates, signal_rates = self.diffusion_schedule(diffusion_times) + pred_noises, pred_images = self.denoise( + noisy_images, noise_rates, signal_rates, training=False + ) + # network used in eval mode + + # remix the predicted components using the next signal and noise rates + next_diffusion_times = diffusion_times - step_size + next_noise_rates, next_signal_rates = self.diffusion_schedule( + next_diffusion_times + ) + next_noisy_images = ( + next_signal_rates * pred_images + next_noise_rates * pred_noises + ) + # this new noisy image will be used in the next step + + return pred_images + + def generate(self, num_images, diffusion_steps): + # noise -> images -> denormalized images + initial_noise = keras.random.normal( + shape=(num_images, image_size, image_size, 3) + ) + generated_images = self.reverse_diffusion(initial_noise, diffusion_steps) + generated_images = self.denormalize(generated_images) + return generated_images + + def train_step(self, images): + # normalize images to have standard deviation of 1, like the noises + images = self.normalizer(images, training=True) + noises = keras.random.normal(shape=(batch_size, 
image_size, image_size, 3)) + + # sample uniform random diffusion times + diffusion_times = keras.random.uniform( + shape=(batch_size, 1, 1, 1), minval=0.0, maxval=1.0 + ) + noise_rates, signal_rates = self.diffusion_schedule(diffusion_times) + # mix the images with noises accordingly + noisy_images = signal_rates * images + noise_rates * noises + + with tf.GradientTape() as tape: + # train the network to separate noisy images to their components + pred_noises, pred_images = self.denoise( + noisy_images, noise_rates, signal_rates, training=True + ) + + noise_loss = self.loss(noises, pred_noises) # used for training + image_loss = self.loss(images, pred_images) # only used as metric + + gradients = tape.gradient(noise_loss, self.network.trainable_weights) + self.optimizer.apply_gradients(zip(gradients, self.network.trainable_weights)) + + self.noise_loss_tracker.update_state(noise_loss) + self.image_loss_tracker.update_state(image_loss) + + # track the exponential moving averages of weights + for weight, ema_weight in zip(self.network.weights, self.ema_network.weights): + ema_weight.assign(ema * ema_weight + (1 - ema) * weight) + + # KID is not measured during the training phase for computational efficiency + return {m.name: m.result() for m in self.metrics[:-1]} + + def test_step(self, images): + # normalize images to have standard deviation of 1, like the noises + images = self.normalizer(images, training=False) + noises = keras.random.normal(shape=(batch_size, image_size, image_size, 3)) + + # sample uniform random diffusion times + diffusion_times = keras.random.uniform( + shape=(batch_size, 1, 1, 1), minval=0.0, maxval=1.0 + ) + noise_rates, signal_rates = self.diffusion_schedule(diffusion_times) + # mix the images with noises accordingly + noisy_images = signal_rates * images + noise_rates * noises + + # use the network to separate noisy images to their components + pred_noises, pred_images = self.denoise( + noisy_images, noise_rates, signal_rates, training=False + ) + + noise_loss = self.loss(noises, pred_noises) + image_loss = self.loss(images, pred_images) + + self.image_loss_tracker.update_state(image_loss) + self.noise_loss_tracker.update_state(noise_loss) + + # measure KID between real and generated images + # this is computationally demanding, kid_diffusion_steps has to be small + images = self.denormalize(images) + generated_images = self.generate( + num_images=batch_size, diffusion_steps=kid_diffusion_steps + ) + self.kid.update_state(images, generated_images) + + return {m.name: m.result() for m in self.metrics} + + def plot_images(self, epoch=None, logs=None, num_rows=3, num_cols=6): + # plot random generated images for visual evaluation of generation quality + generated_images = self.generate( + num_images=num_rows * num_cols, + diffusion_steps=plot_diffusion_steps, + ) + + plt.figure(figsize=(num_cols * 2.0, num_rows * 2.0)) + for row in range(num_rows): + for col in range(num_cols): + index = row * num_cols + col + plt.subplot(num_rows, num_cols, index + 1) + plt.imshow(generated_images[index]) + plt.axis("off") + plt.tight_layout() + plt.show() + plt.close() + + +""" +## Training +""" + +# create and compile the model +model = DiffusionModel(image_size, widths, block_depth) +# below tensorflow 2.9: +# pip install tensorflow_addons +# import tensorflow_addons as tfa +# optimizer=tfa.optimizers.AdamW +model.compile( + optimizer=keras.optimizers.AdamW( + learning_rate=learning_rate, weight_decay=weight_decay + ), + loss=keras.losses.mean_absolute_error, +) +# pixelwise 
mean absolute error is used as loss + +# save the best model based on the validation KID metric +checkpoint_path = "checkpoints/diffusion_model.weights.h5" +checkpoint_callback = keras.callbacks.ModelCheckpoint( + filepath=checkpoint_path, + save_weights_only=True, + monitor="val_kid", + mode="min", + save_best_only=True, +) + +# calculate mean and variance of training dataset for normalization +model.normalizer.adapt(train_dataset) + +# run training and plot generated images periodically +model.fit( + train_dataset, + epochs=num_epochs, + validation_data=val_dataset, + callbacks=[ + keras.callbacks.LambdaCallback(on_epoch_end=model.plot_images), + checkpoint_callback, + ], +) + +""" +## Inference +""" + +# load the best model and generate images +model.load_weights(checkpoint_path) +model.plot_images() + +""" +## Results + +By running the training for at least 50 epochs (takes 2 hours on a T4 GPU and 30 minutes +on an A100 GPU), one can get high quality image generations using this code example. + +The evolution of a batch of images over a 80 epoch training (color artifacts are due to +GIF compression): + +![flowers training gif](https://i.imgur.com/FSCKtZq.gif) + +Images generated using between 1 and 20 sampling steps from the same initial noise: + +![flowers sampling steps gif](https://i.imgur.com/tM5LyH3.gif) + +Interpolation (spherical) between initial noise samples: + +![flowers interpolation gif](https://i.imgur.com/hk5Hd5o.gif) + +Deterministic sampling process (noisy images on top, predicted images on bottom, 40 +steps): + +![flowers deterministic generation gif](https://i.imgur.com/wCvzynh.gif) + +Stochastic sampling process (noisy images on top, predicted images on bottom, 80 steps): + +![flowers stochastic generation gif](https://i.imgur.com/kRXOGzd.gif) + +""" + +""" +## Lessons learned + +During preparation for this code example I have run numerous experiments using +[this repository](https://github.com/beresandras/clear-diffusion-keras). +In this section I list +the lessons learned and my recommendations in my subjective order of importance. + +### Algorithmic tips + +* **min. and max. signal rates**: I found the min. signal rate to be an important +hyperparameter. Setting it too low will make the generated images oversaturated, while +setting it too high will make them undersaturated. I recommend tuning it carefully. Also, +setting it to 0 will lead to a division by zero error. The max. signal rate can be set to +1, but I found that setting it lower slightly improves generation quality. +* **loss function**: While large models tend to use mean squared error (MSE) loss, I +recommend using mean absolute error (MAE) on this dataset. In my experience MSE loss +generates more diverse samples (it also seems to hallucinate more +[Section 3](https://arxiv.org/abs/2111.05826)), while MAE loss leads to smoother images. +I recommend trying both. +* **weight decay**: I did occasionally run into diverged trainings when scaling up the +model, and found that weight decay helps in avoiding instabilities at a low performance +cost. This is why I use +[AdamW](https://www.tensorflow.org/api_docs/python/tf/keras/optimizers/experimental/AdamW) +instead of [Adam](https://keras.io/api/optimizers/adam/) in this example. +* **exponential moving average of weights**: This helps to reduce the variance of the KID +metric, and helps in averaging out short-term changes during training. 
+* **image augmentations**: Though I did not use image augmentations in this example, in +my experience adding horizontal flips to the training increases generation performance, +while random crops do not. Since we use a supervised denoising loss, overfitting can be +an issue, so image augmentations might be important on small datasets. One should also be +careful not to use +[leaky augmentations](https://keras.io/examples/generative/gan_ada/#invertible-data-augmentation), +which can be done following +[this method (end of Section 5)](https://arxiv.org/abs/2206.00364) for instance. +* **data normalization**: In the literature the pixel values of images are usually +converted to the -1 to 1 range. For theoretical correctness, I normalize the images to +have zero mean and unit variance instead, exactly like the random noises. +* **noise level input**: I chose to input the noise variance to the network, as it is +symmetrical under our sampling schedule. One could also input the noise rate (similar +performance), the signal rate (lower performance), or even the +[log-signal-to-noise ratio (Appendix B.1)](https://arxiv.org/abs/2107.00630) +(did not try, as its range is highly +dependent on the min. and max. signal rates, and would require adjusting the min. +embedding frequency accordingly). +* **gradient clipping**: Using global gradient clipping with a value of 1 can help with +training stability for large models, but decreased performance significantly in my +experience. +* **residual connection downscaling**: For +[deeper models (Appendix B)](https://arxiv.org/abs/2205.11487), scaling the residual +connections with 1/sqrt(2) can be helpful, but did not help in my case. +* **learning rate**: For me, [Adam optimizer's](https://keras.io/api/optimizers/adam/) +default learning rate of 1e-3 worked very well, but lower learning rates are more common +in the [literature (Tables 11-13)](https://arxiv.org/abs/2105.05233). + +### Architectural tips + +* **sinusoidal embedding**: Using sinusoidal embeddings on the noise level input of the +network is crucial for good performance. I recommend setting the min. embedding frequency +to the reciprocal of the range of this input, and since we use the noise variance in this +example, it can be left always at 1. The max. embedding frequency controls the smallest +change in the noise variance that the network will be sensitive to, and the embedding +dimensions set the number of frequency components in the embedding. In my experience the +performance is not too sensitive to these values. +* **skip connections**: Using skip connections in the network architecture is absolutely +critical, without them the model will fail to learn to denoise at a good performance. +* **residual connections**: In my experience residual connections also significantly +improve performance, but this might be due to the fact that we only input the noise +level embeddings to the first layer of the network instead of to all of them. +* **normalization**: When scaling up the model, I did occasionally encounter diverged +trainings, using normalization layers helped to mitigate this issue. 
In the literature it +is common to use +[GroupNormalization](https://www.tensorflow.org/addons/api_docs/python/tfa/layers/GroupNormalization) +(with 8 groups for example) or +[LayerNormalization](https://keras.io/api/layers/normalization_layers/layer_normalization/) +in the network, I however chose to use +[BatchNormalization](https://keras.io/api/layers/normalization_layers/batch_normalization/), +as it gave similar benefits in my experiments but was computationally lighter. +* **activations**: The choice of activation functions had a larger effect on generation +quality than I expected. In my experiments using non-monotonic activation functions +outperformed monotonic ones (such as +[ReLU](https://www.tensorflow.org/api_docs/python/tf/keras/activations/relu)), with +[Swish](https://www.tensorflow.org/api_docs/python/tf/keras/activations/swish) performing +the best (this is also what [Imagen uses, page 41](https://arxiv.org/abs/2205.11487)). +* **attention**: As mentioned earlier, it is common in the literature to use +[attention layers](https://keras.io/api/layers/attention_layers/multi_head_attention/) at low +resolutions for better global coherence. I omitted them for simplicity. +* **upsampling**: +[Bilinear and nearest neighbour upsampling](https://keras.io/api/layers/reshaping_layers/up_sampling2d/) +in the network performed similarly, however I did not try +[transposed convolutions](https://keras.io/api/layers/convolution_layers/convolution2d_transpose/). + +For a similar list about GANs check out +[this Keras tutorial](https://keras.io/examples/generative/gan_ada/#gan-tips-and-tricks). +""" + +""" +## What to try next? + +If you would like to dive in deeper to the topic, I recommend checking out +[this repository](https://github.com/beresandras/clear-diffusion-keras) that I created in +preparation for this code example, which implements a wider range of features in a +similar style, such as: + +* stochastic sampling +* second-order sampling based on the +[differential equation view of DDIMs (Equation 13)](https://arxiv.org/abs/2010.02502) +* more diffusion schedules +* more network output types: predicting image or +[velocity (Appendix D)](https://arxiv.org/abs/2202.00512) instead of noise +* more datasets +""" + +""" +## Related works + +* [Score-based generative modeling](https://yang-song.github.io/blog/2021/score/) +(blogpost) +* [What are diffusion models?](https://lilianweng.github.io/posts/2021-07-11-diffusion-models/) +(blogpost) +* [Annotated diffusion model](https://huggingface.co/blog/annotated-diffusion) (blogpost) +* [CVPR 2022 tutorial on diffusion models](https://cvpr2022-tutorial-diffusion-models.github.io/) +(slides available) +* [Elucidating the Design Space of Diffusion-Based Generative Models](https://arxiv.org/abs/2206.00364): +attempts unifying diffusion methods under a common framework +* High-level video overviews: [1](https://www.youtube.com/watch?v=yTAMrHVG1ew), +[2](https://www.youtube.com/watch?v=344w5h24-h8) +* Detailed technical videos: [1](https://www.youtube.com/watch?v=fbLgFrlTnGU), +[2](https://www.youtube.com/watch?v=W-O7AZNzbzQ) +* Score-based generative models: [NCSN](https://arxiv.org/abs/1907.05600), +[NCSN+](https://arxiv.org/abs/2006.09011), [NCSN++](https://arxiv.org/abs/2011.13456) +* Denoising diffusion models: [DDPM](https://arxiv.org/abs/2006.11239), +[DDIM](https://arxiv.org/abs/2010.02502), [DDPM+](https://arxiv.org/abs/2102.09672), +[DDPM++](https://arxiv.org/abs/2105.05233) +* Large diffusion models: 
[GLIDE](https://arxiv.org/abs/2112.10741), +[DALL-E 2](https://arxiv.org/abs/2204.06125/), [Imagen](https://arxiv.org/abs/2205.11487) + + +""" diff --git a/knowledge_base/generative/ddpm.py b/knowledge_base/generative/ddpm.py new file mode 100644 index 0000000000000000000000000000000000000000..baae14ee53cb5913687b3db01ec21c82bda41fe7 --- /dev/null +++ b/knowledge_base/generative/ddpm.py @@ -0,0 +1,815 @@ +""" +Title: Denoising Diffusion Probabilistic Model +Author: [A_K_Nain](https://twitter.com/A_K_Nain) +Date created: 2022/11/30 +Last modified: 2022/12/07 +Description: Generating images of flowers with denoising diffusion probabilistic models. +""" + +""" +## Introduction + +Generative modeling experienced tremendous growth in the last five years. Models like +VAEs, GANs, and flow-based models proved to be a great success in generating +high-quality content, especially images. Diffusion models are a new type of generative +model that has proven to be better than previous approaches. + +Diffusion models are inspired by non-equilibrium thermodynamics, and they learn to +generate by denoising. Learning by denoising consists of two processes, +each of which is a Markov Chain. These are: + +1. The forward process: In the forward process, we slowly add random noise to the data +in a series of time steps `(t1, t2, ..., tn )`. Samples at the current time step are +drawn from a Gaussian distribution where the mean of the distribution is conditioned +on the sample at the previous time step, and the variance of the distribution follows +a fixed schedule. At the end of the forward process, the samples end up with a pure +noise distribution. + +2. The reverse process: During the reverse process, we try to undo the added noise at +every time step. We start with the pure noise distribution (the last step of the +forward process) and try to denoise the samples in the backward direction +`(tn, tn-1, ..., t1)`. + +We implement the [Denoising Diffusion Probabilistic Models](https://arxiv.org/abs/2006.11239) +paper or DDPMs for short in this code example. It was the first paper demonstrating +the use of diffusion models for generating high-quality images. The authors proved +that a certain parameterization of diffusion models reveals an equivalence with +denoising score matching over multiple noise levels during training and with annealed +Langevin dynamics during sampling that generates the best quality results. + +This paper replicates both the Markov chains (forward process and reverse process) +involved in the diffusion process but for images. The forward process is fixed and +gradually adds Gaussian noise to the images according to a fixed variance schedule +denoted by beta in the paper. This is what the diffusion process looks like in case +of images: (image -> noise::noise -> image) + +![diffusion process gif](https://imgur.com/Yn7tho9.gif) + + +The paper describes two algorithms, one for training the model, and the other for +sampling from the trained model. Training is performed by optimizing the usual +variational bound on negative log-likelihood. The objective function is further +simplified, and the network is treated as a noise prediction network. Once optimized, +we can sample from the network to generate new images from noise samples. Here is an +overview of both algorithms as presented in the paper: + +![ddpms](https://i.imgur.com/S7KH5hZ.png) + + +**Note:** DDPM is just one way of implementing a diffusion model. Also, the sampling +algorithm in the DDPM replicates the complete Markov chain. 
Hence, it's slow in +generating new samples compared to other generative models like GANs. Lots of research +efforts have been made to address this issue. One such example is Denoising Diffusion +Implicit Models, or DDIM for short, where the authors replaced the Markov chain with a +non-Markovian process to sample faster. You can find the code example for DDIM +[here](https://keras.io/examples/generative/ddim/) + +Implementing a DDPM model is simple. We define a model that takes +two inputs: Images and the randomly sampled time steps. At each training step, we +perform the following operations to train our model: + +1. Sample random noise to be added to the inputs. +2. Apply the forward process to diffuse the inputs with the sampled noise. +3. Your model takes these noisy samples as inputs and outputs the noise +prediction for each time step. +4. Given true noise and predicted noise, we calculate the loss values +5. We then calculate the gradients and update the model weights. + +Given that our model knows how to denoise a noisy sample at a given time step, +we can leverage this idea to generate new samples, starting from a pure noise +distribution. +""" + +""" +## Setup +""" + +import math +import numpy as np +import matplotlib.pyplot as plt + +# Requires TensorFlow >=2.11 for the GroupNormalization layer. +import tensorflow as tf +from tensorflow import keras +from tensorflow.keras import layers +import tensorflow_datasets as tfds + +""" +## Hyperparameters +""" + +batch_size = 32 +num_epochs = 1 # Just for the sake of demonstration +total_timesteps = 1000 +norm_groups = 8 # Number of groups used in GroupNormalization layer +learning_rate = 2e-4 + +img_size = 64 +img_channels = 3 +clip_min = -1.0 +clip_max = 1.0 + +first_conv_channels = 64 +channel_multiplier = [1, 2, 4, 8] +widths = [first_conv_channels * mult for mult in channel_multiplier] +has_attention = [False, False, True, True] +num_res_blocks = 2 # Number of residual blocks + +dataset_name = "oxford_flowers102" +splits = ["train"] + + +""" +## Dataset + +We use the [Oxford Flowers 102](https://www.tensorflow.org/datasets/catalog/oxford_flowers102) +dataset for generating images of flowers. In terms of preprocessing, we use center +cropping for resizing the images to the desired image size, and we rescale the pixel +values in the range `[-1.0, 1.0]`. This is in line with the range of the pixel values that +was applied by the authors of the [DDPMs paper](https://arxiv.org/abs/2006.11239). For +augmenting training data, we randomly flip the images left/right. +""" + + +# Load the dataset +(ds,) = tfds.load(dataset_name, split=splits, with_info=False, shuffle_files=True) + + +def augment(img): + """Flips an image left/right randomly.""" + return tf.image.random_flip_left_right(img) + + +def resize_and_rescale(img, size): + """Resize the image to the desired size first and then + rescale the pixel values in the range [-1.0, 1.0]. 
+ + Args: + img: Image tensor + size: Desired image size for resizing + Returns: + Resized and rescaled image tensor + """ + + height = tf.shape(img)[0] + width = tf.shape(img)[1] + crop_size = tf.minimum(height, width) + + img = tf.image.crop_to_bounding_box( + img, + (height - crop_size) // 2, + (width - crop_size) // 2, + crop_size, + crop_size, + ) + + # Resize + img = tf.cast(img, dtype=tf.float32) + img = tf.image.resize(img, size=size, antialias=True) + + # Rescale the pixel values + img = img / 127.5 - 1.0 + img = tf.clip_by_value(img, clip_min, clip_max) + return img + + +def train_preprocessing(x): + img = x["image"] + img = resize_and_rescale(img, size=(img_size, img_size)) + img = augment(img) + return img + + +train_ds = ( + ds.map(train_preprocessing, num_parallel_calls=tf.data.AUTOTUNE) + .batch(batch_size, drop_remainder=True) + .shuffle(batch_size * 2) + .prefetch(tf.data.AUTOTUNE) +) + + +""" +## Gaussian diffusion utilities + +We define the forward process and the reverse process +as a separate utility. Most of the code in this utility has been borrowed +from the original implementation with some slight modifications. +""" + + +class GaussianDiffusion: + """Gaussian diffusion utility. + + Args: + beta_start: Start value of the scheduled variance + beta_end: End value of the scheduled variance + timesteps: Number of time steps in the forward process + """ + + def __init__( + self, + beta_start=1e-4, + beta_end=0.02, + timesteps=1000, + clip_min=-1.0, + clip_max=1.0, + ): + self.beta_start = beta_start + self.beta_end = beta_end + self.timesteps = timesteps + self.clip_min = clip_min + self.clip_max = clip_max + + # Define the linear variance schedule + self.betas = betas = np.linspace( + beta_start, + beta_end, + timesteps, + dtype=np.float64, # Using float64 for better precision + ) + self.num_timesteps = int(timesteps) + + alphas = 1.0 - betas + alphas_cumprod = np.cumprod(alphas, axis=0) + alphas_cumprod_prev = np.append(1.0, alphas_cumprod[:-1]) + + self.betas = tf.constant(betas, dtype=tf.float32) + self.alphas_cumprod = tf.constant(alphas_cumprod, dtype=tf.float32) + self.alphas_cumprod_prev = tf.constant(alphas_cumprod_prev, dtype=tf.float32) + + # Calculations for diffusion q(x_t | x_{t-1}) and others + self.sqrt_alphas_cumprod = tf.constant( + np.sqrt(alphas_cumprod), dtype=tf.float32 + ) + + self.sqrt_one_minus_alphas_cumprod = tf.constant( + np.sqrt(1.0 - alphas_cumprod), dtype=tf.float32 + ) + + self.log_one_minus_alphas_cumprod = tf.constant( + np.log(1.0 - alphas_cumprod), dtype=tf.float32 + ) + + self.sqrt_recip_alphas_cumprod = tf.constant( + np.sqrt(1.0 / alphas_cumprod), dtype=tf.float32 + ) + self.sqrt_recipm1_alphas_cumprod = tf.constant( + np.sqrt(1.0 / alphas_cumprod - 1), dtype=tf.float32 + ) + + # Calculations for posterior q(x_{t-1} | x_t, x_0) + posterior_variance = ( + betas * (1.0 - alphas_cumprod_prev) / (1.0 - alphas_cumprod) + ) + self.posterior_variance = tf.constant(posterior_variance, dtype=tf.float32) + + # Log calculation clipped because the posterior variance is 0 at the beginning + # of the diffusion chain + self.posterior_log_variance_clipped = tf.constant( + np.log(np.maximum(posterior_variance, 1e-20)), dtype=tf.float32 + ) + + self.posterior_mean_coef1 = tf.constant( + betas * np.sqrt(alphas_cumprod_prev) / (1.0 - alphas_cumprod), + dtype=tf.float32, + ) + + self.posterior_mean_coef2 = tf.constant( + (1.0 - alphas_cumprod_prev) * np.sqrt(alphas) / (1.0 - alphas_cumprod), + dtype=tf.float32, + ) + + def _extract(self, a, t, 
x_shape): + """Extract some coefficients at specified timesteps, + then reshape to [batch_size, 1, 1, 1, 1, ...] for broadcasting purposes. + + Args: + a: Tensor to extract from + t: Timestep for which the coefficients are to be extracted + x_shape: Shape of the current batched samples + """ + batch_size = x_shape[0] + out = tf.gather(a, t) + return tf.reshape(out, [batch_size, 1, 1, 1]) + + def q_mean_variance(self, x_start, t): + """Extracts the mean, and the variance at current timestep. + + Args: + x_start: Initial sample (before the first diffusion step) + t: Current timestep + """ + x_start_shape = tf.shape(x_start) + mean = self._extract(self.sqrt_alphas_cumprod, t, x_start_shape) * x_start + variance = self._extract(1.0 - self.alphas_cumprod, t, x_start_shape) + log_variance = self._extract( + self.log_one_minus_alphas_cumprod, t, x_start_shape + ) + return mean, variance, log_variance + + def q_sample(self, x_start, t, noise): + """Diffuse the data. + + Args: + x_start: Initial sample (before the first diffusion step) + t: Current timestep + noise: Gaussian noise to be added at the current timestep + Returns: + Diffused samples at timestep `t` + """ + x_start_shape = tf.shape(x_start) + return ( + self._extract(self.sqrt_alphas_cumprod, t, tf.shape(x_start)) * x_start + + self._extract(self.sqrt_one_minus_alphas_cumprod, t, x_start_shape) + * noise + ) + + def predict_start_from_noise(self, x_t, t, noise): + x_t_shape = tf.shape(x_t) + return ( + self._extract(self.sqrt_recip_alphas_cumprod, t, x_t_shape) * x_t + - self._extract(self.sqrt_recipm1_alphas_cumprod, t, x_t_shape) * noise + ) + + def q_posterior(self, x_start, x_t, t): + """Compute the mean and variance of the diffusion + posterior q(x_{t-1} | x_t, x_0). + + Args: + x_start: Stating point(sample) for the posterior computation + x_t: Sample at timestep `t` + t: Current timestep + Returns: + Posterior mean and variance at current timestep + """ + + x_t_shape = tf.shape(x_t) + posterior_mean = ( + self._extract(self.posterior_mean_coef1, t, x_t_shape) * x_start + + self._extract(self.posterior_mean_coef2, t, x_t_shape) * x_t + ) + posterior_variance = self._extract(self.posterior_variance, t, x_t_shape) + posterior_log_variance_clipped = self._extract( + self.posterior_log_variance_clipped, t, x_t_shape + ) + return posterior_mean, posterior_variance, posterior_log_variance_clipped + + def p_mean_variance(self, pred_noise, x, t, clip_denoised=True): + x_recon = self.predict_start_from_noise(x, t=t, noise=pred_noise) + if clip_denoised: + x_recon = tf.clip_by_value(x_recon, self.clip_min, self.clip_max) + + model_mean, posterior_variance, posterior_log_variance = self.q_posterior( + x_start=x_recon, x_t=x, t=t + ) + return model_mean, posterior_variance, posterior_log_variance + + def p_sample(self, pred_noise, x, t, clip_denoised=True): + """Sample from the diffusion model. + + Args: + pred_noise: Noise predicted by the diffusion model + x: Samples at a given timestep for which the noise was predicted + t: Current timestep + clip_denoised (bool): Whether to clip the predicted noise + within the specified range or not. 
+ """ + model_mean, _, model_log_variance = self.p_mean_variance( + pred_noise, x=x, t=t, clip_denoised=clip_denoised + ) + noise = tf.random.normal(shape=x.shape, dtype=x.dtype) + # No noise when t == 0 + nonzero_mask = tf.reshape( + 1 - tf.cast(tf.equal(t, 0), tf.float32), [tf.shape(x)[0], 1, 1, 1] + ) + return model_mean + nonzero_mask * tf.exp(0.5 * model_log_variance) * noise + + +""" +## Network architecture + +U-Net, originally developed for semantic segmentation, is an architecture that is +widely used for implementing diffusion models but with some slight modifications: + +1. The network accepts two inputs: Image and time step +2. Self-attention between the convolution blocks once we reach a specific resolution +(16x16 in the paper) +3. Group Normalization instead of weight normalization + +We implement most of the things as used in the original paper. We use the +`swish` activation function throughout the network. We use the variance scaling +kernel initializer. + +The only difference here is the number of groups used for the +`GroupNormalization` layer. For the flowers dataset, +we found that a value of `groups=8` produces better results +compared to the default value of `groups=32`. Dropout is optional and should be +used where chances of over fitting is high. In the paper, the authors used dropout +only when training on CIFAR10. +""" + + +# Kernel initializer to use +def kernel_init(scale): + scale = max(scale, 1e-10) + return keras.initializers.VarianceScaling( + scale, mode="fan_avg", distribution="uniform" + ) + + +class AttentionBlock(layers.Layer): + """Applies self-attention. + + Args: + units: Number of units in the dense layers + groups: Number of groups to be used for GroupNormalization layer + """ + + def __init__(self, units, groups=8, **kwargs): + self.units = units + self.groups = groups + super().__init__(**kwargs) + + self.norm = layers.GroupNormalization(groups=groups) + self.query = layers.Dense(units, kernel_initializer=kernel_init(1.0)) + self.key = layers.Dense(units, kernel_initializer=kernel_init(1.0)) + self.value = layers.Dense(units, kernel_initializer=kernel_init(1.0)) + self.proj = layers.Dense(units, kernel_initializer=kernel_init(0.0)) + + def call(self, inputs): + batch_size = tf.shape(inputs)[0] + height = tf.shape(inputs)[1] + width = tf.shape(inputs)[2] + scale = tf.cast(self.units, tf.float32) ** (-0.5) + + inputs = self.norm(inputs) + q = self.query(inputs) + k = self.key(inputs) + v = self.value(inputs) + + attn_score = tf.einsum("bhwc, bHWc->bhwHW", q, k) * scale + attn_score = tf.reshape(attn_score, [batch_size, height, width, height * width]) + + attn_score = tf.nn.softmax(attn_score, -1) + attn_score = tf.reshape(attn_score, [batch_size, height, width, height, width]) + + proj = tf.einsum("bhwHW,bHWc->bhwc", attn_score, v) + proj = self.proj(proj) + return inputs + proj + + +class TimeEmbedding(layers.Layer): + def __init__(self, dim, **kwargs): + super().__init__(**kwargs) + self.dim = dim + self.half_dim = dim // 2 + self.emb = math.log(10000) / (self.half_dim - 1) + self.emb = tf.exp(tf.range(self.half_dim, dtype=tf.float32) * -self.emb) + + def call(self, inputs): + inputs = tf.cast(inputs, dtype=tf.float32) + emb = inputs[:, None] * self.emb[None, :] + emb = tf.concat([tf.sin(emb), tf.cos(emb)], axis=-1) + return emb + + +def ResidualBlock(width, groups=8, activation_fn=keras.activations.swish): + def apply(inputs): + x, t = inputs + input_width = x.shape[3] + + if input_width == width: + residual = x + else: + residual = 
layers.Conv2D( + width, kernel_size=1, kernel_initializer=kernel_init(1.0) + )(x) + + temb = activation_fn(t) + temb = layers.Dense(width, kernel_initializer=kernel_init(1.0))(temb)[ + :, None, None, : + ] + + x = layers.GroupNormalization(groups=groups)(x) + x = activation_fn(x) + x = layers.Conv2D( + width, kernel_size=3, padding="same", kernel_initializer=kernel_init(1.0) + )(x) + + x = layers.Add()([x, temb]) + x = layers.GroupNormalization(groups=groups)(x) + x = activation_fn(x) + + x = layers.Conv2D( + width, kernel_size=3, padding="same", kernel_initializer=kernel_init(0.0) + )(x) + x = layers.Add()([x, residual]) + return x + + return apply + + +def DownSample(width): + def apply(x): + x = layers.Conv2D( + width, + kernel_size=3, + strides=2, + padding="same", + kernel_initializer=kernel_init(1.0), + )(x) + return x + + return apply + + +def UpSample(width, interpolation="nearest"): + def apply(x): + x = layers.UpSampling2D(size=2, interpolation=interpolation)(x) + x = layers.Conv2D( + width, kernel_size=3, padding="same", kernel_initializer=kernel_init(1.0) + )(x) + return x + + return apply + + +def TimeMLP(units, activation_fn=keras.activations.swish): + def apply(inputs): + temb = layers.Dense( + units, activation=activation_fn, kernel_initializer=kernel_init(1.0) + )(inputs) + temb = layers.Dense(units, kernel_initializer=kernel_init(1.0))(temb) + return temb + + return apply + + +def build_model( + img_size, + img_channels, + widths, + has_attention, + num_res_blocks=2, + norm_groups=8, + interpolation="nearest", + activation_fn=keras.activations.swish, +): + image_input = layers.Input( + shape=(img_size, img_size, img_channels), name="image_input" + ) + time_input = keras.Input(shape=(), dtype=tf.int64, name="time_input") + + x = layers.Conv2D( + first_conv_channels, + kernel_size=(3, 3), + padding="same", + kernel_initializer=kernel_init(1.0), + )(image_input) + + temb = TimeEmbedding(dim=first_conv_channels * 4)(time_input) + temb = TimeMLP(units=first_conv_channels * 4, activation_fn=activation_fn)(temb) + + skips = [x] + + # DownBlock + for i in range(len(widths)): + for _ in range(num_res_blocks): + x = ResidualBlock( + widths[i], groups=norm_groups, activation_fn=activation_fn + )([x, temb]) + if has_attention[i]: + x = AttentionBlock(widths[i], groups=norm_groups)(x) + skips.append(x) + + if widths[i] != widths[-1]: + x = DownSample(widths[i])(x) + skips.append(x) + + # MiddleBlock + x = ResidualBlock(widths[-1], groups=norm_groups, activation_fn=activation_fn)( + [x, temb] + ) + x = AttentionBlock(widths[-1], groups=norm_groups)(x) + x = ResidualBlock(widths[-1], groups=norm_groups, activation_fn=activation_fn)( + [x, temb] + ) + + # UpBlock + for i in reversed(range(len(widths))): + for _ in range(num_res_blocks + 1): + x = layers.Concatenate(axis=-1)([x, skips.pop()]) + x = ResidualBlock( + widths[i], groups=norm_groups, activation_fn=activation_fn + )([x, temb]) + if has_attention[i]: + x = AttentionBlock(widths[i], groups=norm_groups)(x) + + if i != 0: + x = UpSample(widths[i], interpolation=interpolation)(x) + + # End block + x = layers.GroupNormalization(groups=norm_groups)(x) + x = activation_fn(x) + x = layers.Conv2D(3, (3, 3), padding="same", kernel_initializer=kernel_init(0.0))(x) + return keras.Model([image_input, time_input], x, name="unet") + + +""" +## Training + +We follow the same setup for training the diffusion model as described +in the paper. We use `Adam` optimizer with a learning rate of `2e-4`. 
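For reference, the simplified training objective from the DDPM paper that the `train_step` below implements can be sketched (in our notation) as

$$L_{\text{simple}} = \mathbb{E}_{x_0,\, \epsilon,\, t}\Big[\big\lVert \epsilon - \epsilon_\theta\big(\sqrt{\bar{\alpha}_t}\, x_0 + \sqrt{1 - \bar{\alpha}_t}\,\epsilon,\; t\big)\big\rVert^2\Big],$$

where $\epsilon$ is the sampled Gaussian noise, $\epsilon_\theta$ is the noise predicted by the UNet, and $\bar{\alpha}_t$ corresponds to `alphas_cumprod` in the `GaussianDiffusion` utility.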
+We use EMA on model parameters with a decay factor of 0.999. We +treat our model as noise prediction network i.e. at every training step, we +input a batch of images and corresponding time steps to our UNet, +and the network outputs the noise as predictions. + +The only difference is that we aren't using the Kernel Inception Distance (KID) +or Frechet Inception Distance (FID) for evaluating the quality of generated +samples during training. This is because both these metrics are compute heavy +and are skipped for the brevity of implementation. + +**Note: ** We are using mean squared error as the loss function which is aligned with +the paper, and theoretically makes sense. In practice, though, it is also common to +use mean absolute error or Huber loss as the loss function. +""" + + +class DiffusionModel(keras.Model): + def __init__(self, network, ema_network, timesteps, gdf_util, ema=0.999): + super().__init__() + self.network = network + self.ema_network = ema_network + self.timesteps = timesteps + self.gdf_util = gdf_util + self.ema = ema + + def train_step(self, images): + # 1. Get the batch size + batch_size = tf.shape(images)[0] + + # 2. Sample timesteps uniformly + t = tf.random.uniform( + minval=0, maxval=self.timesteps, shape=(batch_size,), dtype=tf.int64 + ) + + with tf.GradientTape() as tape: + # 3. Sample random noise to be added to the images in the batch + noise = tf.random.normal(shape=tf.shape(images), dtype=images.dtype) + + # 4. Diffuse the images with noise + images_t = self.gdf_util.q_sample(images, t, noise) + + # 5. Pass the diffused images and time steps to the network + pred_noise = self.network([images_t, t], training=True) + + # 6. Calculate the loss + loss = self.loss(noise, pred_noise) + + # 7. Get the gradients + gradients = tape.gradient(loss, self.network.trainable_weights) + + # 8. Update the weights of the network + self.optimizer.apply_gradients(zip(gradients, self.network.trainable_weights)) + + # 9. Updates the weight values for the network with EMA weights + for weight, ema_weight in zip(self.network.weights, self.ema_network.weights): + ema_weight.assign(self.ema * ema_weight + (1 - self.ema) * weight) + + # 10. Return loss values + return {"loss": loss} + + def generate_images(self, num_images=16): + # 1. Randomly sample noise (starting point for reverse process) + samples = tf.random.normal( + shape=(num_images, img_size, img_size, img_channels), dtype=tf.float32 + ) + # 2. Sample from the model iteratively + for t in reversed(range(0, self.timesteps)): + tt = tf.cast(tf.fill(num_images, t), dtype=tf.int64) + pred_noise = self.ema_network.predict( + [samples, tt], verbose=0, batch_size=num_images + ) + samples = self.gdf_util.p_sample( + pred_noise, samples, tt, clip_denoised=True + ) + # 3. 
Return generated samples + return samples + + def plot_images( + self, epoch=None, logs=None, num_rows=2, num_cols=8, figsize=(12, 5) + ): + """Utility to plot images using the diffusion model during training.""" + generated_samples = self.generate_images(num_images=num_rows * num_cols) + generated_samples = ( + tf.clip_by_value(generated_samples * 127.5 + 127.5, 0.0, 255.0) + .numpy() + .astype(np.uint8) + ) + + _, ax = plt.subplots(num_rows, num_cols, figsize=figsize) + for i, image in enumerate(generated_samples): + if num_rows == 1: + ax[i].imshow(image) + ax[i].axis("off") + else: + ax[i // num_cols, i % num_cols].imshow(image) + ax[i // num_cols, i % num_cols].axis("off") + + plt.tight_layout() + plt.show() + + +# Build the unet model +network = build_model( + img_size=img_size, + img_channels=img_channels, + widths=widths, + has_attention=has_attention, + num_res_blocks=num_res_blocks, + norm_groups=norm_groups, + activation_fn=keras.activations.swish, +) +ema_network = build_model( + img_size=img_size, + img_channels=img_channels, + widths=widths, + has_attention=has_attention, + num_res_blocks=num_res_blocks, + norm_groups=norm_groups, + activation_fn=keras.activations.swish, +) +ema_network.set_weights(network.get_weights()) # Initially the weights are the same + +# Get an instance of the Gaussian Diffusion utilities +gdf_util = GaussianDiffusion(timesteps=total_timesteps) + +# Get the model +model = DiffusionModel( + network=network, + ema_network=ema_network, + gdf_util=gdf_util, + timesteps=total_timesteps, +) + +# Compile the model +model.compile( + loss=keras.losses.MeanSquaredError(), + optimizer=keras.optimizers.Adam(learning_rate=learning_rate), +) + +# Train the model +model.fit( + train_ds, + epochs=num_epochs, + batch_size=batch_size, + callbacks=[keras.callbacks.LambdaCallback(on_epoch_end=model.plot_images)], +) + +""" +## Results + +We trained this model for 800 epochs on a V100 GPU, +and each epoch took almost 8 seconds to finish. We load those weights +here, and we generate a few samples starting from pure noise. +""" + +"""shell +curl -LO https://github.com/AakashKumarNain/ddpms/releases/download/v3.0.0/checkpoints.zip +unzip -qq checkpoints.zip +""" + +# Load the model weights +model.ema_network.load_weights("checkpoints/diffusion_model_checkpoint") + +# Generate and plot some samples +model.plot_images(num_rows=4, num_cols=8) + + +""" +## Conclusion + +We successfully implemented and trained a diffusion model exactly in the same +fashion as implemented by the authors of the DDPMs paper. You can find the +original implementation [here](https://github.com/hojonathanho/diffusion). + +There are a few things that you can try to improve the model: + +1. Increasing the width of each block. A bigger model can learn to denoise +in fewer epochs, though you may have to take care of overfitting. + +2. We implemented the linear schedule for variance scheduling. You can implement +other schemes like cosine scheduling and compare the performance. +""" + +""" +## References + +1. [Denoising Diffusion Probabilistic Models](https://arxiv.org/abs/2006.11239) +2. [Author's implementation](https://github.com/hojonathanho/diffusion) +3. [A deep dive into DDPMs](https://magic-with-latents.github.io/latent/posts/ddpms/part3/) +4. [Denoising Diffusion Implicit Models](https://keras.io/examples/generative/ddim/) +5. [Annotated Diffusion Model](https://huggingface.co/blog/annotated-diffusion) +6. 
[AIAIART](https://www.youtube.com/watch?v=XTs7M6TSK9I&t=14s) +""" diff --git a/knowledge_base/generative/deep_dream.py b/knowledge_base/generative/deep_dream.py new file mode 100644 index 0000000000000000000000000000000000000000..f2694d0c12d6cfa02dbfca558dc4b283963db6da --- /dev/null +++ b/knowledge_base/generative/deep_dream.py @@ -0,0 +1,209 @@ +""" +Title: Deep Dream +Author: [fchollet](https://twitter.com/fchollet) +Date created: 2016/01/13 +Last modified: 2020/05/02 +Description: Generating Deep Dreams with Keras. +Accelerator: GPU +""" + +""" +## Introduction + +"Deep dream" is an image-filtering technique which consists of taking an image +classification model, and running gradient ascent over an input image to +try to maximize the activations of specific layers (and sometimes, specific units in +specific layers) for this input. It produces hallucination-like visuals. + +It was first introduced by Alexander Mordvintsev from Google in July 2015. + +Process: + +- Load the original image. +- Define a number of processing scales ("octaves"), +from smallest to largest. +- Resize the original image to the smallest scale. +- For every scale, starting with the smallest (i.e. current one): + - Run gradient ascent + - Upscale image to the next scale + - Reinject the detail that was lost at upscaling time +- Stop when we are back to the original size. +To obtain the detail lost during upscaling, we simply +take the original image, shrink it down, upscale it, +and compare the result to the (resized) original image. +""" + +""" +## Setup +""" +import os + +os.environ["KERAS_BACKEND"] = "tensorflow" + +import numpy as np +import tensorflow as tf +import keras +from keras.applications import inception_v3 + +base_image_path = keras.utils.get_file("sky.jpg", "https://i.imgur.com/aGBdQyK.jpg") +result_prefix = "sky_dream" + +# These are the names of the layers +# for which we try to maximize activation, +# as well as their weight in the final loss +# we try to maximize. +# You can tweak these setting to obtain new visual effects. +layer_settings = { + "mixed4": 1.0, + "mixed5": 1.5, + "mixed6": 2.0, + "mixed7": 2.5, +} + +# Playing with these hyperparameters will also allow you to achieve new effects +step = 0.01 # Gradient ascent step size +num_octave = 3 # Number of scales at which to run gradient ascent +octave_scale = 1.4 # Size ratio between scales +iterations = 20 # Number of ascent steps per scale +max_loss = 15.0 + +""" +This is our base image: +""" + +from IPython.display import Image, display + +display(Image(base_image_path)) + +""" +Let's set up some image preprocessing/deprocessing utilities: +""" + + +def preprocess_image(image_path): + # Util function to open, resize and format pictures + # into appropriate arrays. + img = keras.utils.load_img(image_path) + img = keras.utils.img_to_array(img) + img = np.expand_dims(img, axis=0) + img = inception_v3.preprocess_input(img) + return img + + +def deprocess_image(x): + # Util function to convert a NumPy array into a valid image. + x = x.reshape((x.shape[1], x.shape[2], 3)) + # Undo inception v3 preprocessing + x /= 2.0 + x += 0.5 + x *= 255.0 + # Convert to uint8 and clip to the valid range [0, 255] + x = np.clip(x, 0, 255).astype("uint8") + return x + + +""" +## Compute the Deep Dream loss + +First, build a feature extraction model to retrieve the activations of our target layers +given an input image. 
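If you want to experiment with different target layers, it helps to first see which "mixed" blocks InceptionV3 actually exposes. A minimal sketch (the `inspect_model` name is ours; this snippet is not part of the original script):

```py
from keras.applications import inception_v3

# Build a throwaway copy of the backbone just to list its block names.
inspect_model = inception_v3.InceptionV3(weights="imagenet", include_top=False)
print([layer.name for layer in inspect_model.layers if layer.name.startswith("mixed")])
```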
+""" + +# Build an InceptionV3 model loaded with pre-trained ImageNet weights +model = inception_v3.InceptionV3(weights="imagenet", include_top=False) + +# Get the symbolic outputs of each "key" layer (we gave them unique names). +outputs_dict = dict( + [ + (layer.name, layer.output) + for layer in [model.get_layer(name) for name in layer_settings.keys()] + ] +) + +# Set up a model that returns the activation values for every target layer +# (as a dict) +feature_extractor = keras.Model(inputs=model.inputs, outputs=outputs_dict) + +""" +The actual loss computation is very simple: +""" + + +def compute_loss(input_image): + features = feature_extractor(input_image) + # Initialize the loss + loss = tf.zeros(shape=()) + for name in features.keys(): + coeff = layer_settings[name] + activation = features[name] + # We avoid border artifacts by only involving non-border pixels in the loss. + scaling = tf.reduce_prod(tf.cast(tf.shape(activation), "float32")) + loss += coeff * tf.reduce_sum(tf.square(activation[:, 2:-2, 2:-2, :])) / scaling + return loss + + +""" +## Set up the gradient ascent loop for one octave +""" + + +@tf.function +def gradient_ascent_step(img, learning_rate): + with tf.GradientTape() as tape: + tape.watch(img) + loss = compute_loss(img) + # Compute gradients. + grads = tape.gradient(loss, img) + # Normalize gradients. + grads /= tf.maximum(tf.reduce_mean(tf.abs(grads)), 1e-6) + img += learning_rate * grads + return loss, img + + +def gradient_ascent_loop(img, iterations, learning_rate, max_loss=None): + for i in range(iterations): + loss, img = gradient_ascent_step(img, learning_rate) + if max_loss is not None and loss > max_loss: + break + print("... Loss value at step %d: %.2f" % (i, loss)) + return img + + +""" +## Run the training loop, iterating over different octaves +""" + +original_img = preprocess_image(base_image_path) +original_shape = original_img.shape[1:3] + +successive_shapes = [original_shape] +for i in range(1, num_octave): + shape = tuple([int(dim / (octave_scale**i)) for dim in original_shape]) + successive_shapes.append(shape) +successive_shapes = successive_shapes[::-1] +shrunk_original_img = tf.image.resize(original_img, successive_shapes[0]) + +img = tf.identity(original_img) # Make a copy +for i, shape in enumerate(successive_shapes): + print("Processing octave %d with shape %s" % (i, shape)) + img = tf.image.resize(img, shape) + img = gradient_ascent_loop( + img, iterations=iterations, learning_rate=step, max_loss=max_loss + ) + upscaled_shrunk_original_img = tf.image.resize(shrunk_original_img, shape) + same_size_original = tf.image.resize(original_img, shape) + lost_detail = same_size_original - upscaled_shrunk_original_img + + img += lost_detail + shrunk_original_img = tf.image.resize(original_img, shape) + +keras.utils.save_img(result_prefix + ".png", deprocess_image(img.numpy())) + +""" +Display the result. + +You can use the trained model hosted on [Hugging Face Hub](https://huggingface.co/keras-io/deep-dream) +and try the demo on [Hugging Face Spaces](https://huggingface.co/spaces/keras-io/deep-dream). 
+""" + +display(Image(result_prefix + ".png")) diff --git a/knowledge_base/generative/dreambooth.py b/knowledge_base/generative/dreambooth.py new file mode 100644 index 0000000000000000000000000000000000000000..e0e487a89bd67262e18765c7b3a8c2db34265e2a --- /dev/null +++ b/knowledge_base/generative/dreambooth.py @@ -0,0 +1,657 @@ +""" +Title: DreamBooth +Author: [Sayak Paul](https://twitter.com/RisingSayak), [Chansung Park](https://twitter.com/algo_diver) +Date created: 2023/02/01 +Last modified: 2023/02/05 +Description: Implementing DreamBooth. +Accelerator: GPU +""" + +""" +## Introduction + +In this example, we implement DreamBooth, a fine-tuning technique to teach new visual +concepts to text-conditioned Diffusion models with just 3 - 5 images. DreamBooth was +proposed in +[DreamBooth: Fine Tuning Text-to-Image Diffusion Models for Subject-Driven Generation](https://arxiv.org/abs/2208.12242) +by Ruiz et al. + +DreamBooth, in a sense, is similar to the +[traditional way of fine-tuning a text-conditioned Diffusion model except](https://keras.io/examples/generative/finetune_stable_diffusion/) +for a few gotchas. This example assumes that you have basic familiarity with +Diffusion models and how to fine-tune them. Here are some reference examples that might +help you to get familiarized quickly: + +* [High-performance image generation using Stable Diffusion in KerasCV](https://keras.io/guides/keras_cv/generate_images_with_stable_diffusion/) +* [Teach StableDiffusion new concepts via Textual Inversion](https://keras.io/examples/generative/fine_tune_via_textual_inversion/) +* [Fine-tuning Stable Diffusion](https://keras.io/examples/generative/finetune_stable_diffusion/) + +First, let's install the latest versions of KerasCV and TensorFlow. + +""" + +"""shell +pip install -q -U keras_cv==0.6.0 +pip install -q -U tensorflow +""" + +""" +If you're running the code, please ensure you're using a GPU with at least 24 GBs of +VRAM. +""" + +""" +## Initial imports +""" + +import math + +import keras_cv +import matplotlib.pyplot as plt +import numpy as np +import tensorflow as tf +from imutils import paths +from tensorflow import keras + +""" +## Usage of DreamBooth + +... is very versatile. By teaching Stable Diffusion about your favorite visual +concepts, you can + +* Recontextualize objects in interesting ways: + + ![](https://i.imgur.com/4Da9ozw.png) + +* Generate artistic renderings of the underlying visual concept: + + ![](https://i.imgur.com/nI2N8bI.png) + + +And many other applications. We welcome you to check out the original +[DreamBooth paper](https://arxiv.org/abs/2208.12242) in this regard. +""" + +""" +## Download the instance and class images + +DreamBooth uses a technique called "prior preservation" to meaningfully guide the +training procedure such that the fine-tuned models can still preserve some of the prior +semantics of the visual concept you're introducing. To know more about the idea of "prior +preservation" refer to [this document](https://dreambooth.github.io/). + +Here, we need to introduce a few key terms specific to DreamBooth: + +* **Unique class**: Examples include "dog", "person", etc. In this example, we use "dog". +* **Unique identifier**: A unique identifier that is prepended to the unique class while +forming the "instance prompts". In this example, we use "sks" as this unique identifier. +* **Instance prompt**: Denotes a prompt that best describes the "instance images". An +example prompt could be - "f"a photo of {unique_id} {unique_class}". 
So, for our example, +this becomes - "a photo of sks dog". +* **Class prompt**: Denotes a prompt without the unique identifier. This prompt is used +for generating "class images" for prior preservation. For our example, this prompt is - +"a photo of dog". +* **Instance images**: Denote the images that represent the visual concept you're trying +to teach aka the "instance prompt". This number is typically just 3 - 5. We typically +gather these images ourselves. +* **Class images**: Denote the images generated using the "class prompt" for using prior +preservation in DreamBooth training. We leverage the pre-trained model before fine-tuning +it to generate these class images. Typically, 200 - 300 class images are enough. + +In code, this generation process looks quite simply: + +```py +from tqdm import tqdm +import numpy as np +import hashlib +import keras_cv +import PIL +import os + +class_images_dir = "class-images" +os.makedirs(class_images_dir, exist_ok=True) + +model = keras_cv.models.StableDiffusion(img_width=512, img_height=512, jit_compile=True) + +class_prompt = "a photo of dog" +num_imgs_to_generate = 200 +for i in tqdm(range(num_imgs_to_generate)): + images = model.text_to_image( + class_prompt, + batch_size=3, + ) + idx = np.random.choice(len(images)) + selected_image = PIL.Image.fromarray(images[idx]) + hash_image = hashlib.sha1(selected_image.tobytes()).hexdigest() + image_filename = os.path.join(class_images_dir, f"{hash_image}.jpg") + selected_image.save(image_filename) +``` + +To keep the runtime of this example short, the authors of this example have gone ahead +and generated some class images using +[this notebook](https://colab.research.google.com/gist/sayakpaul/6b5de345d29cf5860f84b6d04d958692/generate_class_priors.ipynb). + +**Note** that prior preservation is an optional technique used in DreamBooth, but it +almost always helps in improving the quality of the generated images. +""" + +instance_images_root = tf.keras.utils.get_file( + origin="https://huggingface.co/datasets/sayakpaul/sample-datasets/resolve/main/instance-images.tar.gz", + untar=True, +) +class_images_root = tf.keras.utils.get_file( + origin="https://huggingface.co/datasets/sayakpaul/sample-datasets/resolve/main/class-images.tar.gz", + untar=True, +) + +""" +## Visualize images + +First, let's load the image paths. +""" +instance_image_paths = list(paths.list_images(instance_images_root)) +class_image_paths = list(paths.list_images(class_images_root)) + +""" +Then we load the images from the paths. +""" + + +def load_images(image_paths): + images = [np.array(keras.utils.load_img(path)) for path in image_paths] + return images + + +""" +And then we make use a utility function to plot the loaded images. +""" + + +def plot_images(images, title=None): + plt.figure(figsize=(20, 20)) + for i in range(len(images)): + ax = plt.subplot(1, len(images), i + 1) + if title is not None: + plt.title(title) + plt.imshow(images[i]) + plt.axis("off") + + +""" +**Instance images**: +""" + +plot_images(load_images(instance_image_paths[:5])) + +""" +**Class images**: +""" + +plot_images(load_images(class_image_paths[:5])) + +""" +## Prepare datasets + +Dataset preparation includes two stages: (1): preparing the captions, (2) processing the +images. +""" + +""" +### Prepare the captions +""" + +# Since we're using prior preservation, we need to match the number +# of instance images we're using. We just repeat the instance image paths +# to do so. 
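# (Illustrative arithmetic only: with, say, 200 class images and 5 instance images,
# each instance image path ends up repeated 40 times by the loop below.)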
+new_instance_image_paths = [] +for index in range(len(class_image_paths)): + instance_image = instance_image_paths[index % len(instance_image_paths)] + new_instance_image_paths.append(instance_image) + +# We just repeat the prompts / captions per images. +unique_id = "sks" +class_label = "dog" + +instance_prompt = f"a photo of {unique_id} {class_label}" +instance_prompts = [instance_prompt] * len(new_instance_image_paths) + +class_prompt = f"a photo of {class_label}" +class_prompts = [class_prompt] * len(class_image_paths) + +""" +Next, we embed the prompts to save some compute. +""" + +import itertools + +# The padding token and maximum prompt length are specific to the text encoder. +# If you're using a different text encoder be sure to change them accordingly. +padding_token = 49407 +max_prompt_length = 77 + +# Load the tokenizer. +tokenizer = keras_cv.models.stable_diffusion.SimpleTokenizer() + + +# Method to tokenize and pad the tokens. +def process_text(caption): + tokens = tokenizer.encode(caption) + tokens = tokens + [padding_token] * (max_prompt_length - len(tokens)) + return np.array(tokens) + + +# Collate the tokenized captions into an array. +tokenized_texts = np.empty( + (len(instance_prompts) + len(class_prompts), max_prompt_length) +) + +for i, caption in enumerate(itertools.chain(instance_prompts, class_prompts)): + tokenized_texts[i] = process_text(caption) + + +# We also pre-compute the text embeddings to save some memory during training. +POS_IDS = tf.convert_to_tensor([list(range(max_prompt_length))], dtype=tf.int32) +text_encoder = keras_cv.models.stable_diffusion.TextEncoder(max_prompt_length) + +gpus = tf.config.list_logical_devices("GPU") + +# Ensure the computation takes place on a GPU. +# Note that it's done automatically when there's a GPU present. +# This example just attempts at showing how you can do it +# more explicitly. +with tf.device(gpus[0].name): + embedded_text = text_encoder( + [tf.convert_to_tensor(tokenized_texts), POS_IDS], training=False + ).numpy() + +# To ensure text_encoder doesn't occupy any GPU space. 
+del text_encoder + +""" +## Prepare the images +""" + +resolution = 512 +auto = tf.data.AUTOTUNE + +augmenter = keras.Sequential( + layers=[ + keras_cv.layers.CenterCrop(resolution, resolution), + keras_cv.layers.RandomFlip(), + keras.layers.Rescaling(scale=1.0 / 127.5, offset=-1), + ] +) + + +def process_image(image_path, tokenized_text): + image = tf.io.read_file(image_path) + image = tf.io.decode_png(image, 3) + image = tf.image.resize(image, (resolution, resolution)) + return image, tokenized_text + + +def apply_augmentation(image_batch, embedded_tokens): + return augmenter(image_batch), embedded_tokens + + +def prepare_dict(instance_only=True): + def fn(image_batch, embedded_tokens): + if instance_only: + batch_dict = { + "instance_images": image_batch, + "instance_embedded_texts": embedded_tokens, + } + return batch_dict + else: + batch_dict = { + "class_images": image_batch, + "class_embedded_texts": embedded_tokens, + } + return batch_dict + + return fn + + +def assemble_dataset(image_paths, embedded_texts, instance_only=True, batch_size=1): + dataset = tf.data.Dataset.from_tensor_slices((image_paths, embedded_texts)) + dataset = dataset.map(process_image, num_parallel_calls=auto) + dataset = dataset.shuffle(5, reshuffle_each_iteration=True) + dataset = dataset.batch(batch_size) + dataset = dataset.map(apply_augmentation, num_parallel_calls=auto) + + prepare_dict_fn = prepare_dict(instance_only=instance_only) + dataset = dataset.map(prepare_dict_fn, num_parallel_calls=auto) + return dataset + + +""" +## Assemble dataset +""" +instance_dataset = assemble_dataset( + new_instance_image_paths, + embedded_text[: len(new_instance_image_paths)], +) +class_dataset = assemble_dataset( + class_image_paths, + embedded_text[len(new_instance_image_paths) :], + instance_only=False, +) +train_dataset = tf.data.Dataset.zip((instance_dataset, class_dataset)) +""" +## Check shapes + +Now that the dataset has been prepared, let's quickly check what's inside it. +""" + +sample_batch = next(iter(train_dataset)) +print(sample_batch[0].keys(), sample_batch[1].keys()) + +for k in sample_batch[0]: + print(k, sample_batch[0][k].shape) + +for k in sample_batch[1]: + print(k, sample_batch[1][k].shape) + +""" +During training, we make use of these keys to gather the images and text embeddings and +concat them accordingly. +""" + +""" +## DreamBooth training loop + +Our DreamBooth training loop is very much inspired by +[this script](https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/train_dreambooth.py) +provided by the Diffusers team at Hugging Face. However, there is an important +difference to note. We only fine-tune the UNet (the model responsible for predicting +noise) and don't fine-tune the text encoder in this example. If you're looking for an +implementation that also performs the additional fine-tuning of the text encoder, refer +to [this repository](https://github.com/sayakpaul/dreambooth-keras/). 
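At a high level, the objective optimized by the trainer below is the usual denoising loss on the instance batch plus a weighted prior-preservation term on the class batch; a sketch in our own notation (the weight $\lambda$ corresponds to `prior_loss_weight` in the code):

$$\mathcal{L} = \mathbb{E}\big[\lVert \epsilon - \epsilon_\theta(z_t, t, c)\rVert^2\big] \;+\; \lambda\, \mathbb{E}\big[\lVert \epsilon' - \epsilon_\theta(z'_{t'}, t', c_{\text{class}})\rVert^2\big],$$

where $z_t$ and $z'_{t'}$ are noised latents of the instance and class images, and $c$ and $c_{\text{class}}$ are the corresponding text embeddings.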
+""" + +import tensorflow.experimental.numpy as tnp + + +class DreamBoothTrainer(tf.keras.Model): + # Reference: + # https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/train_dreambooth.py + + def __init__( + self, + diffusion_model, + vae, + noise_scheduler, + use_mixed_precision=False, + prior_loss_weight=1.0, + max_grad_norm=1.0, + **kwargs, + ): + super().__init__(**kwargs) + + self.diffusion_model = diffusion_model + self.vae = vae + self.noise_scheduler = noise_scheduler + self.prior_loss_weight = prior_loss_weight + self.max_grad_norm = max_grad_norm + + self.use_mixed_precision = use_mixed_precision + self.vae.trainable = False + + def train_step(self, inputs): + instance_batch = inputs[0] + class_batch = inputs[1] + + instance_images = instance_batch["instance_images"] + instance_embedded_text = instance_batch["instance_embedded_texts"] + class_images = class_batch["class_images"] + class_embedded_text = class_batch["class_embedded_texts"] + + images = tf.concat([instance_images, class_images], 0) + embedded_texts = tf.concat([instance_embedded_text, class_embedded_text], 0) + batch_size = tf.shape(images)[0] + + with tf.GradientTape() as tape: + # Project image into the latent space and sample from it. + latents = self.sample_from_encoder_outputs(self.vae(images, training=False)) + # Know more about the magic number here: + # https://keras.io/examples/generative/fine_tune_via_textual_inversion/ + latents = latents * 0.18215 + + # Sample noise that we'll add to the latents. + noise = tf.random.normal(tf.shape(latents)) + + # Sample a random timestep for each image. + timesteps = tnp.random.randint( + 0, self.noise_scheduler.train_timesteps, (batch_size,) + ) + + # Add noise to the latents according to the noise magnitude at each timestep + # (this is the forward diffusion process). + noisy_latents = self.noise_scheduler.add_noise( + tf.cast(latents, noise.dtype), noise, timesteps + ) + + # Get the target for loss depending on the prediction type + # just the sampled noise for now. + target = noise # noise_schedule.predict_epsilon == True + + # Predict the noise residual and compute loss. + timestep_embedding = tf.map_fn( + lambda t: self.get_timestep_embedding(t), timesteps, dtype=tf.float32 + ) + model_pred = self.diffusion_model( + [noisy_latents, timestep_embedding, embedded_texts], training=True + ) + loss = self.compute_loss(target, model_pred) + if self.use_mixed_precision: + loss = self.optimizer.get_scaled_loss(loss) + + # Update parameters of the diffusion model. 
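        # Note: when mixed precision is enabled, the loss was scaled above, so the
        # gradients computed here must be unscaled before they are clipped and applied.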
+ trainable_vars = self.diffusion_model.trainable_variables + gradients = tape.gradient(loss, trainable_vars) + if self.use_mixed_precision: + gradients = self.optimizer.get_unscaled_gradients(gradients) + gradients = [tf.clip_by_norm(g, self.max_grad_norm) for g in gradients] + self.optimizer.apply_gradients(zip(gradients, trainable_vars)) + + return {m.name: m.result() for m in self.metrics} + + def get_timestep_embedding(self, timestep, dim=320, max_period=10000): + half = dim // 2 + log_max_period = tf.math.log(tf.cast(max_period, tf.float32)) + freqs = tf.math.exp( + -log_max_period * tf.range(0, half, dtype=tf.float32) / half + ) + args = tf.convert_to_tensor([timestep], dtype=tf.float32) * freqs + embedding = tf.concat([tf.math.cos(args), tf.math.sin(args)], 0) + return embedding + + def sample_from_encoder_outputs(self, outputs): + mean, logvar = tf.split(outputs, 2, axis=-1) + logvar = tf.clip_by_value(logvar, -30.0, 20.0) + std = tf.exp(0.5 * logvar) + sample = tf.random.normal(tf.shape(mean), dtype=mean.dtype) + return mean + std * sample + + def compute_loss(self, target, model_pred): + # Chunk the noise and model_pred into two parts and compute the loss + # on each part separately. + # Since the first half of the inputs has instance samples and the second half + # has class samples, we do the chunking accordingly. + model_pred, model_pred_prior = tf.split( + model_pred, num_or_size_splits=2, axis=0 + ) + target, target_prior = tf.split(target, num_or_size_splits=2, axis=0) + + # Compute instance loss. + loss = self.compiled_loss(target, model_pred) + + # Compute prior loss. + prior_loss = self.compiled_loss(target_prior, model_pred_prior) + + # Add the prior loss to the instance loss. + loss = loss + self.prior_loss_weight * prior_loss + return loss + + def save_weights(self, filepath, overwrite=True, save_format=None, options=None): + # Overriding this method will allow us to use the `ModelCheckpoint` + # callback directly with this trainer class. In this case, it will + # only checkpoint the `diffusion_model` since that's what we're training + # during fine-tuning. + self.diffusion_model.save_weights( + filepath=filepath, + overwrite=overwrite, + save_format=save_format, + options=options, + ) + + def load_weights(self, filepath, by_name=False, skip_mismatch=False, options=None): + # Similarly override `load_weights()` so that we can directly call it on + # the trainer class object. + self.diffusion_model.load_weights( + filepath=filepath, + by_name=by_name, + skip_mismatch=skip_mismatch, + options=options, + ) + + +""" +## Trainer initialization +""" + +# Comment it if you are not using a GPU having tensor cores. +tf.keras.mixed_precision.set_global_policy("mixed_float16") + +use_mp = True # Set it to False if you're not using a GPU with tensor cores. + +image_encoder = keras_cv.models.stable_diffusion.ImageEncoder() +dreambooth_trainer = DreamBoothTrainer( + diffusion_model=keras_cv.models.stable_diffusion.DiffusionModel( + resolution, resolution, max_prompt_length + ), + # Remove the top layer from the encoder, which cuts off the variance and only + # returns the mean. 
    vae=tf.keras.Model(
        image_encoder.input,
        image_encoder.layers[-2].output,
    ),
    noise_scheduler=keras_cv.models.stable_diffusion.NoiseScheduler(),
    use_mixed_precision=use_mp,
)

# These hyperparameters come from this tutorial by Hugging Face:
# https://github.com/huggingface/diffusers/tree/main/examples/dreambooth
learning_rate = 5e-6
beta_1, beta_2 = 0.9, 0.999
weight_decay = 1e-2
epsilon = 1e-08

optimizer = tf.keras.optimizers.experimental.AdamW(
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    beta_1=beta_1,
    beta_2=beta_2,
    epsilon=epsilon,
)
dreambooth_trainer.compile(optimizer=optimizer, loss="mse")

"""
## Train!

We first calculate the number of epochs we need to train for.
"""

num_update_steps_per_epoch = train_dataset.cardinality()
max_train_steps = 800
epochs = math.ceil(max_train_steps / num_update_steps_per_epoch)
print(f"Training for {epochs} epochs.")

"""
And then we start training!
"""

ckpt_path = "dreambooth-unet.h5"
ckpt_callback = tf.keras.callbacks.ModelCheckpoint(
    ckpt_path,
    save_weights_only=True,
    monitor="loss",
    mode="min",
)
dreambooth_trainer.fit(train_dataset, epochs=epochs, callbacks=[ckpt_callback])

"""
## Experiments and inference

We ran various experiments with a slightly modified version of this example. Our
experiments are based on
[this repository](https://github.com/sayakpaul/dreambooth-keras/) and are inspired by
[this blog post](https://huggingface.co/blog/dreambooth) from Hugging Face.

First, let's see how we can use the fine-tuned checkpoint for running inference.
"""

# Initialize a new Stable Diffusion model.
dreambooth_model = keras_cv.models.StableDiffusion(
    img_width=resolution, img_height=resolution, jit_compile=True
)
dreambooth_model.diffusion_model.load_weights(ckpt_path)

# Note how the unique identifier and the class have been used in the prompt.
prompt = f"A photo of {unique_id} {class_label} in a bucket"
num_imgs_to_gen = 3

images_dreamboothed = dreambooth_model.text_to_image(prompt, batch_size=num_imgs_to_gen)
plot_images(images_dreamboothed, prompt)

"""
Now, let's load checkpoints from a different experiment we conducted where we also
fine-tuned the text encoder along with the UNet:
"""

unet_weights = tf.keras.utils.get_file(
    origin="https://huggingface.co/chansung/dreambooth-dog/resolve/main/lr%409e-06-max_train_steps%40200-train_text_encoder%40True-unet.h5"
)
text_encoder_weights = tf.keras.utils.get_file(
    origin="https://huggingface.co/chansung/dreambooth-dog/resolve/main/lr%409e-06-max_train_steps%40200-train_text_encoder%40True-text_encoder.h5"
)

dreambooth_model.diffusion_model.load_weights(unet_weights)
dreambooth_model.text_encoder.load_weights(text_encoder_weights)

images_dreamboothed = dreambooth_model.text_to_image(prompt, batch_size=num_imgs_to_gen)
plot_images(images_dreamboothed, prompt)

"""
The default number of steps for generating an image in `text_to_image()`
[is 50](https://github.com/keras-team/keras-cv/blob/3575bc3b944564fe15b46b917e6555aa6a9d7be0/keras_cv/models/stable_diffusion/stable_diffusion.py#L73).
Let's increase it to 100.
"""

images_dreamboothed = dreambooth_model.text_to_image(
    prompt, batch_size=num_imgs_to_gen, num_steps=100
)
plot_images(images_dreamboothed, prompt)

"""
Feel free to experiment with different prompts (don't forget to add the unique identifier
and the class label!) to see how the results change.
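For example, here are a couple of illustrative variations (the prompts themselves are ours, not from the original experiments); they reuse `dreambooth_model`, `unique_id`, `class_label`, `num_imgs_to_gen`, and `plot_images` defined above:

```py
for alt_prompt in [
    f"a photo of {unique_id} {class_label} swimming in a pool",
    f"an impressionist painting of {unique_id} {class_label}",
]:
    alt_images = dreambooth_model.text_to_image(alt_prompt, batch_size=num_imgs_to_gen)
    plot_images(alt_images, alt_prompt)
```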
We welcome you to check out our +codebase and more experimental results +[here](https://github.com/sayakpaul/dreambooth-keras#results). You can also read +[this blog post](https://huggingface.co/blog/dreambooth) to get more ideas. +""" + +""" +## Acknowledgements + +* Thanks to the +[DreamBooth example script](https://github.com/huggingface/diffusers/blob/main/examples/dreambooth/train_dreambooth.py) +provided by Hugging Face which helped us a lot in getting the initial implementation +ready quickly. +* Getting DreamBooth to work on human faces can be challenging. We have compiled some +general recommendations +[here](https://github.com/sayakpaul/dreambooth-keras#notes-on-preparing-data-for-dreambooth-training-of-faces). +Thanks to +[Abhishek Thakur](https://no.linkedin.com/in/abhi1thakur) +for helping with these. +""" diff --git a/knowledge_base/generative/fine_tune_via_textual_inversion.py b/knowledge_base/generative/fine_tune_via_textual_inversion.py new file mode 100644 index 0000000000000000000000000000000000000000..9ad904d91b72b380ec786c77ee6750c610193c76 --- /dev/null +++ b/knowledge_base/generative/fine_tune_via_textual_inversion.py @@ -0,0 +1,729 @@ +""" +Title: Teach StableDiffusion new concepts via Textual Inversion +Authors: Ian Stenbit, [lukewood](https://lukewood.xyz) +Date created: 2022/12/09 +Last modified: 2022/12/09 +Description: Learning new visual concepts with KerasCV's StableDiffusion implementation. +""" + +""" +## Textual Inversion + +Since its release, StableDiffusion has quickly become a favorite amongst +the generative machine learning community. +The high volume of traffic has led to open source contributed improvements, +heavy prompt engineering, and even the invention of novel algorithms. + +Perhaps the most impressive new algorithm being used is +[Textual Inversion](https://github.com/rinongal/textual_inversion), presented in +[_An Image is Worth One Word: Personalizing Text-to-Image Generation using Textual Inversion_](https://textual-inversion.github.io/). + +Textual Inversion is the process of teaching an image generator a specific visual concept +through the use of fine-tuning. In the diagram below, you can see an +example of this process where the authors teach the model new concepts, calling them +"S_*". + +![https://i.imgur.com/KqEeBsM.jpg](https://i.imgur.com/KqEeBsM.jpg) + +Conceptually, textual inversion works by learning a token embedding for a new text +token, keeping the remaining components of StableDiffusion frozen. + +This guide shows you how to fine-tune the StableDiffusion model shipped in KerasCV +using the Textual-Inversion algorithm. By the end of the guide, you will be able to +write the "Gandalf the Gray as a <my-funny-cat-token>". + +![https://i.imgur.com/rcb1Yfx.png](https://i.imgur.com/rcb1Yfx.png) + + +First, let's import the packages we need, and create a +StableDiffusion instance so we can use some of its subcomponents for fine-tuning. 
+""" + +"""shell +pip install -q git+https://github.com/keras-team/keras-cv.git +pip install -q tensorflow==2.11.0 +""" + +import math + +import keras_cv +import numpy as np +import tensorflow as tf +from keras_cv import layers as cv_layers +from keras_cv.models.stable_diffusion import NoiseScheduler +from tensorflow import keras +import matplotlib.pyplot as plt + +stable_diffusion = keras_cv.models.StableDiffusion() + +""" +Next, let's define a visualization utility to show off the generated images: +""" + + +def plot_images(images): + plt.figure(figsize=(20, 20)) + for i in range(len(images)): + ax = plt.subplot(1, len(images), i + 1) + plt.imshow(images[i]) + plt.axis("off") + + +""" +## Assembling a text-image pair dataset + +In order to train the embedding of our new token, we first must assemble a dataset +consisting of text-image pairs. +Each sample from the dataset must contain an image of the concept we are teaching +StableDiffusion, as well as a caption accurately representing the content of the image. +In this tutorial, we will teach StableDiffusion the concept of Luke and Ian's GitHub +avatars: + +![gh-avatars](https://i.imgur.com/WyEHDIR.jpg) + +First, let's construct an image dataset of cat dolls: +""" + + +def assemble_image_dataset(urls): + # Fetch all remote files + files = [tf.keras.utils.get_file(origin=url) for url in urls] + + # Resize images + resize = keras.layers.Resizing(height=512, width=512, crop_to_aspect_ratio=True) + images = [keras.utils.load_img(img) for img in files] + images = [keras.utils.img_to_array(img) for img in images] + images = np.array([resize(img) for img in images]) + + # The StableDiffusion image encoder requires images to be normalized to the + # [-1, 1] pixel value range + images = images / 127.5 - 1 + + # Create the tf.data.Dataset + image_dataset = tf.data.Dataset.from_tensor_slices(images) + + # Shuffle and introduce random noise + image_dataset = image_dataset.shuffle(50, reshuffle_each_iteration=True) + image_dataset = image_dataset.map( + cv_layers.RandomCropAndResize( + target_size=(512, 512), + crop_area_factor=(0.8, 1.0), + aspect_ratio_factor=(1.0, 1.0), + ), + num_parallel_calls=tf.data.AUTOTUNE, + ) + image_dataset = image_dataset.map( + cv_layers.RandomFlip(mode="horizontal"), + num_parallel_calls=tf.data.AUTOTUNE, + ) + return image_dataset + + +""" +Next, we assemble a text dataset: +""" + +MAX_PROMPT_LENGTH = 77 +placeholder_token = "" + + +def pad_embedding(embedding): + return embedding + ( + [stable_diffusion.tokenizer.end_of_text] * (MAX_PROMPT_LENGTH - len(embedding)) + ) + + +stable_diffusion.tokenizer.add_tokens(placeholder_token) + + +def assemble_text_dataset(prompts): + prompts = [prompt.format(placeholder_token) for prompt in prompts] + embeddings = [stable_diffusion.tokenizer.encode(prompt) for prompt in prompts] + embeddings = [np.array(pad_embedding(embedding)) for embedding in embeddings] + text_dataset = tf.data.Dataset.from_tensor_slices(embeddings) + text_dataset = text_dataset.shuffle(100, reshuffle_each_iteration=True) + return text_dataset + + +""" +Finally, we zip our datasets together to produce a text-image pair dataset. +""" + + +def assemble_dataset(urls, prompts): + image_dataset = assemble_image_dataset(urls) + text_dataset = assemble_text_dataset(prompts) + # the image dataset is quite short, so we repeat it to match the length of the + # text prompt dataset + image_dataset = image_dataset.repeat() + # we use the text prompt dataset to determine the length of the dataset. 
Due to + # the fact that there are relatively few prompts we repeat the dataset 5 times. + # we have found that this anecdotally improves results. + text_dataset = text_dataset.repeat(5) + return tf.data.Dataset.zip((image_dataset, text_dataset)) + + +""" +In order to ensure our prompts are descriptive, we use extremely generic prompts. + +Let's try this out with some sample images and prompts. +""" + +train_ds = assemble_dataset( + urls=[ + "https://i.imgur.com/VIedH1X.jpg", + "https://i.imgur.com/eBw13hE.png", + "https://i.imgur.com/oJ3rSg7.png", + "https://i.imgur.com/5mCL6Df.jpg", + "https://i.imgur.com/4Q6WWyI.jpg", + ], + prompts=[ + "a photo of a {}", + "a rendering of a {}", + "a cropped photo of the {}", + "the photo of a {}", + "a photo of a clean {}", + "a dark photo of the {}", + "a photo of my {}", + "a photo of the cool {}", + "a close-up photo of a {}", + "a bright photo of the {}", + "a cropped photo of a {}", + "a photo of the {}", + "a good photo of the {}", + "a photo of one {}", + "a close-up photo of the {}", + "a rendition of the {}", + "a photo of the clean {}", + "a rendition of a {}", + "a photo of a nice {}", + "a good photo of a {}", + "a photo of the nice {}", + "a photo of the small {}", + "a photo of the weird {}", + "a photo of the large {}", + "a photo of a cool {}", + "a photo of a small {}", + ], +) + +""" +## On the importance of prompt accuracy + +During our first attempt at writing this guide we included images of groups of these cat +dolls in our dataset but continued to use the generic prompts listed above. +Our results were anecdotally poor. For example, here's cat doll gandalf using this method: + +![mediocre-wizard](https://i.imgur.com/Thq7XOu.jpg) + +It's conceptually close, but it isn't as great as it can be. + +In order to remedy this, we began experimenting with splitting our images into images of +singular cat dolls and groups of cat dolls. +Following this split, we came up with new prompts for the group shots. + +Training on text-image pairs that accurately represent the content boosted the quality +of our results *substantially*. This speaks to the importance of prompt accuracy. + +In addition to separating the images into singular and group images, we also remove some +inaccurate prompts; such as "a dark photo of the {}" + +Keeping this in mind, we assemble our final training dataset below: +""" + +single_ds = assemble_dataset( + urls=[ + "https://i.imgur.com/VIedH1X.jpg", + "https://i.imgur.com/eBw13hE.png", + "https://i.imgur.com/oJ3rSg7.png", + "https://i.imgur.com/5mCL6Df.jpg", + "https://i.imgur.com/4Q6WWyI.jpg", + ], + prompts=[ + "a photo of a {}", + "a rendering of a {}", + "a cropped photo of the {}", + "the photo of a {}", + "a photo of a clean {}", + "a photo of my {}", + "a photo of the cool {}", + "a close-up photo of a {}", + "a bright photo of the {}", + "a cropped photo of a {}", + "a photo of the {}", + "a good photo of the {}", + "a photo of one {}", + "a close-up photo of the {}", + "a rendition of the {}", + "a photo of the clean {}", + "a rendition of a {}", + "a photo of a nice {}", + "a good photo of a {}", + "a photo of the nice {}", + "a photo of the small {}", + "a photo of the weird {}", + "a photo of the large {}", + "a photo of a cool {}", + "a photo of a small {}", + ], +) + +""" +![https://i.imgur.com/gQCRjK6.png](https://i.imgur.com/gQCRjK6.png) + +Looks great! 
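+
+As a quick sanity check (this snippet is an addition, not part of the original guide), we
+can pull one image/prompt pair out of `single_ds` and confirm the shapes are what the
+image encoder and text encoder expect: a 512x512x3 image and a length-77 token sequence.
+"""
+
+sample_image, sample_tokens = next(iter(single_ds))
+print("image shape:", sample_image.shape)  # expected: (512, 512, 3)
+print("tokenized prompt shape:", sample_tokens.shape)  # expected: (77,)
+
+"""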
+ +Next, we assemble a dataset of groups of our GitHub avatars: +""" + +group_ds = assemble_dataset( + urls=[ + "https://i.imgur.com/yVmZ2Qa.jpg", + "https://i.imgur.com/JbyFbZJ.jpg", + "https://i.imgur.com/CCubd3q.jpg", + ], + prompts=[ + "a photo of a group of {}", + "a rendering of a group of {}", + "a cropped photo of the group of {}", + "the photo of a group of {}", + "a photo of a clean group of {}", + "a photo of my group of {}", + "a photo of a cool group of {}", + "a close-up photo of a group of {}", + "a bright photo of the group of {}", + "a cropped photo of a group of {}", + "a photo of the group of {}", + "a good photo of the group of {}", + "a photo of one group of {}", + "a close-up photo of the group of {}", + "a rendition of the group of {}", + "a photo of the clean group of {}", + "a rendition of a group of {}", + "a photo of a nice group of {}", + "a good photo of a group of {}", + "a photo of the nice group of {}", + "a photo of the small group of {}", + "a photo of the weird group of {}", + "a photo of the large group of {}", + "a photo of a cool group of {}", + "a photo of a small group of {}", + ], +) + +""" +![https://i.imgur.com/GY9Pf3D.png](https://i.imgur.com/GY9Pf3D.png) + +Finally, we concatenate the two datasets: +""" + +train_ds = single_ds.concatenate(group_ds) +train_ds = train_ds.batch(1).shuffle( + train_ds.cardinality(), reshuffle_each_iteration=True +) + +""" +## Adding a new token to the text encoder + +Next, we create a new text encoder for the StableDiffusion model and add our new +embedding for '' into the model. +""" +tokenized_initializer = stable_diffusion.tokenizer.encode("cat")[1] +new_weights = stable_diffusion.text_encoder.layers[2].token_embedding( + tf.constant(tokenized_initializer) +) + +# Get len of .vocab instead of tokenizer +new_vocab_size = len(stable_diffusion.tokenizer.vocab) + +# The embedding layer is the 2nd layer in the text encoder +old_token_weights = stable_diffusion.text_encoder.layers[ + 2 +].token_embedding.get_weights() +old_position_weights = stable_diffusion.text_encoder.layers[ + 2 +].position_embedding.get_weights() + +old_token_weights = old_token_weights[0] +new_weights = np.expand_dims(new_weights, axis=0) +new_weights = np.concatenate([old_token_weights, new_weights], axis=0) + + +""" +Let's construct a new TextEncoder and prepare it. +""" + +# Have to set download_weights False so we can init (otherwise tries to load weights) +new_encoder = keras_cv.models.stable_diffusion.TextEncoder( + keras_cv.models.stable_diffusion.stable_diffusion.MAX_PROMPT_LENGTH, + vocab_size=new_vocab_size, + download_weights=False, +) +for index, layer in enumerate(stable_diffusion.text_encoder.layers): + # Layer 2 is the embedding layer, so we omit it from our weight-copying + if index == 2: + continue + new_encoder.layers[index].set_weights(layer.get_weights()) + + +new_encoder.layers[2].token_embedding.set_weights([new_weights]) +new_encoder.layers[2].position_embedding.set_weights(old_position_weights) + +stable_diffusion._text_encoder = new_encoder +stable_diffusion._text_encoder.compile(jit_compile=True) + +""" +## Training + +Now we can move on to the exciting part: training! + +In TextualInversion, the only piece of the model that is trained is the embedding vector. +Let's freeze the rest of the model. 
+""" + + +stable_diffusion.diffusion_model.trainable = False +stable_diffusion.decoder.trainable = False +stable_diffusion.text_encoder.trainable = True + +stable_diffusion.text_encoder.layers[2].trainable = True + + +def traverse_layers(layer): + if hasattr(layer, "layers"): + for layer in layer.layers: + yield layer + if hasattr(layer, "token_embedding"): + yield layer.token_embedding + if hasattr(layer, "position_embedding"): + yield layer.position_embedding + + +for layer in traverse_layers(stable_diffusion.text_encoder): + if isinstance(layer, keras.layers.Embedding) or "clip_embedding" in layer.name: + layer.trainable = True + else: + layer.trainable = False + +new_encoder.layers[2].position_embedding.trainable = False + +""" +Let's confirm the proper weights are set to trainable. +""" + +all_models = [ + stable_diffusion.text_encoder, + stable_diffusion.diffusion_model, + stable_diffusion.decoder, +] +print([[w.shape for w in model.trainable_weights] for model in all_models]) + +""" +## Training the new embedding + +In order to train the embedding, we need a couple of utilities. +We import a NoiseScheduler from KerasCV, and define the following utilities below: + +- `sample_from_encoder_outputs` is a wrapper around the base StableDiffusion image +encoder which samples from the statistical distribution produced by the image +encoder, rather than taking just the mean (like many other SD applications) +- `get_timestep_embedding` produces an embedding for a specified timestep for the +diffusion model +- `get_position_ids` produces a tensor of position IDs for the text encoder (which is just a +series from `[1, MAX_PROMPT_LENGTH]`) +""" + + +# Remove the top layer from the encoder, which cuts off the variance and only returns +# the mean +training_image_encoder = keras.Model( + stable_diffusion.image_encoder.input, + stable_diffusion.image_encoder.layers[-2].output, +) + + +def sample_from_encoder_outputs(outputs): + mean, logvar = tf.split(outputs, 2, axis=-1) + logvar = tf.clip_by_value(logvar, -30.0, 20.0) + std = tf.exp(0.5 * logvar) + sample = tf.random.normal(tf.shape(mean)) + return mean + std * sample + + +def get_timestep_embedding(timestep, dim=320, max_period=10000): + half = dim // 2 + freqs = tf.math.exp( + -math.log(max_period) * tf.range(0, half, dtype=tf.float32) / half + ) + args = tf.convert_to_tensor([timestep], dtype=tf.float32) * freqs + embedding = tf.concat([tf.math.cos(args), tf.math.sin(args)], 0) + return embedding + + +def get_position_ids(): + return tf.convert_to_tensor([list(range(MAX_PROMPT_LENGTH))], dtype=tf.int32) + + +""" +Next, we implement a `StableDiffusionFineTuner`, which is a subclass of `keras.Model` +that overrides `train_step` to train the token embeddings of our text encoder. +This is the core of the Textual Inversion algorithm. + +Abstractly speaking, the train step takes a sample from the output of the frozen SD +image encoder's latent distribution for a training image, adds noise to that sample, and +then passes that noisy sample to the frozen diffusion model. +The hidden state of the diffusion model is the output of the text encoder for the prompt +corresponding to the image. + +Our final goal state is that the diffusion model is able to separate the noise from the +sample using the text encoding as hidden state, so our loss is the mean-squared error of +the noise and the output of the diffusion model (which has, ideally, removed the image +latents from the noise). 
+ +We compute gradients for only the token embeddings of the text encoder, and in the +train step we zero-out the gradients for all tokens other than the token that we're +learning. + +See in-line code comments for more details about the train step. +""" + + +class StableDiffusionFineTuner(keras.Model): + def __init__(self, stable_diffusion, noise_scheduler, **kwargs): + super().__init__(**kwargs) + self.stable_diffusion = stable_diffusion + self.noise_scheduler = noise_scheduler + + def train_step(self, data): + images, embeddings = data + + with tf.GradientTape() as tape: + # Sample from the predicted distribution for the training image + latents = sample_from_encoder_outputs(training_image_encoder(images)) + # The latents must be downsampled to match the scale of the latents used + # in the training of StableDiffusion. This number is truly just a "magic" + # constant that they chose when training the model. + latents = latents * 0.18215 + + # Produce random noise in the same shape as the latent sample + noise = tf.random.normal(tf.shape(latents)) + batch_dim = tf.shape(latents)[0] + + # Pick a random timestep for each sample in the batch + timesteps = tf.random.uniform( + (batch_dim,), + minval=0, + maxval=noise_scheduler.train_timesteps, + dtype=tf.int64, + ) + + # Add noise to the latents based on the timestep for each sample + noisy_latents = self.noise_scheduler.add_noise(latents, noise, timesteps) + + # Encode the text in the training samples to use as hidden state in the + # diffusion model + encoder_hidden_state = self.stable_diffusion.text_encoder( + [embeddings, get_position_ids()] + ) + + # Compute timestep embeddings for the randomly-selected timesteps for each + # sample in the batch + timestep_embeddings = tf.map_fn( + fn=get_timestep_embedding, + elems=timesteps, + fn_output_signature=tf.float32, + ) + + # Call the diffusion model + noise_pred = self.stable_diffusion.diffusion_model( + [noisy_latents, timestep_embeddings, encoder_hidden_state] + ) + + # Compute the mean-squared error loss and reduce it. + loss = self.compiled_loss(noise_pred, noise) + loss = tf.reduce_mean(loss, axis=2) + loss = tf.reduce_mean(loss, axis=1) + loss = tf.reduce_mean(loss) + + # Load the trainable weights and compute the gradients for them + trainable_weights = self.stable_diffusion.text_encoder.trainable_weights + grads = tape.gradient(loss, trainable_weights) + + # Gradients are stored in indexed slices, so we have to find the index + # of the slice(s) which contain the placeholder token. + index_of_placeholder_token = tf.reshape(tf.where(grads[0].indices == 49408), ()) + condition = grads[0].indices == 49408 + condition = tf.expand_dims(condition, axis=-1) + + # Override the gradients, zeroing out the gradients for all slices that + # aren't for the placeholder token, effectively freezing the weights for + # all other tokens. + grads[0] = tf.IndexedSlices( + values=tf.where(condition, grads[0].values, 0), + indices=grads[0].indices, + dense_shape=grads[0].dense_shape, + ) + + self.optimizer.apply_gradients(zip(grads, trainable_weights)) + return {"loss": loss} + + +""" +Before we start training, let's take a look at what StableDiffusion produces for our +token. +""" + +generated = stable_diffusion.text_to_image( + f"an oil painting of {placeholder_token}", seed=1337, batch_size=3 +) +plot_images(generated) + +""" +As you can see, the model still thinks of our token as a cat, as this was the seed token +we used to initialize our custom token. 
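+
+Before compiling, it can also be worth double-checking the token index that `train_step`
+above hard-codes (49408). This check is not part of the original guide; it assumes the
+placeholder token was appended right after the original 49408-entry vocabulary, which is
+what `add_tokens()` did earlier:
+"""
+
+# `encode()` wraps the prompt in start/end-of-text tokens, so the placeholder id is at index 1.
+placeholder_token_id = stable_diffusion.tokenizer.encode(placeholder_token)[1]
+print("placeholder token index:", placeholder_token_id)  # should match the 49408 used in `train_step`
+
+"""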
+ +Now, to get started with training, we can just `compile()` our model like any other +Keras model. Before doing so, we also instantiate a noise scheduler for training and +configure our training parameters such as learning rate and optimizer. +""" + +noise_scheduler = NoiseScheduler( + beta_start=0.00085, + beta_end=0.012, + beta_schedule="scaled_linear", + train_timesteps=1000, +) +trainer = StableDiffusionFineTuner(stable_diffusion, noise_scheduler, name="trainer") +EPOCHS = 50 +learning_rate = keras.optimizers.schedules.CosineDecay( + initial_learning_rate=1e-4, decay_steps=train_ds.cardinality() * EPOCHS +) +optimizer = keras.optimizers.Adam( + weight_decay=0.004, learning_rate=learning_rate, epsilon=1e-8, global_clipnorm=10 +) + +trainer.compile( + optimizer=optimizer, + # We are performing reduction manually in our train step, so none is required here. + loss=keras.losses.MeanSquaredError(reduction="none"), +) + +""" +To monitor training, we can produce a `keras.callbacks.Callback` to produce a few images +every epoch using our custom token. + +We create three callbacks with different prompts so that we can see how they progress +over the course of training. We use a fixed seed so that we can easily see the +progression of the learned token. +""" + + +class GenerateImages(keras.callbacks.Callback): + def __init__( + self, stable_diffusion, prompt, steps=50, frequency=10, seed=None, **kwargs + ): + super().__init__(**kwargs) + self.stable_diffusion = stable_diffusion + self.prompt = prompt + self.seed = seed + self.frequency = frequency + self.steps = steps + + def on_epoch_end(self, epoch, logs): + if epoch % self.frequency == 0: + images = self.stable_diffusion.text_to_image( + self.prompt, batch_size=3, num_steps=self.steps, seed=self.seed + ) + plot_images( + images, + ) + + +cbs = [ + GenerateImages( + stable_diffusion, prompt=f"an oil painting of {placeholder_token}", seed=1337 + ), + GenerateImages( + stable_diffusion, prompt=f"gandalf the gray as a {placeholder_token}", seed=1337 + ), + GenerateImages( + stable_diffusion, + prompt=f"two {placeholder_token} getting married, photorealistic, high quality", + seed=1337, + ), +] + +""" +Now, all that is left to do is to call `model.fit()`! +""" + +trainer.fit( + train_ds, + epochs=EPOCHS, + callbacks=cbs, +) + +""" +It's pretty fun to see how the model learns our new token over time. Play around with it +and see how you can tune training parameters and your training dataset to produce the +best images! +""" + +""" +## Taking the Fine Tuned Model for a Spin + +Now for the really fun part. We've learned a token embedding for our custom token, so +now we can generate images with StableDiffusion the same way we would for any other +token! + +Here are some fun example prompts to get you started, with sample outputs from our cat +doll token! +""" + +generated = stable_diffusion.text_to_image( + f"Gandalf as a {placeholder_token} fantasy art drawn by disney concept artists, " + "golden colour, high quality, highly detailed, elegant, sharp focus, concept art, " + "character concepts, digital painting, mystery, adventure", + batch_size=3, +) +plot_images(generated) + +""" +""" + +generated = stable_diffusion.text_to_image( + f"A masterpiece of a {placeholder_token} crying out to the heavens. 
" + f"Behind the {placeholder_token}, an dark, evil shade looms over it - sucking the " + "life right out of it.", + batch_size=3, +) +plot_images(generated) + +""" +""" + +generated = stable_diffusion.text_to_image( + f"An evil {placeholder_token}.", batch_size=3 +) +plot_images(generated) + +""" +""" + +generated = stable_diffusion.text_to_image( + f"A mysterious {placeholder_token} approaches the great pyramids of egypt.", + batch_size=3, +) +plot_images(generated) + +""" +## Conclusions + +Using the Textual Inversion algorithm you can teach StableDiffusion new concepts! + +Some possible next steps to follow: + +- Try out your own prompts +- Teach the model a style +- Gather a dataset of your favorite pet cat or dog and teach the model about it +""" diff --git a/knowledge_base/generative/finetune_stable_diffusion.py b/knowledge_base/generative/finetune_stable_diffusion.py new file mode 100644 index 0000000000000000000000000000000000000000..09e7c6fdf324f4fb23dc713104a15fd26378dce6 --- /dev/null +++ b/knowledge_base/generative/finetune_stable_diffusion.py @@ -0,0 +1,513 @@ +""" +Title: Fine-tuning Stable Diffusion +Author: [Sayak Paul](https://twitter.com/RisingSayak), [Chansung Park](https://twitter.com/algo_diver) +Date created: 2022/12/28 +Last modified: 2023/01/13 +Description: Fine-tuning Stable Diffusion using a custom image-caption dataset. +Accelerator: GPU +""" + +""" +## Introduction + +This tutorial shows how to fine-tune a +[Stable Diffusion model](https://keras.io/guides/keras_cv/generate_images_with_stable_diffusion/) +on a custom dataset of `{image, caption}` pairs. We build on top of the fine-tuning +script provided by Hugging Face +[here](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image.py). + +We assume that you have a high-level understanding of the Stable Diffusion model. +The following resources can be helpful if you're looking for more information in that regard: + +* [High-performance image generation using Stable Diffusion in KerasCV](https://keras.io/guides/keras_cv/generate_images_with_stable_diffusion/) +* [Stable Diffusion with Diffusers](https://huggingface.co/blog/stable_diffusion) + +It's highly recommended that you use a GPU with at least 30GB of memory to execute +the code. + +By the end of the guide, you'll be able to generate images of interesting Pokรฉmon: + +![custom-pokemons](https://i.imgur.com/X4m614M.png) + +The tutorial relies on KerasCV 0.4.0. Additionally, we need +at least TensorFlow 2.11 in order to use AdamW with mixed precision. +""" + +"""shell +pip install keras-cv==0.6.0 -q +pip install -U tensorflow -q +pip install keras-core -q +""" + +""" +## What are we fine-tuning? + +A Stable Diffusion model can be decomposed into several key models: + +* A text encoder that projects the input prompt to a latent space. (The caption +associated with an image is referred to as the "prompt".) +* A variational autoencoder (VAE) that projects an input image to a latent space acting +as an image vector space. +* A diffusion model that refines a latent vector and produces another latent vector, conditioned +on the encoded text prompt +* A decoder that generates images given a latent vector from the diffusion model. + +It's worth noting that during the process of generating an image from a text prompt, the +image encoder is not typically employed. + +However, during the process of fine-tuning, the workflow goes like the following: + +1. An input text prompt is projected to a latent space by the text encoder. 
+2. An input image is projected to a latent space by the image encoder portion of the VAE. +3. A small amount of noise is added to the image latent vector for a given timestep. +4. The diffusion model uses latent vectors from these two spaces along with a timestep embedding +to predict the noise that was added to the image latent. +5. A reconstruction loss is calculated between the predicted noise and the original noise +added in step 3. +6. Finally, the diffusion model parameters are optimized w.r.t this loss using +gradient descent. + +Note that only the diffusion model parameters are updated during fine-tuning, while the +(pre-trained) text and the image encoders are kept frozen. + +Don't worry if this sounds complicated. The code is much simpler than this! +""" + +""" +## Imports +""" + +from textwrap import wrap +import os + +import keras_cv +import matplotlib.pyplot as plt +import numpy as np +import pandas as pd +import tensorflow as tf +import tensorflow.experimental.numpy as tnp +from keras_cv.models.stable_diffusion.clip_tokenizer import SimpleTokenizer +from keras_cv.models.stable_diffusion.diffusion_model import DiffusionModel +from keras_cv.models.stable_diffusion.image_encoder import ImageEncoder +from keras_cv.models.stable_diffusion.noise_scheduler import NoiseScheduler +from keras_cv.models.stable_diffusion.text_encoder import TextEncoder +from tensorflow import keras + +""" +## Data loading + +We use the dataset +[Pokรฉmon BLIP captions](https://huggingface.co/datasets/lambdalabs/pokemon-blip-captions). +However, we'll use a slightly different version which was derived from the original +dataset to fit better with `tf.data`. Refer to +[the documentation](https://huggingface.co/datasets/sayakpaul/pokemon-blip-original-version) +for more details. +""" + +data_path = tf.keras.utils.get_file( + origin="https://huggingface.co/datasets/sayakpaul/pokemon-blip-original-version/resolve/main/pokemon_dataset.tar.gz", + untar=True, +) + +data_frame = pd.read_csv(os.path.join(data_path, "data.csv")) + +data_frame["image_path"] = data_frame["image_path"].apply( + lambda x: os.path.join(data_path, x) +) +data_frame.head() + +""" +Since we have only 833 `{image, caption}` pairs, we can precompute the text embeddings from +the captions. Moreover, the text encoder will be kept frozen during the course of +fine-tuning, so we can save some compute by doing this. + +Before we use the text encoder, we need to tokenize the captions. +""" + +# The padding token and maximum prompt length are specific to the text encoder. +# If you're using a different text encoder be sure to change them accordingly. +PADDING_TOKEN = 49407 +MAX_PROMPT_LENGTH = 77 + +# Load the tokenizer. +tokenizer = SimpleTokenizer() + + +# Method to tokenize and pad the tokens. +def process_text(caption): + tokens = tokenizer.encode(caption) + tokens = tokens + [PADDING_TOKEN] * (MAX_PROMPT_LENGTH - len(tokens)) + return np.array(tokens) + + +# Collate the tokenized captions into an array. +tokenized_texts = np.empty((len(data_frame), MAX_PROMPT_LENGTH)) + +all_captions = list(data_frame["caption"].values) +for i, caption in enumerate(all_captions): + tokenized_texts[i] = process_text(caption) + +""" +## Prepare a `tf.data.Dataset` + +In this section, we'll prepare a `tf.data.Dataset` object from the input image file paths +and their corresponding caption tokens. The section will include the following: + +* Pre-computation of the text embeddings from the tokenized captions. 
+* Loading and augmentation of the input images. +* Shuffling and batching of the dataset. +""" + +RESOLUTION = 256 +AUTO = tf.data.AUTOTUNE +POS_IDS = tf.convert_to_tensor([list(range(MAX_PROMPT_LENGTH))], dtype=tf.int32) + +augmenter = keras.Sequential( + layers=[ + keras_cv.layers.CenterCrop(RESOLUTION, RESOLUTION), + keras_cv.layers.RandomFlip(), + tf.keras.layers.Rescaling(scale=1.0 / 127.5, offset=-1), + ] +) +text_encoder = TextEncoder(MAX_PROMPT_LENGTH) + + +def process_image(image_path, tokenized_text): + image = tf.io.read_file(image_path) + image = tf.io.decode_png(image, 3) + image = tf.image.resize(image, (RESOLUTION, RESOLUTION)) + return image, tokenized_text + + +def apply_augmentation(image_batch, token_batch): + return augmenter(image_batch), token_batch + + +def run_text_encoder(image_batch, token_batch): + return ( + image_batch, + token_batch, + text_encoder([token_batch, POS_IDS], training=False), + ) + + +def prepare_dict(image_batch, token_batch, encoded_text_batch): + return { + "images": image_batch, + "tokens": token_batch, + "encoded_text": encoded_text_batch, + } + + +def prepare_dataset(image_paths, tokenized_texts, batch_size=1): + dataset = tf.data.Dataset.from_tensor_slices((image_paths, tokenized_texts)) + dataset = dataset.shuffle(batch_size * 10) + dataset = dataset.map(process_image, num_parallel_calls=AUTO).batch(batch_size) + dataset = dataset.map(apply_augmentation, num_parallel_calls=AUTO) + dataset = dataset.map(run_text_encoder, num_parallel_calls=AUTO) + dataset = dataset.map(prepare_dict, num_parallel_calls=AUTO) + return dataset.prefetch(AUTO) + + +""" +The baseline Stable Diffusion model was trained using images with 512x512 resolution. It's +unlikely for a model that's trained using higher-resolution images to transfer well to +lower-resolution images. However, the current model will lead to OOM if we keep the +resolution to 512x512 (without enabling mixed-precision). Therefore, in the interest of +interactive demonstrations, we kept the input resolution to 256x256. +""" + +# Prepare the dataset. +training_dataset = prepare_dataset( + np.array(data_frame["image_path"]), tokenized_texts, batch_size=4 +) + +# Take a sample batch and investigate. +sample_batch = next(iter(training_dataset)) + +for k in sample_batch: + print(k, sample_batch[k].shape) + +""" +We can also take a look at the training images and their corresponding captions. 
+""" + +plt.figure(figsize=(20, 10)) + +for i in range(3): + ax = plt.subplot(1, 4, i + 1) + plt.imshow((sample_batch["images"][i] + 1) / 2) + + text = tokenizer.decode(sample_batch["tokens"][i].numpy().squeeze()) + text = text.replace("<|startoftext|>", "") + text = text.replace("<|endoftext|>", "") + text = "\n".join(wrap(text, 12)) + plt.title(text, fontsize=15) + + plt.axis("off") + +""" +## A trainer class for the fine-tuning loop +""" + + +class Trainer(tf.keras.Model): + # Reference: + # https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image.py + + def __init__( + self, + diffusion_model, + vae, + noise_scheduler, + use_mixed_precision=False, + max_grad_norm=1.0, + **kwargs + ): + super().__init__(**kwargs) + + self.diffusion_model = diffusion_model + self.vae = vae + self.noise_scheduler = noise_scheduler + self.max_grad_norm = max_grad_norm + + self.use_mixed_precision = use_mixed_precision + self.vae.trainable = False + + def train_step(self, inputs): + images = inputs["images"] + encoded_text = inputs["encoded_text"] + batch_size = tf.shape(images)[0] + + with tf.GradientTape() as tape: + # Project image into the latent space and sample from it. + latents = self.sample_from_encoder_outputs(self.vae(images, training=False)) + # Know more about the magic number here: + # https://keras.io/examples/generative/fine_tune_via_textual_inversion/ + latents = latents * 0.18215 + + # Sample noise that we'll add to the latents. + noise = tf.random.normal(tf.shape(latents)) + + # Sample a random timestep for each image. + timesteps = tnp.random.randint( + 0, self.noise_scheduler.train_timesteps, (batch_size,) + ) + + # Add noise to the latents according to the noise magnitude at each timestep + # (this is the forward diffusion process). + noisy_latents = self.noise_scheduler.add_noise( + tf.cast(latents, noise.dtype), noise, timesteps + ) + + # Get the target for loss depending on the prediction type + # just the sampled noise for now. + target = noise # noise_schedule.predict_epsilon == True + + # Predict the noise residual and compute loss. + timestep_embedding = tf.map_fn( + lambda t: self.get_timestep_embedding(t), timesteps, dtype=tf.float32 + ) + timestep_embedding = tf.squeeze(timestep_embedding, 1) + model_pred = self.diffusion_model( + [noisy_latents, timestep_embedding, encoded_text], training=True + ) + loss = self.compiled_loss(target, model_pred) + if self.use_mixed_precision: + loss = self.optimizer.get_scaled_loss(loss) + + # Update parameters of the diffusion model. 
+ trainable_vars = self.diffusion_model.trainable_variables + gradients = tape.gradient(loss, trainable_vars) + if self.use_mixed_precision: + gradients = self.optimizer.get_unscaled_gradients(gradients) + gradients = [tf.clip_by_norm(g, self.max_grad_norm) for g in gradients] + self.optimizer.apply_gradients(zip(gradients, trainable_vars)) + + return {m.name: m.result() for m in self.metrics} + + def get_timestep_embedding(self, timestep, dim=320, max_period=10000): + half = dim // 2 + log_max_period = tf.math.log(tf.cast(max_period, tf.float32)) + freqs = tf.math.exp( + -log_max_period * tf.range(0, half, dtype=tf.float32) / half + ) + args = tf.convert_to_tensor([timestep], dtype=tf.float32) * freqs + embedding = tf.concat([tf.math.cos(args), tf.math.sin(args)], 0) + embedding = tf.reshape(embedding, [1, -1]) + return embedding + + def sample_from_encoder_outputs(self, outputs): + mean, logvar = tf.split(outputs, 2, axis=-1) + logvar = tf.clip_by_value(logvar, -30.0, 20.0) + std = tf.exp(0.5 * logvar) + sample = tf.random.normal(tf.shape(mean), dtype=mean.dtype) + return mean + std * sample + + def save_weights(self, filepath, overwrite=True, save_format=None, options=None): + # Overriding this method will allow us to use the `ModelCheckpoint` + # callback directly with this trainer class. In this case, it will + # only checkpoint the `diffusion_model` since that's what we're training + # during fine-tuning. + self.diffusion_model.save_weights( + filepath=filepath, + overwrite=overwrite, + save_format=save_format, + options=options, + ) + + +""" +One important implementation detail to note here: Instead of directly taking +the latent vector produced by the image encoder (which is a VAE), we sample from the +mean and log-variance predicted by it. This way, we can achieve better sample +quality and diversity. + +It's common to add support for mixed-precision training along with exponential +moving averaging of model weights for fine-tuning these models. However, in the interest +of brevity, we discard those elements. More on this later in the tutorial. +""" + +""" +## Initialize the trainer and compile it +""" + +# Enable mixed-precision training if the underlying GPU has tensor cores. +USE_MP = True +if USE_MP: + keras.mixed_precision.set_global_policy("mixed_float16") + +image_encoder = ImageEncoder() +diffusion_ft_trainer = Trainer( + diffusion_model=DiffusionModel(RESOLUTION, RESOLUTION, MAX_PROMPT_LENGTH), + # Remove the top layer from the encoder, which cuts off the variance and only + # returns the mean. + vae=tf.keras.Model( + image_encoder.input, + image_encoder.layers[-2].output, + ), + noise_scheduler=NoiseScheduler(), + use_mixed_precision=USE_MP, +) + +# These hyperparameters come from this tutorial by Hugging Face: +# https://huggingface.co/docs/diffusers/training/text2image +lr = 1e-5 +beta_1, beta_2 = 0.9, 0.999 +weight_decay = (1e-2,) +epsilon = 1e-08 + +optimizer = tf.keras.optimizers.experimental.AdamW( + learning_rate=lr, + weight_decay=weight_decay, + beta_1=beta_1, + beta_2=beta_2, + epsilon=epsilon, +) +diffusion_ft_trainer.compile(optimizer=optimizer, loss="mse") + +""" +## Fine-tuning + +To keep the runtime of this tutorial short, we just fine-tune for an epoch. 
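+
+If you would rather budget the fine-tuning in terms of update steps, a minimal sketch
+could look like the following (the target step count is purely illustrative, and this
+snippet only prints a suggestion; it does not change the `epochs` value used below):
+"""
+
+target_train_steps = 15000  # hypothetical budget, tune to your needs
+steps_per_epoch = int(training_dataset.cardinality().numpy())
+suggested_epochs = int(np.ceil(target_train_steps / steps_per_epoch))
+print(f"{steps_per_epoch} steps per epoch -> ~{suggested_epochs} epochs for {target_train_steps} steps.")
+
+"""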
+""" + +epochs = 1 +ckpt_path = "finetuned_stable_diffusion.h5" +ckpt_callback = tf.keras.callbacks.ModelCheckpoint( + ckpt_path, + save_weights_only=True, + monitor="loss", + mode="min", +) +diffusion_ft_trainer.fit(training_dataset, epochs=epochs, callbacks=[ckpt_callback]) + +""" +## Inference + +We fine-tuned the model for 60 epochs on an image resolution of 512x512. To allow +training with this resolution, we incorporated mixed-precision support. You can +check out +[this repository](https://github.com/sayakpaul/stabe-diffusion-keras-ft) +for more details. It additionally provides support for exponential moving averaging of +the fine-tuned model parameters and model checkpointing. + + +For this section, we'll use the checkpoint derived after 60 epochs of fine-tuning. +""" + +weights_path = tf.keras.utils.get_file( + origin="https://huggingface.co/sayakpaul/kerascv_sd_pokemon_finetuned/resolve/main/ckpt_epochs_72_res_512_mp_True.h5" +) + +img_height = img_width = 512 +pokemon_model = keras_cv.models.StableDiffusion( + img_width=img_width, img_height=img_height +) +# We just reload the weights of the fine-tuned diffusion model. +pokemon_model.diffusion_model.load_weights(weights_path) + +""" +Now, we can take this model for a test-drive. +""" + +prompts = ["Yoda", "Hello Kitty", "A pokemon with red eyes"] +images_to_generate = 3 +outputs = {} + +for prompt in prompts: + generated_images = pokemon_model.text_to_image( + prompt, batch_size=images_to_generate, unconditional_guidance_scale=40 + ) + outputs.update({prompt: generated_images}) + +""" +With 60 epochs of fine-tuning (a good number is about 70), the generated images were not +up to the mark. So, we experimented with the number of steps Stable Diffusion takes +during the inference time and the `unconditional_guidance_scale` parameter. + +We found the best results with this checkpoint with `unconditional_guidance_scale` set to +40. +""" + + +def plot_images(images, title): + plt.figure(figsize=(20, 20)) + for i in range(len(images)): + ax = plt.subplot(1, len(images), i + 1) + plt.imshow(images[i]) + plt.title(title, fontsize=12) + plt.axis("off") + + +for prompt in outputs: + plot_images(outputs[prompt], prompt) + +""" +We can notice that the model has started adapting to the style of our dataset. You can +check the +[accompanying repository](https://github.com/sayakpaul/stable-diffusion-keras-ft#results) +for more comparisons and commentary. If you're feeling adventurous to try out a demo, +you can check out +[this resource](https://huggingface.co/spaces/sayakpaul/pokemon-sd-kerascv). +""" + +""" +## Conclusion and acknowledgements + +We demonstrated how to fine-tune the Stable Diffusion model on a custom dataset. While +the results are far from aesthetically pleasing, we believe with more epochs of +fine-tuning, they will likely improve. To enable that, having support for gradient +accumulation and distributed training is crucial. This can be thought of as the next step +in this tutorial. + +There is another interesting way in which Stable Diffusion models can be fine-tuned, +called textual inversion. You can refer to +[this tutorial](https://keras.io/examples/generative/fine_tune_via_textual_inversion/) +to know more about it. + +We'd like to acknowledge the GCP Credit support from ML Developer Programs' team at +Google. We'd like to thank the Hugging Face team for providing the +[fine-tuning script](https://github.com/huggingface/diffusers/blob/main/examples/text_to_image/train_text_to_image.py) +. 
It's very readable and easy to understand. +""" diff --git a/knowledge_base/generative/gan_ada.py b/knowledge_base/generative/gan_ada.py new file mode 100644 index 0000000000000000000000000000000000000000..ac79a8209010850f716b7b8c353b435771b80a69 --- /dev/null +++ b/knowledge_base/generative/gan_ada.py @@ -0,0 +1,772 @@ +""" +Title: Data-efficient GANs with Adaptive Discriminator Augmentation +Author: [Andrรกs Bรฉres](https://www.linkedin.com/in/andras-beres-789190210) +Date created: 2021/10/28 +Last modified: 2025/01/23 +Description: Generating images from limited data using the Caltech Birds dataset. +Accelerator: GPU +""" + +""" +## Introduction + +### GANs + +[Generative Adversarial Networks (GANs)](https://arxiv.org/abs/1406.2661) are a popular +class of generative deep learning models, commonly used for image generation. They +consist of a pair of dueling neural networks, called the discriminator and the generator. +The discriminator's task is to distinguish real images from generated (fake) ones, while +the generator network tries to fool the discriminator by generating more and more +realistic images. If the generator is however too easy or too hard to fool, it might fail +to provide useful learning signal for the generator, therefore training GANs is usually +considered a difficult task. + +### Data augmentation for GANS + +Data augmentation, a popular technique in deep learning, is the process of randomly +applying semantics-preserving transformations to the input data to generate multiple +realistic versions of it, thereby effectively multiplying the amount of training data +available. The simplest example is left-right flipping an image, which preserves its +contents while generating a second unique training sample. Data augmentation is commonly +used in supervised learning to prevent overfitting and enhance generalization. + +The authors of [StyleGAN2-ADA](https://arxiv.org/abs/2006.06676) show that discriminator +overfitting can be an issue in GANs, especially when only low amounts of training data is +available. They propose Adaptive Discriminator Augmentation to mitigate this issue. + +Applying data augmentation to GANs however is not straightforward. Since the generator is +updated using the discriminator's gradients, if the generated images are augmented, the +augmentation pipeline has to be differentiable and also has to be GPU-compatible for +computational efficiency. Luckily, the +[Keras image augmentation layers](https://keras.io/api/layers/preprocessing_layers/image_augmentation/) +fulfill both these requirements, and are therefore very well suited for this task. + +### Invertible data augmentation + +A possible difficulty when using data augmentation in generative models is the issue of +["leaky augmentations" (section 2.2)](https://arxiv.org/abs/2006.06676), namely when the +model generates images that are already augmented. This would mean that it was not able +to separate the augmentation from the underlying data distribution, which can be caused +by using non-invertible data transformations. For example, if either 0, 90, 180 or 270 +degree rotations are performed with equal probability, the original orientation of the +images is impossible to infer, and this information is destroyed. + +A simple trick to make data augmentations invertible is to only apply them with some +probability. That way the original version of the images will be more common, and the +data distribution can be inferred. 
By properly choosing this probability, one can +effectively regularize the discriminator without making the augmentations leaky. + +""" + +""" +## Setup +""" + +import os + +os.environ["KERAS_BACKEND"] = "tensorflow" + +import matplotlib.pyplot as plt +import tensorflow as tf +import tensorflow_datasets as tfds + +import keras +from keras import ops +from keras import layers + +""" +## Hyperparameterers +""" + +# data +num_epochs = 10 # train for 400 epochs for good results +image_size = 64 +# resolution of Kernel Inception Distance measurement, see related section +kid_image_size = 75 +padding = 0.25 +dataset_name = "caltech_birds2011" + +# adaptive discriminator augmentation +max_translation = 0.125 +max_rotation = 0.125 +max_zoom = 0.25 +target_accuracy = 0.85 +integration_steps = 1000 + +# architecture +noise_size = 64 +depth = 4 +width = 128 +leaky_relu_slope = 0.2 +dropout_rate = 0.4 + +# optimization +batch_size = 128 +learning_rate = 2e-4 +beta_1 = 0.5 # not using the default value of 0.9 is important +ema = 0.99 + +""" +## Data pipeline + +In this example, we will use the +[Caltech Birds (2011)](https://www.tensorflow.org/datasets/catalog/caltech_birds2011) dataset for +generating images of birds, which is a diverse natural dataset containing less then 6000 +images for training. When working with such low amounts of data, one has to take extra +care to retain as high data quality as possible. In this example, we use the provided +bounding boxes of the birds to cut them out with square crops while preserving their +aspect ratios when possible. +""" + + +def round_to_int(float_value): + return ops.cast(ops.round(float_value), "int32") + + +def preprocess_image(data): + # unnormalize bounding box coordinates + height = ops.cast(ops.shape(data["image"])[0], "float32") + width = ops.cast(ops.shape(data["image"])[1], "float32") + bounding_box = data["bbox"] * ops.stack([height, width, height, width]) + + # calculate center and length of longer side, add padding + target_center_y = 0.5 * (bounding_box[0] + bounding_box[2]) + target_center_x = 0.5 * (bounding_box[1] + bounding_box[3]) + target_size = ops.maximum( + (1.0 + padding) * (bounding_box[2] - bounding_box[0]), + (1.0 + padding) * (bounding_box[3] - bounding_box[1]), + ) + + # modify crop size to fit into image + target_height = ops.min( + [target_size, 2.0 * target_center_y, 2.0 * (height - target_center_y)] + ) + target_width = ops.min( + [target_size, 2.0 * target_center_x, 2.0 * (width - target_center_x)] + ) + + # crop image, `ops.image.crop_images` only works with non-tensor croppings + image = ops.slice( + data["image"], + start_indices=( + round_to_int(target_center_y - 0.5 * target_height), + round_to_int(target_center_x - 0.5 * target_width), + 0, + ), + shape=(round_to_int(target_height), round_to_int(target_width), 3), + ) + + # resize and clip + image = ops.cast(image, "float32") + image = ops.image.resize(image, [image_size, image_size]) + + return ops.clip(image / 255.0, 0.0, 1.0) + + +def prepare_dataset(split): + # the validation dataset is shuffled as well, because data order matters + # for the KID calculation + return ( + tfds.load(dataset_name, split=split, shuffle_files=True) + .map(preprocess_image, num_parallel_calls=tf.data.AUTOTUNE) + .cache() + .shuffle(10 * batch_size) + .batch(batch_size, drop_remainder=True) + .prefetch(buffer_size=tf.data.AUTOTUNE) + ) + + +train_dataset = prepare_dataset("train") +val_dataset = prepare_dataset("test") + +""" +After preprocessing the training images look like the 
following: +![birds dataset](https://i.imgur.com/Ru5HgBM.png) +""" + +""" +## Kernel inception distance + +[Kernel Inception Distance (KID)](https://arxiv.org/abs/1801.01401) was proposed as a +replacement for the popular +[Frechet Inception Distance (FID)](https://arxiv.org/abs/1706.08500) +metric for measuring image generation quality. +Both metrics measure the difference in the generated and training distributions in the +representation space of an [InceptionV3](https://keras.io/api/applications/inceptionv3/) +network pretrained on +[ImageNet](https://www.tensorflow.org/datasets/catalog/imagenet2012). + +According to the paper, KID was proposed because FID has no unbiased estimator, its +expected value is higher when it is measured on fewer images. KID is more suitable for +small datasets because its expected value does not depend on the number of samples it is +measured on. In my experience it is also computationally lighter, numerically more +stable, and simpler to implement because it can be estimated in a per-batch manner. + +In this example, the images are evaluated at the minimal possible resolution of the +Inception network (75x75 instead of 299x299), and the metric is only measured on the +validation set for computational efficiency. + + +""" + + +class KID(keras.metrics.Metric): + def __init__(self, name="kid", **kwargs): + super().__init__(name=name, **kwargs) + + # KID is estimated per batch and is averaged across batches + self.kid_tracker = keras.metrics.Mean() + + # a pretrained InceptionV3 is used without its classification layer + # transform the pixel values to the 0-255 range, then use the same + # preprocessing as during pretraining + self.encoder = keras.Sequential( + [ + layers.InputLayer(input_shape=(image_size, image_size, 3)), + layers.Rescaling(255.0), + layers.Resizing(height=kid_image_size, width=kid_image_size), + layers.Lambda(keras.applications.inception_v3.preprocess_input), + keras.applications.InceptionV3( + include_top=False, + input_shape=(kid_image_size, kid_image_size, 3), + weights="imagenet", + ), + layers.GlobalAveragePooling2D(), + ], + name="inception_encoder", + ) + + def polynomial_kernel(self, features_1, features_2): + feature_dimensions = ops.cast(ops.shape(features_1)[1], "float32") + return ( + features_1 @ ops.transpose(features_2) / feature_dimensions + 1.0 + ) ** 3.0 + + def update_state(self, real_images, generated_images, sample_weight=None): + real_features = self.encoder(real_images, training=False) + generated_features = self.encoder(generated_images, training=False) + + # compute polynomial kernels using the two sets of features + kernel_real = self.polynomial_kernel(real_features, real_features) + kernel_generated = self.polynomial_kernel( + generated_features, generated_features + ) + kernel_cross = self.polynomial_kernel(real_features, generated_features) + + # estimate the squared maximum mean discrepancy using the average kernel values + batch_size = ops.shape(real_features)[0] + batch_size_f = ops.cast(batch_size, "float32") + mean_kernel_real = ops.sum(kernel_real * (1.0 - ops.eye(batch_size))) / ( + batch_size_f * (batch_size_f - 1.0) + ) + mean_kernel_generated = ops.sum( + kernel_generated * (1.0 - ops.eye(batch_size)) + ) / (batch_size_f * (batch_size_f - 1.0)) + mean_kernel_cross = ops.mean(kernel_cross) + kid = mean_kernel_real + mean_kernel_generated - 2.0 * mean_kernel_cross + + # update the average KID estimate + self.kid_tracker.update_state(kid) + + def result(self): + return self.kid_tracker.result() + + def 
reset_state(self): + self.kid_tracker.reset_state() + + +""" + +## Adaptive discriminator augmentation + +The authors of [StyleGAN2-ADA](https://arxiv.org/abs/2006.06676) propose to change the +augmentation probability adaptively during training. Though it is explained differently +in the paper, they use [integral control](https://en.wikipedia.org/wiki/PID_controller#Integral) on the augmentation +probability to keep the discriminator's accuracy on real images close to a target value. +Note, that their controlled variable is actually the average sign of the discriminator +logits (r_t in the paper), which corresponds to 2 * accuracy - 1. + +This method requires two hyperparameters: + +1. `target_accuracy`: the target value for the discriminator's accuracy on real images. I +recommend selecting its value from the 80-90% range. +2. [`integration_steps`](https://en.wikipedia.org/wiki/PID_controller#Mathematical_form): +the number of update steps required for an accuracy error of 100% to transform into an +augmentation probability increase of 100%. To give an intuition, this defines how slowly +the augmentation probability is changed. I recommend setting this to a relatively high +value (1000 in this case) so that the augmentation strength is only adjusted slowly. + +The main motivation for this procedure is that the optimal value of the target accuracy +is similar across different dataset sizes (see [figure 4 and 5 in the paper](https://arxiv.org/abs/2006.06676)), +so it does not have to be re-tuned, because the +process automatically applies stronger data augmentation when it is needed. + +""" + + +# "hard sigmoid", useful for binary accuracy calculation from logits +def step(values): + # negative values -> 0.0, positive values -> 1.0 + return 0.5 * (1.0 + ops.sign(values)) + + +# augments images with a probability that is dynamically updated during training +class AdaptiveAugmenter(keras.Model): + def __init__(self): + super().__init__() + + # stores the current probability of an image being augmented + self.probability = keras.Variable(0.0) + self.seed_generator = keras.random.SeedGenerator(42) + + # the corresponding augmentation names from the paper are shown above each layer + # the authors show (see figure 4), that the blitting and geometric augmentations + # are the most helpful in the low-data regime + self.augmenter = keras.Sequential( + [ + layers.InputLayer(input_shape=(image_size, image_size, 3)), + # blitting/x-flip: + layers.RandomFlip("horizontal"), + # blitting/integer translation: + layers.RandomTranslation( + height_factor=max_translation, + width_factor=max_translation, + interpolation="nearest", + ), + # geometric/rotation: + layers.RandomRotation(factor=max_rotation), + # geometric/isotropic and anisotropic scaling: + layers.RandomZoom( + height_factor=(-max_zoom, 0.0), width_factor=(-max_zoom, 0.0) + ), + ], + name="adaptive_augmenter", + ) + + def call(self, images, training): + if training: + augmented_images = self.augmenter(images, training=training) + + # during training either the original or the augmented images are selected + # based on self.probability + augmentation_values = keras.random.uniform( + shape=(batch_size, 1, 1, 1), seed=self.seed_generator + ) + augmentation_bools = ops.less(augmentation_values, self.probability) + + images = ops.where(augmentation_bools, augmented_images, images) + return images + + def update(self, real_logits): + current_accuracy = ops.mean(step(real_logits)) + + # the augmentation probability is updated based on the 
discriminator's + # accuracy on real images + accuracy_error = current_accuracy - target_accuracy + self.probability.assign( + ops.clip(self.probability + accuracy_error / integration_steps, 0.0, 1.0) + ) + + +""" +## Network architecture + +Here we specify the architecture of the two networks: + +* generator: maps a random vector to an image, which should be as realistic as possible +* discriminator: maps an image to a scalar score, which should be high for real and low +for generated images + +GANs tend to be sensitive to the network architecture, I implemented a DCGAN architecture +in this example, because it is relatively stable during training while being simple to +implement. We use a constant number of filters throughout the network, use a sigmoid +instead of tanh in the last layer of the generator, and use default initialization +instead of random normal as further simplifications. + +As a good practice, we disable the learnable scale parameter in the batch normalization +layers, because on one hand the following relu + convolutional layers make it redundant +(as noted in the +[documentation](https://keras.io/api/layers/normalization_layers/batch_normalization/)). +But also because it should be disabled based on theory when using [spectral normalization +(section 4.1)](https://arxiv.org/abs/1802.05957), which is not used here, but is common +in GANs. We also disable the bias in the fully connected and convolutional layers, because +the following batch normalization makes it redundant. +""" + + +# DCGAN generator +def get_generator(): + noise_input = keras.Input(shape=(noise_size,)) + x = layers.Dense(4 * 4 * width, use_bias=False)(noise_input) + x = layers.BatchNormalization(scale=False)(x) + x = layers.ReLU()(x) + x = layers.Reshape(target_shape=(4, 4, width))(x) + for _ in range(depth - 1): + x = layers.Conv2DTranspose( + width, + kernel_size=4, + strides=2, + padding="same", + use_bias=False, + )(x) + x = layers.BatchNormalization(scale=False)(x) + x = layers.ReLU()(x) + image_output = layers.Conv2DTranspose( + 3, + kernel_size=4, + strides=2, + padding="same", + activation="sigmoid", + )(x) + + return keras.Model(noise_input, image_output, name="generator") + + +# DCGAN discriminator +def get_discriminator(): + image_input = keras.Input(shape=(image_size, image_size, 3)) + x = image_input + for _ in range(depth): + x = layers.Conv2D( + width, + kernel_size=4, + strides=2, + padding="same", + use_bias=False, + )(x) + x = layers.BatchNormalization(scale=False)(x) + x = layers.LeakyReLU(alpha=leaky_relu_slope)(x) + x = layers.Flatten()(x) + x = layers.Dropout(dropout_rate)(x) + output_score = layers.Dense(1)(x) + + return keras.Model(image_input, output_score, name="discriminator") + + +""" +## GAN model +""" + + +class GAN_ADA(keras.Model): + def __init__(self): + super().__init__() + + self.seed_generator = keras.random.SeedGenerator(seed=42) + self.augmenter = AdaptiveAugmenter() + self.generator = get_generator() + self.ema_generator = keras.models.clone_model(self.generator) + self.discriminator = get_discriminator() + + self.generator.summary() + self.discriminator.summary() + # we have created all layers at this point, so we can mark the model + # as having been built + self.built = True + + def compile(self, generator_optimizer, discriminator_optimizer, **kwargs): + super().compile(**kwargs) + + # separate optimizers for the two networks + self.generator_optimizer = generator_optimizer + self.discriminator_optimizer = discriminator_optimizer + + self.generator_loss_tracker 
= keras.metrics.Mean(name="g_loss") + self.discriminator_loss_tracker = keras.metrics.Mean(name="d_loss") + self.real_accuracy = keras.metrics.BinaryAccuracy(name="real_acc") + self.generated_accuracy = keras.metrics.BinaryAccuracy(name="gen_acc") + self.augmentation_probability_tracker = keras.metrics.Mean(name="aug_p") + self.kid = KID() + + @property + def metrics(self): + return [ + self.generator_loss_tracker, + self.discriminator_loss_tracker, + self.real_accuracy, + self.generated_accuracy, + self.augmentation_probability_tracker, + self.kid, + ] + + def generate(self, batch_size, training): + latent_samples = keras.random.normal( + shape=(batch_size, noise_size), seed=self.seed_generator + ) + # use ema_generator during inference + if training: + generated_images = self.generator(latent_samples, training=training) + else: + generated_images = self.ema_generator(latent_samples, training=training) + return generated_images + + def adversarial_loss(self, real_logits, generated_logits): + # this is usually called the non-saturating GAN loss + + real_labels = ops.ones(shape=(batch_size, 1)) + generated_labels = ops.zeros(shape=(batch_size, 1)) + + # the generator tries to produce images that the discriminator considers as real + generator_loss = keras.losses.binary_crossentropy( + real_labels, generated_logits, from_logits=True + ) + # the discriminator tries to determine if images are real or generated + discriminator_loss = keras.losses.binary_crossentropy( + ops.concatenate([real_labels, generated_labels], axis=0), + ops.concatenate([real_logits, generated_logits], axis=0), + from_logits=True, + ) + + return ops.mean(generator_loss), ops.mean(discriminator_loss) + + def train_step(self, real_images): + real_images = self.augmenter(real_images, training=True) + + # use persistent gradient tape because gradients will be calculated twice + with tf.GradientTape(persistent=True) as tape: + generated_images = self.generate(batch_size, training=True) + # gradient is calculated through the image augmentation + generated_images = self.augmenter(generated_images, training=True) + + # separate forward passes for the real and generated images, meaning + # that batch normalization is applied separately + real_logits = self.discriminator(real_images, training=True) + generated_logits = self.discriminator(generated_images, training=True) + + generator_loss, discriminator_loss = self.adversarial_loss( + real_logits, generated_logits + ) + + # calculate gradients and update weights + generator_gradients = tape.gradient( + generator_loss, self.generator.trainable_weights + ) + discriminator_gradients = tape.gradient( + discriminator_loss, self.discriminator.trainable_weights + ) + self.generator_optimizer.apply_gradients( + zip(generator_gradients, self.generator.trainable_weights) + ) + self.discriminator_optimizer.apply_gradients( + zip(discriminator_gradients, self.discriminator.trainable_weights) + ) + + # update the augmentation probability based on the discriminator's performance + self.augmenter.update(real_logits) + + self.generator_loss_tracker.update_state(generator_loss) + self.discriminator_loss_tracker.update_state(discriminator_loss) + self.real_accuracy.update_state(1.0, step(real_logits)) + self.generated_accuracy.update_state(0.0, step(generated_logits)) + self.augmentation_probability_tracker.update_state(self.augmenter.probability) + + # track the exponential moving average of the generator's weights to decrease + # variance in the generation quality + for weight, ema_weight in zip( 
+ self.generator.weights, self.ema_generator.weights + ): + ema_weight.assign(ema * ema_weight + (1 - ema) * weight) + + # KID is not measured during the training phase for computational efficiency + return {m.name: m.result() for m in self.metrics[:-1]} + + def test_step(self, real_images): + generated_images = self.generate(batch_size, training=False) + + self.kid.update_state(real_images, generated_images) + + # only KID is measured during the evaluation phase for computational efficiency + return {self.kid.name: self.kid.result()} + + def plot_images(self, epoch=None, logs=None, num_rows=3, num_cols=6, interval=5): + # plot random generated images for visual evaluation of generation quality + if epoch is None or (epoch + 1) % interval == 0: + num_images = num_rows * num_cols + generated_images = self.generate(num_images, training=False) + + plt.figure(figsize=(num_cols * 2.0, num_rows * 2.0)) + for row in range(num_rows): + for col in range(num_cols): + index = row * num_cols + col + plt.subplot(num_rows, num_cols, index + 1) + plt.imshow(generated_images[index]) + plt.axis("off") + plt.tight_layout() + plt.show() + plt.close() + + +""" +## Training + +One can should see from the metrics during training, that if the real accuracy +(discriminator's accuracy on real images) is below the target accuracy, the augmentation +probability is increased, and vice versa. In my experience, during a healthy GAN +training, the discriminator accuracy should stay in the 80-95% range. Below that, the +discriminator is too weak, above that it is too strong. + +Note that we track the exponential moving average of the generator's weights, and use that +for image generation and KID evaluation. +""" + +# create and compile the model +model = GAN_ADA() +model.compile( + generator_optimizer=keras.optimizers.Adam(learning_rate, beta_1), + discriminator_optimizer=keras.optimizers.Adam(learning_rate, beta_1), +) + +# save the best model based on the validation KID metric +checkpoint_path = "gan_model.weights.h5" +checkpoint_callback = keras.callbacks.ModelCheckpoint( + filepath=checkpoint_path, + save_weights_only=True, + monitor="val_kid", + mode="min", + save_best_only=True, +) + +# run training and plot generated images periodically +model.fit( + train_dataset, + epochs=num_epochs, + validation_data=val_dataset, + callbacks=[ + keras.callbacks.LambdaCallback(on_epoch_end=model.plot_images), + checkpoint_callback, + ], +) + +""" +## Inference +""" + +# load the best model and generate images +model.load_weights(checkpoint_path) +model.plot_images() + +""" +## Results + +By running the training for 400 epochs (which takes 2-3 hours in a Colab notebook), one +can get high quality image generations using this code example. + +The evolution of a random batch of images over a 400 epoch training (ema=0.999 for +animation smoothness): +![birds evolution gif](https://i.imgur.com/ecGuCcz.gif) + +Latent-space interpolation between a batch of selected images: +![birds interpolation gif](https://i.imgur.com/nGvzlsC.gif) + +I also recommend trying out training on other datasets, such as +[CelebA](https://www.tensorflow.org/datasets/catalog/celeb_a) for example. In my +experience good results can be achieved without changing any hyperparameters (though +discriminator augmentation might not be necessary). +""" + +""" +## GAN tips and tricks + +My goal with this example was to find a good tradeoff between ease of implementation and +generation quality for GANs. 
During preparation, I have run numerous ablations using
+[this repository](https://github.com/beresandras/gan-flavours-keras).
+
+In this section I list the lessons learned and my recommendations, in my subjective
+order of importance.
+
+I recommend checking out the [DCGAN paper](https://arxiv.org/abs/1511.06434), this
+[NeurIPS talk](https://www.youtube.com/watch?v=myGAju4L7O8), and this
+[large scale GAN study](https://arxiv.org/abs/1711.10337) for others' takes on this subject.
+
+### Architectural tips
+
+* **resolution**: Training GANs at higher resolutions tends to be more difficult, so I
+recommend experimenting at 32x32 or 64x64 resolution initially.
+* **initialization**: If you see strong colorful patterns early on in the training, the
+initialization might be the issue. Set the kernel_initializer parameters of layers to
+[random normal](https://keras.io/api/layers/initializers/#randomnormal-class), and
+decrease the standard deviation (recommended value: 0.02, following DCGAN) until the
+issue disappears.
+* **upsampling**: There are two main methods for upsampling in the generator.
+[Transposed convolution](https://keras.io/api/layers/convolution_layers/convolution2d_transpose/)
+is faster, but can lead to
+[checkerboard artifacts](https://distill.pub/2016/deconv-checkerboard/), which can be reduced by using
+a kernel size that is divisible by the stride (the recommended kernel size is 4 for a stride of 2).
+[Upsampling](https://keras.io/api/layers/reshaping_layers/up_sampling2d/) +
+[standard convolution](https://keras.io/api/layers/convolution_layers/convolution2d/) can have slightly
+lower quality, but checkerboard artifacts are not an issue. I recommend using nearest-neighbor
+interpolation over bilinear for it.
+* **batch normalization in discriminator**: It sometimes has a high impact; I recommend
+trying it both ways.
+* **[spectral normalization](https://www.tensorflow.org/addons/api_docs/python/tfa/layers/SpectralNormalization)**:
+A popular technique for training GANs that can help with stability. I recommend
+disabling batch normalization's learnable scale parameters along with it.
+* **[residual connections](https://keras.io/guides/functional_api/#a-toy-resnet-model)**:
+While residual discriminators behave similarly, residual generators are more difficult to
+train in my experience. They are, however, necessary for training large and deep
+architectures. I recommend starting with non-residual architectures.
+* **dropout**: Using dropout before the last layer of the discriminator improves
+generation quality in my experience. The recommended dropout rate is below 0.5.
+* **[leaky ReLU](https://keras.io/api/layers/activation_layers/leaky_relu/)**: Use leaky
+ReLU activations in the discriminator to make its gradients less sparse. The recommended
+slope/alpha is 0.2, following DCGAN.
+
+### Algorithmic tips
+
+* **loss functions**: Numerous losses have been proposed over the years for training
+GANs, promising improved performance and stability. I have implemented 5 of them in
+[this repository](https://github.com/beresandras/gan-flavours-keras), and my experience is in
+line with [this GAN study](https://arxiv.org/abs/1711.10337): no loss seems to
+consistently outperform the default non-saturating GAN loss. I recommend using that as a
+default.
+* **Adam's beta_1 parameter**: The beta_1 parameter in Adam can be interpreted as the
+momentum of the mean gradient estimate. Using 0.5 or even 0.0 instead of the default 0.9
+value was proposed in DCGAN and is important.
This example would not work using its +default value. +* **separate batch normalization for generated and real images**: The forward pass of the +discriminator should be separate for the generated and real images. Doing otherwise can +lead to artifacts (45 degree stripes in my case) and decreased performance. +* **exponential moving average of generator's weights**: This helps to reduce the +variance of the KID measurement, and helps in averaging out the rapid color palette +changes during training. +* **[different learning rate for generator and discriminator](https://arxiv.org/abs/1706.08500)**: +If one has the resources, it can help +to tune the learning rates of the two networks separately. A similar idea is to update +either network's (usually the discriminator's) weights multiple times for each of the +other network's updates. I recommend using the same learning rate of 2e-4 (Adam), +following DCGAN for both networks, and only updating both of them once as a default. +* **label noise**: [One-sided label smoothing](https://arxiv.org/abs/1606.03498) (using +less than 1.0 for real labels), or adding noise to the labels can regularize the +discriminator not to get overconfident, however in my case they did not improve +performance. +* **adaptive data augmentation**: Since it adds another dynamic component to the training +process, disable it as a default, and only enable it when the other components already +work well. +""" + +""" +## Related works + +Other GAN-related Keras code examples: + +* [DCGAN + CelebA](https://keras.io/examples/generative/dcgan_overriding_train_step/) +* [WGAN + FashionMNIST](https://keras.io/examples/generative/wgan_gp/) +* [WGAN + Molecules](https://keras.io/examples/generative/wgan-graphs/) +* [ConditionalGAN + MNIST](https://keras.io/examples/generative/conditional_gan/) +* [CycleGAN + Horse2Zebra](https://keras.io/examples/generative/cyclegan/) +* [StyleGAN](https://keras.io/examples/generative/stylegan/) + +Modern GAN architecture-lines: + +* [SAGAN](https://arxiv.org/abs/1805.08318), [BigGAN](https://arxiv.org/abs/1809.11096) +* [ProgressiveGAN](https://arxiv.org/abs/1710.10196), +[StyleGAN](https://arxiv.org/abs/1812.04948), +[StyleGAN2](https://arxiv.org/abs/1912.04958), +[StyleGAN2-ADA](https://arxiv.org/abs/2006.06676), +[AliasFreeGAN](https://arxiv.org/abs/2106.12423) + +Concurrent papers on discriminator data augmentation: +[1](https://arxiv.org/abs/2006.02595), [2](https://arxiv.org/abs/2006.05338), [3](https://arxiv.org/abs/2006.10738) + +Recent literature overview on GANs: [talk](https://www.youtube.com/watch?v=3ktD752xq5k) +""" diff --git a/knowledge_base/generative/gaugan.py b/knowledge_base/generative/gaugan.py new file mode 100644 index 0000000000000000000000000000000000000000..3c45f6795dfcfa432b73757c178ba913f786b39a --- /dev/null +++ b/knowledge_base/generative/gaugan.py @@ -0,0 +1,845 @@ +""" +Title: GauGAN for conditional image generation +Author: [Soumik Rakshit](https://github.com/soumik12345), [Sayak Paul](https://twitter.com/RisingSayak) +Date created: 2021/12/26 +Last modified: 2022/01/03 +Description: Implementing a GauGAN for conditional image generation. +Accelerator: GPU +""" + +""" +## Introduction + +In this example, we present an implementation of the GauGAN architecture proposed in +[Semantic Image Synthesis with Spatially-Adaptive Normalization](https://arxiv.org/abs/1903.07291). 
+
+Briefly, GauGAN uses a Generative Adversarial Network (GAN) to generate realistic images
+that are conditioned on cue images and segmentation maps, as shown below
+([image source](https://nvlabs.github.io/SPADE/)):
+
+![](https://i.ibb.co/p305dzv/image.png)
+
+The main components of a GauGAN are:
+
+- **SPADE (aka spatially-adaptive normalization)**: The authors of GauGAN argue that the
+more conventional normalization layers (such as
+[Batch Normalization](https://arxiv.org/abs/1502.03167))
+destroy the semantic information obtained from segmentation maps that
+are provided as inputs. To address this problem, the authors introduce SPADE, a
+normalization layer particularly suitable for learning affine parameters (scale and bias)
+that are spatially adaptive. This is done by learning different sets of scaling and
+bias parameters for each semantic label.
+- **Variational encoder**: Inspired by
+[Variational Autoencoders](https://arxiv.org/abs/1312.6114), GauGAN uses a
+variational formulation wherein an encoder learns the mean and variance of a
+normal (Gaussian) distribution from the cue images. This is where GauGAN gets its name
+from. The generator of GauGAN takes as inputs the latents sampled from the Gaussian
+distribution as well as the one-hot encoded semantic segmentation label maps. The cue
+images act as style images that guide the generator to stylistic generation. This
+variational formulation helps GauGAN achieve image diversity as well as fidelity.
+- **Multi-scale patch discriminator**: Inspired by the
+[PatchGAN](https://paperswithcode.com/method/patchgan) model,
+GauGAN uses a discriminator that assesses a given image on a patch basis
+and produces an averaged score.
+
+As we proceed with the example, we will discuss each of the different
+components in further detail.
+
+For a thorough review of GauGAN, please refer to
+[this article](https://blog.paperspace.com/nvidia-gaugan-introduction/).
+We also encourage you to check out
+[the official GauGAN website](https://nvlabs.github.io/SPADE/), which
+has many creative applications of GauGAN. This example assumes that the reader is already
+familiar with the fundamental concepts of GANs. If you need a refresher, the following
+resources might be useful:
+
+* [Chapter on GANs](https://livebook.manning.com/book/deep-learning-with-python/chapter-8)
+from the Deep Learning with Python book by François Chollet.
+* GAN implementations on keras.io:
+
+    * [Data efficient GANs](https://keras.io/examples/generative/gan_ada)
+    * [CycleGAN](https://keras.io/examples/generative/cyclegan)
+    * [Conditional GAN](https://keras.io/examples/generative/conditional_gan)
+"""
+
+"""
+## Data collection
+
+We will be using the
+[Facades dataset](https://cmp.felk.cvut.cz/~tylecr1/facade/)
+for training our GauGAN model. Let's first download it.
+""" + +"""shell +wget https://drive.google.com/uc?id=1q4FEjQg1YSb4mPx2VdxL7LXKYu3voTMj -O facades_data.zip +unzip -q facades_data.zip +""" + +""" +## Imports +""" +import os + +os.environ["KERAS_BACKEND"] = "tensorflow" + + +import numpy as np +import matplotlib.pyplot as plt + +import tensorflow as tf +import keras +from keras import ops +from keras import layers + +from glob import glob + +""" +## Data splitting +""" + +PATH = "./facades_data/" +SPLIT = 0.2 + +files = glob(PATH + "*.jpg") +np.random.shuffle(files) + +split_index = int(len(files) * (1 - SPLIT)) +train_files = files[:split_index] +val_files = files[split_index:] + +print(f"Total samples: {len(files)}.") +print(f"Total training samples: {len(train_files)}.") +print(f"Total validation samples: {len(val_files)}.") + +""" +## Data loader +""" + +BATCH_SIZE = 4 +IMG_HEIGHT = IMG_WIDTH = 256 +NUM_CLASSES = 12 +AUTOTUNE = tf.data.AUTOTUNE + + +def load(image_files, batch_size, is_train=True): + def _random_crop( + segmentation_map, + image, + labels, + crop_size=(IMG_HEIGHT, IMG_WIDTH), + ): + crop_size = tf.convert_to_tensor(crop_size) + image_shape = tf.shape(image)[:2] + margins = image_shape - crop_size + y1 = tf.random.uniform(shape=(), maxval=margins[0], dtype=tf.int32) + x1 = tf.random.uniform(shape=(), maxval=margins[1], dtype=tf.int32) + y2 = y1 + crop_size[0] + x2 = x1 + crop_size[1] + + cropped_images = [] + images = [segmentation_map, image, labels] + for img in images: + cropped_images.append(img[y1:y2, x1:x2]) + return cropped_images + + def _load_data_tf(image_file, segmentation_map_file, label_file): + image = tf.image.decode_png(tf.io.read_file(image_file), channels=3) + segmentation_map = tf.image.decode_png( + tf.io.read_file(segmentation_map_file), channels=3 + ) + labels = tf.image.decode_bmp(tf.io.read_file(label_file), channels=0) + labels = tf.squeeze(labels) + + image = tf.cast(image, tf.float32) / 127.5 - 1 + segmentation_map = tf.cast(segmentation_map, tf.float32) / 127.5 - 1 + return segmentation_map, image, labels + + def _one_hot(segmentation_maps, real_images, labels): + labels = tf.one_hot(labels, NUM_CLASSES) + labels.set_shape((None, None, NUM_CLASSES)) + return segmentation_maps, real_images, labels + + segmentation_map_files = [ + image_file.replace("images", "segmentation_map").replace("jpg", "png") + for image_file in image_files + ] + label_files = [ + image_file.replace("images", "segmentation_labels").replace("jpg", "bmp") + for image_file in image_files + ] + dataset = tf.data.Dataset.from_tensor_slices( + (image_files, segmentation_map_files, label_files) + ) + + dataset = dataset.shuffle(batch_size * 10) if is_train else dataset + dataset = dataset.map(_load_data_tf, num_parallel_calls=AUTOTUNE) + dataset = dataset.map(_random_crop, num_parallel_calls=AUTOTUNE) + dataset = dataset.map(_one_hot, num_parallel_calls=AUTOTUNE) + dataset = dataset.batch(batch_size, drop_remainder=True) + return dataset + + +train_dataset = load(train_files, batch_size=BATCH_SIZE, is_train=True) +val_dataset = load(val_files, batch_size=BATCH_SIZE, is_train=False) + +""" +Now, let's visualize a few samples from the training set. +""" + +sample_train_batch = next(iter(train_dataset)) +print(f"Segmentation map batch shape: {sample_train_batch[0].shape}.") +print(f"Image batch shape: {sample_train_batch[1].shape}.") +print(f"One-hot encoded label map shape: {sample_train_batch[2].shape}.") + +# Plot a view samples from the training set. 
+for segmentation_map, real_image in zip(sample_train_batch[0], sample_train_batch[1]): + fig = plt.figure(figsize=(10, 10)) + fig.add_subplot(1, 2, 1).set_title("Segmentation Map") + plt.imshow((segmentation_map + 1) / 2) + fig.add_subplot(1, 2, 2).set_title("Real Image") + plt.imshow((real_image + 1) / 2) + plt.show() + +""" +Note that in the rest of this example, we use a couple of figures from the +[original GauGAN paper](https://arxiv.org/abs/1903.07291) for convenience. +""" + +""" +## Custom layers + +In the following section, we implement the following layers: + +* SPADE +* Residual block including SPADE +* Gaussian sampler +""" + +""" +### Some more notes on SPADE + +![](https://i.imgur.com/DgMWrrs.png) + +**SPatially-Adaptive (DE) normalization** or **SPADE** is a simple but effective layer +for synthesizing photorealistic images given an input semantic layout. Previous methods +for conditional image generation from semantic input such as +Pix2Pix ([Isola et al.](https://arxiv.org/abs/1611.07004)) +or Pix2PixHD ([Wang et al.](https://arxiv.org/abs/1711.11585)) +directly feed the semantic layout as input to the deep network, which is then processed +through stacks of convolution, normalization, and nonlinearity layers. This is often +suboptimal as the normalization layers have a tendency to wash away semantic information. + +In SPADE, the segmentation mask is first projected onto an embedding space, and then +convolved to produce the modulation parameters `ฮณ` and `ฮฒ`. Unlike prior conditional +normalization methods, `ฮณ` and `ฮฒ` are not vectors, but tensors with spatial dimensions. +The produced `ฮณ` and `ฮฒ` are multiplied and added to the normalized activation +element-wise. As the modulation parameters are adaptive to the input segmentation mask, +SPADE is better suited for semantic image synthesis. 
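+
+To make the difference from ordinary conditional normalization concrete, the shape-level
+sketch below (random placeholder tensors, not part of the model) shows that SPADE's
+`gamma` and `beta` carry spatial dimensions, whereas a classic conditional normalization
+layer uses a single scale and bias per channel:
+"""
+
+# Shape-level illustration only: contrast per-channel modulation with SPADE-style
+# spatially-adaptive modulation. The values are random placeholders.
+batch, height, width, channels = 2, 8, 8, 16
+normalized = np.random.randn(batch, height, width, channels)
+
+# Classic conditional normalization: one scale/bias value per channel.
+gamma_vec = np.random.randn(channels)
+beta_vec = np.random.randn(channels)
+out_classic = gamma_vec * normalized + beta_vec  # broadcasts over (batch, H, W)
+
+# SPADE: scale/bias maps with spatial extent, so the label layout is preserved.
+gamma_map = np.random.randn(batch, height, width, channels)
+beta_map = np.random.randn(batch, height, width, channels)
+out_spade = gamma_map * normalized + beta_map
+print(out_classic.shape, out_spade.shape)
+
+"""
+The `SPADE` layer implemented below predicts such spatial `gamma` and `beta` maps from
+the segmentation mask using a small stack of convolutions.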
+""" + + +class SPADE(layers.Layer): + def __init__(self, filters, epsilon=1e-5, **kwargs): + super().__init__(**kwargs) + self.epsilon = epsilon + self.conv = layers.Conv2D(128, 3, padding="same", activation="relu") + self.conv_gamma = layers.Conv2D(filters, 3, padding="same") + self.conv_beta = layers.Conv2D(filters, 3, padding="same") + + def build(self, input_shape): + self.resize_shape = input_shape[1:3] + + def call(self, input_tensor, raw_mask): + mask = ops.image.resize(raw_mask, self.resize_shape, interpolation="nearest") + x = self.conv(mask) + gamma = self.conv_gamma(x) + beta = self.conv_beta(x) + mean, var = ops.moments(input_tensor, axes=(0, 1, 2), keepdims=True) + std = ops.sqrt(var + self.epsilon) + normalized = (input_tensor - mean) / std + output = gamma * normalized + beta + return output + + +class ResBlock(layers.Layer): + def __init__(self, filters, **kwargs): + super().__init__(**kwargs) + self.filters = filters + + def build(self, input_shape): + input_filter = input_shape[-1] + self.spade_1 = SPADE(input_filter) + self.spade_2 = SPADE(self.filters) + self.conv_1 = layers.Conv2D(self.filters, 3, padding="same") + self.conv_2 = layers.Conv2D(self.filters, 3, padding="same") + self.learned_skip = False + + if self.filters != input_filter: + self.learned_skip = True + self.spade_3 = SPADE(input_filter) + self.conv_3 = layers.Conv2D(self.filters, 3, padding="same") + + def call(self, input_tensor, mask): + x = self.spade_1(input_tensor, mask) + x = self.conv_1(keras.activations.leaky_relu(x, 0.2)) + x = self.spade_2(x, mask) + x = self.conv_2(keras.activations.leaky_relu(x, 0.2)) + skip = ( + self.conv_3( + keras.activations.leaky_relu(self.spade_3(input_tensor, mask), 0.2) + ) + if self.learned_skip + else input_tensor + ) + output = skip + x + return output + + +class GaussianSampler(layers.Layer): + def __init__(self, batch_size, latent_dim, **kwargs): + super().__init__(**kwargs) + self.batch_size = batch_size + self.latent_dim = latent_dim + self.seed_generator = keras.random.SeedGenerator(1337) + + def call(self, inputs): + means, variance = inputs + epsilon = keras.random.normal( + shape=(self.batch_size, self.latent_dim), + mean=0.0, + stddev=1.0, + seed=self.seed_generator, + ) + samples = means + ops.exp(0.5 * variance) * epsilon + return samples + + +""" +Next, we implement the downsampling block for the encoder. +""" + + +def downsample( + channels, + kernels, + strides=2, + apply_norm=True, + apply_activation=True, + apply_dropout=False, +): + block = keras.Sequential() + block.add( + layers.Conv2D( + channels, + kernels, + strides=strides, + padding="same", + use_bias=False, + kernel_initializer=keras.initializers.GlorotNormal(), + ) + ) + if apply_norm: + block.add(layers.GroupNormalization(groups=-1)) + if apply_activation: + block.add(layers.LeakyReLU(0.2)) + if apply_dropout: + block.add(layers.Dropout(0.5)) + return block + + +""" +The GauGAN encoder consists of a few downsampling blocks. It outputs the mean and +variance of a distribution. 
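+
+A note on naming: although the second output is called `variance`, both the
+`GaussianSampler` defined above and the KL divergence loss used later in this example
+treat it as the log-variance of the Gaussian, which is why the sampler computes
+`exp(0.5 * variance)`. As a small standalone sketch (NumPy only, illustrative values),
+the underlying reparameterization trick looks like this:
+"""
+
+# Reparameterization sketch (illustrative only): z = mean + std * eps, where the
+# standard deviation is recovered from a predicted log-variance.
+demo_mean = np.zeros((1, 4))
+demo_log_var = np.log(np.full((1, 4), 0.25))  # corresponds to std = 0.5
+demo_eps = np.random.randn(1, 4)
+demo_z = demo_mean + np.exp(0.5 * demo_log_var) * demo_eps
+print(demo_z.shape)  # (1, 4); in the model, gradients flow to mean and log-variance here
+
+"""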
+ +![](https://i.imgur.com/JgAv1EW.png) + +""" + + +def build_encoder(image_shape, encoder_downsample_factor=64, latent_dim=256): + input_image = keras.Input(shape=image_shape) + x = downsample(encoder_downsample_factor, 3, apply_norm=False)(input_image) + x = downsample(2 * encoder_downsample_factor, 3)(x) + x = downsample(4 * encoder_downsample_factor, 3)(x) + x = downsample(8 * encoder_downsample_factor, 3)(x) + x = downsample(8 * encoder_downsample_factor, 3)(x) + x = layers.Flatten()(x) + mean = layers.Dense(latent_dim, name="mean")(x) + variance = layers.Dense(latent_dim, name="variance")(x) + return keras.Model(input_image, [mean, variance], name="encoder") + + +""" +Next, we implement the generator, which consists of the modified residual blocks and +upsampling blocks. It takes latent vectors and one-hot encoded segmentation labels, and +produces new images. + +![](https://i.imgur.com/9iP1TsB.png) + +With SPADE, there is no need to feed the segmentation map to the first layer of the +generator, since the latent inputs have enough structural information about the style we +want the generator to emulate. We also discard the encoder part of the generator, which is +commonly used in prior architectures. This results in a more lightweight +generator network, which can also take a random vector as input, enabling a simple and +natural path to multi-modal synthesis. +""" + + +def build_generator(mask_shape, latent_dim=256): + latent = keras.Input(shape=(latent_dim,)) + mask = keras.Input(shape=mask_shape) + x = layers.Dense(16384)(latent) + x = layers.Reshape((4, 4, 1024))(x) + x = ResBlock(filters=1024)(x, mask) + x = layers.UpSampling2D((2, 2))(x) + x = ResBlock(filters=1024)(x, mask) + x = layers.UpSampling2D((2, 2))(x) + x = ResBlock(filters=1024)(x, mask) + x = layers.UpSampling2D((2, 2))(x) + x = ResBlock(filters=512)(x, mask) + x = layers.UpSampling2D((2, 2))(x) + x = ResBlock(filters=256)(x, mask) + x = layers.UpSampling2D((2, 2))(x) + x = ResBlock(filters=128)(x, mask) + x = layers.UpSampling2D((2, 2))(x) + x = keras.activations.leaky_relu(x, 0.2) + output_image = keras.activations.tanh(layers.Conv2D(3, 4, padding="same")(x)) + return keras.Model([latent, mask], output_image, name="generator") + + +""" +The discriminator takes a segmentation map and an image and concatenates them. It +then predicts if patches of the concatenated image are real or fake. + +![](https://i.imgur.com/rn71PlM.png) +""" + + +def build_discriminator(image_shape, downsample_factor=64): + input_image_A = keras.Input(shape=image_shape, name="discriminator_image_A") + input_image_B = keras.Input(shape=image_shape, name="discriminator_image_B") + x = layers.Concatenate()([input_image_A, input_image_B]) + x1 = downsample(downsample_factor, 4, apply_norm=False)(x) + x2 = downsample(2 * downsample_factor, 4)(x1) + x3 = downsample(4 * downsample_factor, 4)(x2) + x4 = downsample(8 * downsample_factor, 4, strides=1)(x3) + x5 = layers.Conv2D(1, 4)(x4) + outputs = [x1, x2, x3, x4, x5] + return keras.Model([input_image_A, input_image_B], outputs) + + +""" +## Loss functions + +GauGAN uses the following loss functions: + +* Generator: + + * Expectation over the discriminator predictions. + * [KL divergence](https://en.wikipedia.org/wiki/Kullback%E2%80%93Leibler_divergence) + for learning the mean and variance predicted by the encoder. + * Minimization between the discriminator predictions on original and generated + images to align the feature space of the generator. 
+ * [Perceptual loss](https://arxiv.org/abs/1603.08155) for encouraging the generated + images to have perceptual quality. + +* Discriminator: + + * [Hinge loss](https://en.wikipedia.org/wiki/Hinge_loss). +""" + + +def generator_loss(y): + return -ops.mean(y) + + +def kl_divergence_loss(mean, variance): + return -0.5 * ops.sum(1 + variance - ops.square(mean) - ops.exp(variance)) + + +class FeatureMatchingLoss(keras.losses.Loss): + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.mae = keras.losses.MeanAbsoluteError() + + def call(self, y_true, y_pred): + loss = 0 + for i in range(len(y_true) - 1): + loss += self.mae(y_true[i], y_pred[i]) + return loss + + +class VGGFeatureMatchingLoss(keras.losses.Loss): + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.encoder_layers = [ + "block1_conv1", + "block2_conv1", + "block3_conv1", + "block4_conv1", + "block5_conv1", + ] + self.weights = [1.0 / 32, 1.0 / 16, 1.0 / 8, 1.0 / 4, 1.0] + vgg = keras.applications.VGG19(include_top=False, weights="imagenet") + layer_outputs = [vgg.get_layer(x).output for x in self.encoder_layers] + self.vgg_model = keras.Model(vgg.input, layer_outputs, name="VGG") + self.mae = keras.losses.MeanAbsoluteError() + + def call(self, y_true, y_pred): + y_true = keras.applications.vgg19.preprocess_input(127.5 * (y_true + 1)) + y_pred = keras.applications.vgg19.preprocess_input(127.5 * (y_pred + 1)) + real_features = self.vgg_model(y_true) + fake_features = self.vgg_model(y_pred) + loss = 0 + for i in range(len(real_features)): + loss += self.weights[i] * self.mae(real_features[i], fake_features[i]) + return loss + + +class DiscriminatorLoss(keras.losses.Loss): + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.hinge_loss = keras.losses.Hinge() + + def call(self, y, is_real): + return self.hinge_loss(is_real, y) + + +""" +## GAN monitor callback + +Next, we implement a callback to monitor the GauGAN results while it is training. +""" + + +class GanMonitor(keras.callbacks.Callback): + def __init__(self, val_dataset, n_samples, epoch_interval=5): + self.val_images = next(iter(val_dataset)) + self.n_samples = n_samples + self.epoch_interval = epoch_interval + self.seed_generator = keras.random.SeedGenerator(42) + + def infer(self): + latent_vector = keras.random.normal( + shape=(self.model.batch_size, self.model.latent_dim), + mean=0.0, + stddev=2.0, + seed=self.seed_generator, + ) + return self.model.predict([latent_vector, self.val_images[2]]) + + def on_epoch_end(self, epoch, logs=None): + if epoch % self.epoch_interval == 0: + generated_images = self.infer() + for _ in range(self.n_samples): + grid_row = min(generated_images.shape[0], 3) + f, axarr = plt.subplots(grid_row, 3, figsize=(18, grid_row * 6)) + for row in range(grid_row): + ax = axarr if grid_row == 1 else axarr[row] + ax[0].imshow((self.val_images[0][row] + 1) / 2) + ax[0].axis("off") + ax[0].set_title("Mask", fontsize=20) + ax[1].imshow((self.val_images[1][row] + 1) / 2) + ax[1].axis("off") + ax[1].set_title("Ground Truth", fontsize=20) + ax[2].imshow((generated_images[row] + 1) / 2) + ax[2].axis("off") + ax[2].set_title("Generated", fontsize=20) + plt.show() + + +""" +## Subclassed GauGAN model + +Finally, we put everything together inside a subclassed model (from `tf.keras.Model`) +overriding its `train_step()` method. 
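+
+Before reading the full class, it may help to see the general shape of this pattern in
+isolation. The toy model below is purely illustrative (it is unrelated to GauGAN's
+networks and losses): a subclassed `keras.Model` owns two sub-networks and updates each
+one with its own optimizer inside an overridden `train_step()`. The GauGAN class that
+follows does the same thing with its generator, discriminator, and encoder.
+"""
+
+
+# Generic two-optimizer `train_step()` sketch (illustrative only, not used below).
+class TwoOptimizerToy(keras.Model):
+    def __init__(self):
+        super().__init__()
+        self.net_a = keras.Sequential(
+            [layers.Dense(8, activation="relu"), layers.Dense(1)]
+        )
+        self.net_b = keras.Sequential(
+            [layers.Dense(8, activation="relu"), layers.Dense(1)]
+        )
+
+    def compile(self, optimizer_a, optimizer_b, **kwargs):
+        super().compile(**kwargs)
+        self.optimizer_a = optimizer_a
+        self.optimizer_b = optimizer_b
+
+    def train_step(self, data):
+        x, y = data
+        # Each sub-network gets its own tape, loss, and optimizer update.
+        with tf.GradientTape() as tape_a:
+            loss_a = ops.mean(ops.square(self.net_a(x) - y))
+        grads_a = tape_a.gradient(loss_a, self.net_a.trainable_variables)
+        self.optimizer_a.apply_gradients(zip(grads_a, self.net_a.trainable_variables))
+
+        with tf.GradientTape() as tape_b:
+            loss_b = ops.mean(ops.square(self.net_b(x) - y))
+        grads_b = tape_b.gradient(loss_b, self.net_b.trainable_variables)
+        self.optimizer_b.apply_gradients(zip(grads_b, self.net_b.trainable_variables))
+        return {"loss_a": loss_a, "loss_b": loss_b}
+
+
+# Uncomment to try the toy model on random data:
+# toy = TwoOptimizerToy()
+# toy.compile(keras.optimizers.Adam(1e-3), keras.optimizers.Adam(1e-3))
+# toy.fit(np.random.randn(64, 4), np.random.randn(64, 1), verbose=0)
+
+"""
+With that pattern in mind, here is the full GauGAN model.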
+""" + + +class GauGAN(keras.Model): + def __init__( + self, + image_size, + num_classes, + batch_size, + latent_dim, + feature_loss_coeff=10, + vgg_feature_loss_coeff=0.1, + kl_divergence_loss_coeff=0.1, + **kwargs, + ): + super().__init__(**kwargs) + + self.image_size = image_size + self.latent_dim = latent_dim + self.batch_size = batch_size + self.num_classes = num_classes + self.image_shape = (image_size, image_size, 3) + self.mask_shape = (image_size, image_size, num_classes) + self.feature_loss_coeff = feature_loss_coeff + self.vgg_feature_loss_coeff = vgg_feature_loss_coeff + self.kl_divergence_loss_coeff = kl_divergence_loss_coeff + + self.discriminator = build_discriminator(self.image_shape) + self.generator = build_generator(self.mask_shape) + self.encoder = build_encoder(self.image_shape) + self.sampler = GaussianSampler(batch_size, latent_dim) + self.patch_size, self.combined_model = self.build_combined_generator() + + self.disc_loss_tracker = keras.metrics.Mean(name="disc_loss") + self.gen_loss_tracker = keras.metrics.Mean(name="gen_loss") + self.feat_loss_tracker = keras.metrics.Mean(name="feat_loss") + self.vgg_loss_tracker = keras.metrics.Mean(name="vgg_loss") + self.kl_loss_tracker = keras.metrics.Mean(name="kl_loss") + + @property + def metrics(self): + return [ + self.disc_loss_tracker, + self.gen_loss_tracker, + self.feat_loss_tracker, + self.vgg_loss_tracker, + self.kl_loss_tracker, + ] + + def build_combined_generator(self): + # This method builds a model that takes as inputs the following: + # latent vector, one-hot encoded segmentation label map, and + # a segmentation map. It then (i) generates an image with the generator, + # (ii) passes the generated images and segmentation map to the discriminator. + # Finally, the model produces the following outputs: (a) discriminator outputs, + # (b) generated image. + # We will be using this model to simplify the implementation. 
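+        # Note: the discriminator is frozen *inside* this combined model, so that
+        # generator updates computed through it never modify the discriminator weights;
+        # `train_discriminator()` re-enables `trainable` before applying its own updates.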
+ self.discriminator.trainable = False + mask_input = keras.Input(shape=self.mask_shape, name="mask") + image_input = keras.Input(shape=self.image_shape, name="image") + latent_input = keras.Input(shape=(self.latent_dim,), name="latent") + generated_image = self.generator([latent_input, mask_input]) + discriminator_output = self.discriminator([image_input, generated_image]) + combined_outputs = discriminator_output + [generated_image] + patch_size = discriminator_output[-1].shape[1] + combined_model = keras.Model( + [latent_input, mask_input, image_input], combined_outputs + ) + return patch_size, combined_model + + def compile(self, gen_lr=1e-4, disc_lr=4e-4, **kwargs): + super().compile(**kwargs) + self.generator_optimizer = keras.optimizers.Adam( + gen_lr, beta_1=0.0, beta_2=0.999 + ) + self.discriminator_optimizer = keras.optimizers.Adam( + disc_lr, beta_1=0.0, beta_2=0.999 + ) + self.discriminator_loss = DiscriminatorLoss() + self.feature_matching_loss = FeatureMatchingLoss() + self.vgg_loss = VGGFeatureMatchingLoss() + + def train_discriminator(self, latent_vector, segmentation_map, real_image, labels): + fake_images = self.generator([latent_vector, labels]) + with tf.GradientTape() as gradient_tape: + pred_fake = self.discriminator([segmentation_map, fake_images])[-1] + pred_real = self.discriminator([segmentation_map, real_image])[-1] + loss_fake = self.discriminator_loss(pred_fake, -1.0) + loss_real = self.discriminator_loss(pred_real, 1.0) + total_loss = 0.5 * (loss_fake + loss_real) + + self.discriminator.trainable = True + gradients = gradient_tape.gradient( + total_loss, self.discriminator.trainable_variables + ) + self.discriminator_optimizer.apply_gradients( + zip(gradients, self.discriminator.trainable_variables) + ) + return total_loss + + def train_generator( + self, latent_vector, segmentation_map, labels, image, mean, variance + ): + # Generator learns through the signal provided by the discriminator. During + # backpropagation, we only update the generator parameters. + self.discriminator.trainable = False + with tf.GradientTape() as tape: + real_d_output = self.discriminator([segmentation_map, image]) + combined_outputs = self.combined_model( + [latent_vector, labels, segmentation_map] + ) + fake_d_output, fake_image = combined_outputs[:-1], combined_outputs[-1] + pred = fake_d_output[-1] + + # Compute generator losses. + g_loss = generator_loss(pred) + kl_loss = self.kl_divergence_loss_coeff * kl_divergence_loss(mean, variance) + vgg_loss = self.vgg_feature_loss_coeff * self.vgg_loss(image, fake_image) + feature_loss = self.feature_loss_coeff * self.feature_matching_loss( + real_d_output, fake_d_output + ) + total_loss = g_loss + kl_loss + vgg_loss + feature_loss + + all_trainable_variables = ( + self.combined_model.trainable_variables + self.encoder.trainable_variables + ) + + gradients = tape.gradient(total_loss, all_trainable_variables) + self.generator_optimizer.apply_gradients( + zip(gradients, all_trainable_variables) + ) + return total_loss, feature_loss, vgg_loss, kl_loss + + def train_step(self, data): + segmentation_map, image, labels = data + mean, variance = self.encoder(image) + latent_vector = self.sampler([mean, variance]) + discriminator_loss = self.train_discriminator( + latent_vector, segmentation_map, image, labels + ) + (generator_loss, feature_loss, vgg_loss, kl_loss) = self.train_generator( + latent_vector, segmentation_map, labels, image, mean, variance + ) + + # Report progress. 
+ self.disc_loss_tracker.update_state(discriminator_loss) + self.gen_loss_tracker.update_state(generator_loss) + self.feat_loss_tracker.update_state(feature_loss) + self.vgg_loss_tracker.update_state(vgg_loss) + self.kl_loss_tracker.update_state(kl_loss) + results = {m.name: m.result() for m in self.metrics} + return results + + def test_step(self, data): + segmentation_map, image, labels = data + # Obtain the learned moments of the real image distribution. + mean, variance = self.encoder(image) + + # Sample a latent from the distribution defined by the learned moments. + latent_vector = self.sampler([mean, variance]) + + # Generate the fake images. + fake_images = self.generator([latent_vector, labels]) + + # Calculate the losses. + pred_fake = self.discriminator([segmentation_map, fake_images])[-1] + pred_real = self.discriminator([segmentation_map, image])[-1] + loss_fake = self.discriminator_loss(pred_fake, -1.0) + loss_real = self.discriminator_loss(pred_real, 1.0) + total_discriminator_loss = 0.5 * (loss_fake + loss_real) + real_d_output = self.discriminator([segmentation_map, image]) + combined_outputs = self.combined_model( + [latent_vector, labels, segmentation_map] + ) + fake_d_output, fake_image = combined_outputs[:-1], combined_outputs[-1] + pred = fake_d_output[-1] + g_loss = generator_loss(pred) + kl_loss = self.kl_divergence_loss_coeff * kl_divergence_loss(mean, variance) + vgg_loss = self.vgg_feature_loss_coeff * self.vgg_loss(image, fake_image) + feature_loss = self.feature_loss_coeff * self.feature_matching_loss( + real_d_output, fake_d_output + ) + total_generator_loss = g_loss + kl_loss + vgg_loss + feature_loss + + # Report progress. + self.disc_loss_tracker.update_state(total_discriminator_loss) + self.gen_loss_tracker.update_state(total_generator_loss) + self.feat_loss_tracker.update_state(feature_loss) + self.vgg_loss_tracker.update_state(vgg_loss) + self.kl_loss_tracker.update_state(kl_loss) + results = {m.name: m.result() for m in self.metrics} + return results + + def call(self, inputs): + latent_vectors, labels = inputs + return self.generator([latent_vectors, labels]) + + +""" +## GauGAN training +""" + +gaugan = GauGAN(IMG_HEIGHT, NUM_CLASSES, BATCH_SIZE, latent_dim=256) +gaugan.compile() +history = gaugan.fit( + train_dataset, + validation_data=val_dataset, + epochs=15, + callbacks=[GanMonitor(val_dataset, BATCH_SIZE)], +) + + +def plot_history(item): + plt.plot(history.history[item], label=item) + plt.plot(history.history["val_" + item], label="val_" + item) + plt.xlabel("Epochs") + plt.ylabel(item) + plt.title("Train and Validation {} Over Epochs".format(item), fontsize=14) + plt.legend() + plt.grid() + plt.show() + + +plot_history("disc_loss") +plot_history("gen_loss") +plot_history("feat_loss") +plot_history("vgg_loss") +plot_history("kl_loss") + +""" +## Inference +""" + +val_iterator = iter(val_dataset) + +for _ in range(5): + val_images = next(val_iterator) + # Sample latent from a normal distribution. + latent_vector = keras.random.normal( + shape=(gaugan.batch_size, gaugan.latent_dim), mean=0.0, stddev=2.0 + ) + # Generate fake images. 
+    fake_images = gaugan.predict([latent_vector, val_images[2]])
+
+    real_images = val_images
+    grid_row = min(fake_images.shape[0], 3)
+    grid_col = 3
+    f, axarr = plt.subplots(grid_row, grid_col, figsize=(grid_col * 6, grid_row * 6))
+    for row in range(grid_row):
+        ax = axarr if grid_row == 1 else axarr[row]
+        ax[0].imshow((real_images[0][row] + 1) / 2)
+        ax[0].axis("off")
+        ax[0].set_title("Mask", fontsize=20)
+        ax[1].imshow((real_images[1][row] + 1) / 2)
+        ax[1].axis("off")
+        ax[1].set_title("Ground Truth", fontsize=20)
+        ax[2].imshow((fake_images[row] + 1) / 2)
+        ax[2].axis("off")
+        ax[2].set_title("Generated", fontsize=20)
+    plt.show()
+
+"""
+## Final words
+
+* The dataset we used in this example is a small one. To obtain even better results,
+we recommend using a bigger dataset. GauGAN results were demonstrated with the
+[COCO-Stuff](https://github.com/nightrome/cocostuff) and
+[CityScapes](https://www.cityscapes-dataset.com/) datasets.
+* This example was inspired by Chapter 6 of
+[Hands-On Image Generation with TensorFlow](https://www.packtpub.com/product/hands-on-image-generation-with-tensorflow/9781838826789)
+by [Soon-Yau Cheong](https://www.linkedin.com/in/soonyau/) and
+[Implementing SPADE using fastai](https://towardsdatascience.com/implementing-spade-using-fastai-6ad86b94030a) by
+[Divyansh Jha](https://medium.com/@divyanshj.16).
+* If you found this example interesting and exciting, you might want to check out
+[our repository](https://github.com/soumik12345/tf2_gans) which we are
+currently building. It will include reimplementations of popular GANs and pretrained
+models. Our focus will be on readability and making the code as accessible as possible.
+Our plan is to first train our implementation of GauGAN (following the code of
+this example) on a bigger dataset and then make the repository public. We welcome
+contributions!
+* Recently GauGAN2 was also released. You can check it out
+[here](https://blogs.nvidia.com/blog/2021/11/22/gaugan2-ai-art-demo/).
+
+"""
+"""
+Example available on HuggingFace.
+
+| Trained Model | Demo |
+| :--: | :--: |
+| [![Generic badge](https://img.shields.io/badge/%F0%9F%A4%97%20Model-GauGAN%20Image%20Generation-black.svg)](https://huggingface.co/keras-io/GauGAN-Image-generation) | [![Generic badge](https://img.shields.io/badge/%F0%9F%A4%97%20Spaces-GauGAN%20Image%20Generation-black.svg)](https://huggingface.co/spaces/keras-io/GauGAN_Conditional_Image_Generation) |
+"""
diff --git a/knowledge_base/generative/gpt2_text_generation_with_keras_hub.py b/knowledge_base/generative/gpt2_text_generation_with_keras_hub.py
new file mode 100644
index 0000000000000000000000000000000000000000..8076053470ff4c07a2b25a915aa568ec4f68b2e2
--- /dev/null
+++ b/knowledge_base/generative/gpt2_text_generation_with_keras_hub.py
@@ -0,0 +1,377 @@
+"""
+Title: GPT2 Text Generation with KerasHub
+Author: Chen Qian
+Date created: 2023/04/17
+Last modified: 2024/04/12
+Description: Use KerasHub GPT2 model and `samplers` to do text generation.
+Accelerator: GPU
+"""
+
+"""
+In this tutorial, you will learn to use [KerasHub](https://keras.io/keras_hub/) to load a
+pre-trained Large Language Model (LLM) - [GPT-2 model](https://openai.com/research/better-language-models)
+(originally invented by OpenAI), finetune it to a specific text style, and
+generate text based on users' input (also known as a prompt). You will also learn
+how GPT2 adapts quickly to non-English languages, such as Chinese.
+""" + +""" +## Before we begin + +Colab offers different kinds of runtimes. Make sure to go to **Runtime -> +Change runtime type** and choose the GPU Hardware Accelerator runtime +(which should have >12G host RAM and ~15G GPU RAM) since you will finetune the +GPT-2 model. Running this tutorial on CPU runtime will take hours. +""" + +""" +## Install KerasHub, Choose Backend and Import Dependencies + +This examples uses [Keras 3](https://keras.io/keras_3/) to work in any of +`"tensorflow"`, `"jax"` or `"torch"`. Support for Keras 3 is baked into +KerasHub, simply change the `"KERAS_BACKEND"` environment variable to select +the backend of your choice. We select the JAX backend below. +""" + +"""shell +pip install git+https://github.com/keras-team/keras-hub.git -q +""" + +import os + +os.environ["KERAS_BACKEND"] = "jax" # or "tensorflow" or "torch" + +import keras_hub +import keras +import tensorflow as tf +import time + +keras.mixed_precision.set_global_policy("mixed_float16") + +""" +## Introduction to Generative Large Language Models (LLMs) + +Large language models (LLMs) are a type of machine learning models that are +trained on a large corpus of text data to generate outputs for various natural +language processing (NLP) tasks, such as text generation, question answering, +and machine translation. + +Generative LLMs are typically based on deep learning neural networks, such as +the [Transformer architecture](https://arxiv.org/abs/1706.03762) invented by +Google researchers in 2017, and are trained on massive amounts of text data, +often involving billions of words. These models, such as Google [LaMDA](https://blog.google/technology/ai/lamda/) +and [PaLM](https://ai.googleblog.com/2022/04/pathways-language-model-palm-scaling-to.html), +are trained with a large dataset from various data sources which allows them to +generate output for many tasks. The core of Generative LLMs is predicting the +next word in a sentence, often referred as **Causal LM Pretraining**. In this +way LLMs can generate coherent text based on user prompts. For a more +pedagogical discussion on language models, you can refer to the +[Stanford CS324 LLM class](https://stanford-cs324.github.io/winter2022/lectures/introduction/). +""" + +""" +## Introduction to KerasHub + +Large Language Models are complex to build and expensive to train from scratch. +Luckily there are pretrained LLMs available for use right away. [KerasHub](https://keras.io/keras_hub/) +provides a large number of pre-trained checkpoints that allow you to experiment +with SOTA models without needing to train them yourself. + +KerasHub is a natural language processing library that supports users through +their entire development cycle. KerasHub offers both pretrained models and +modularized building blocks, so developers could easily reuse pretrained models +or stack their own LLM. + +In a nutshell, for generative LLM, KerasHub offers: + +- Pretrained models with `generate()` method, e.g., + `keras_hub.models.GPT2CausalLM` and `keras_hub.models.OPTCausalLM`. +- Sampler class that implements generation algorithms such as Top-K, Beam and + contrastive search. These samplers can be used to generate text with + custom models. +""" + +""" +## Load a pre-trained GPT-2 model and generate some text + +KerasHub provides a number of pre-trained models, such as [Google +Bert](https://ai.googleblog.com/2018/11/open-sourcing-bert-state-of-art-pre.html) +and [GPT-2](https://openai.com/research/better-language-models). 
You can see +the list of models available in the [KerasHub repository](https://github.com/keras-team/keras-hub/tree/master/keras_hub/models). + +It's very easy to load the GPT-2 model as you can see below: +""" + +# To speed up training and generation, we use preprocessor of length 128 +# instead of full length 1024. +preprocessor = keras_hub.models.GPT2CausalLMPreprocessor.from_preset( + "gpt2_base_en", + sequence_length=128, +) +gpt2_lm = keras_hub.models.GPT2CausalLM.from_preset( + "gpt2_base_en", preprocessor=preprocessor +) + +""" +Once the model is loaded, you can use it to generate some text right away. Run +the cells below to give it a try. It's as simple as calling a single function +*generate()*: +""" + +start = time.time() + +output = gpt2_lm.generate("My trip to Yosemite was", max_length=200) +print("\nGPT-2 output:") +print(output) + +end = time.time() +print(f"TOTAL TIME ELAPSED: {end - start:.2f}s") + +""" +Try another one: +""" + +start = time.time() + +output = gpt2_lm.generate("That Italian restaurant is", max_length=200) +print("\nGPT-2 output:") +print(output) + +end = time.time() +print(f"TOTAL TIME ELAPSED: {end - start:.2f}s") + +""" +Notice how much faster the second call is. This is because the computational +graph is [XLA compiled](https://www.tensorflow.org/xla) in the 1st run and +re-used in the 2nd behind the scenes. + +The quality of the generated text looks OK, but we can improve it via +fine-tuning. +""" + +""" +## More on the GPT-2 model from KerasHub + +Next up, we will actually fine-tune the model to update its parameters, but +before we do, let's take a look at the full set of tools we have to for working +with for GPT2. + +The code of GPT2 can be found +[here](https://github.com/keras-team/keras-hub/blob/master/keras_hub/models/gpt2/). +Conceptually the `GPT2CausalLM` can be hierarchically broken down into several +modules in KerasHub, all of which have a *from_preset()* function that loads a +pretrained model: + +- `keras_hub.models.GPT2Tokenizer`: The tokenizer used by GPT2 model, which is a + [byte-pair encoder](https://huggingface.co/course/chapter6/5?fw=pt). +- `keras_hub.models.GPT2CausalLMPreprocessor`: the preprocessor used by GPT2 + causal LM training. It does the tokenization along with other preprocessing + works such as creating the label and appending the end token. +- `keras_hub.models.GPT2Backbone`: the GPT2 model, which is a stack of + `keras_hub.layers.TransformerDecoder`. This is usually just referred as + `GPT2`. +- `keras_hub.models.GPT2CausalLM`: wraps `GPT2Backbone`, it multiplies the + output of `GPT2Backbone` by embedding matrix to generate logits over + vocab tokens. +""" + +""" +## Finetune on Reddit dataset + +Now you have the knowledge of the GPT-2 model from KerasHub, you can take one +step further to finetune the model so that it generates text in a specific +style, short or long, strict or casual. In this tutorial, we will use reddit +dataset for example. +""" + +import tensorflow_datasets as tfds + +reddit_ds = tfds.load("reddit_tifu", split="train", as_supervised=True) + +""" +Let's take a look inside sample data from the reddit TensorFlow Dataset. There +are two features: + +- **__document__**: text of the post. +- **__title__**: the title. + +""" + +for document, title in reddit_ds: + print(document.numpy()) + print(title.numpy()) + break + +""" +In our case, we are performing next word prediction in a language model, so we +only need the 'document' feature. 
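+
+As an aside, you can call the `preprocessor` loaded earlier on a raw string to see what
+the model is actually trained on: a dictionary of token ids plus a padding mask, together
+with next-token labels. The snippet below assumes the standard KerasHub preprocessor
+calling convention of returning an `(inputs, labels, sample_weights)` tuple; treat it as
+an optional sanity check rather than part of the training pipeline.
+"""
+
+# Optional sanity check: inspect the features and next-token labels produced by the
+# GPT-2 preprocessor for a short example string.
+sample_features, sample_labels, sample_weights = preprocessor(
+    ["today i found out that my cat"]
+)
+print(sample_features["token_ids"].shape)  # e.g. (1, 128) with `sequence_length=128`
+print(sample_labels.shape)  # same shape; the token ids shifted by one position
+
+"""
+With that in mind, we keep the dataset as plain text strings and let the model handle
+tokenization internally.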
+"""
+
+train_ds = (
+    reddit_ds.map(lambda document, _: document)
+    .batch(32)
+    .cache()
+    .prefetch(tf.data.AUTOTUNE)
+)
+
+"""
+Now you can finetune the model using the familiar *fit()* function. Note that the
+`preprocessor` is automatically called inside the `fit` method since
+`GPT2CausalLM` is a `keras_hub.models.Task` instance.
+
+This step would take quite a bit of GPU memory and a long time if we were to train
+the model all the way to a fully trained state. Here we just use part of the dataset
+for demo purposes.
+"""
+
+train_ds = train_ds.take(500)
+num_epochs = 1
+
+# Linearly decaying learning rate.
+learning_rate = keras.optimizers.schedules.PolynomialDecay(
+    5e-5,
+    decay_steps=train_ds.cardinality() * num_epochs,
+    end_learning_rate=0.0,
+)
+loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
+gpt2_lm.compile(
+    optimizer=keras.optimizers.Adam(learning_rate),
+    loss=loss,
+    weighted_metrics=["accuracy"],
+)
+
+gpt2_lm.fit(train_ds, epochs=num_epochs)
+
+"""
+After fine-tuning is finished, you can again generate text using the same
+*generate()* function. This time, the text will be closer to the Reddit writing
+style, and the generated length will be close to our preset length from the
+training set.
+"""
+
+start = time.time()
+
+output = gpt2_lm.generate("I like basketball", max_length=200)
+print("\nGPT-2 output:")
+print(output)
+
+end = time.time()
+print(f"TOTAL TIME ELAPSED: {end - start:.2f}s")
+
+"""
+## Into the Sampling Method
+
+In KerasHub, we offer a few sampling methods, e.g., contrastive search,
+Top-K and beam sampling. By default, our `GPT2CausalLM` uses Top-k search, but
+you can choose your own sampling method.
+
+Much like optimizers and activations, there are two ways to specify your custom
+sampler:
+
+- Use a string identifier, such as "greedy"; this uses the default configuration
+for that sampling method.
+- Pass a `keras_hub.samplers.Sampler` instance; this lets you use a custom
+configuration.
+"""
+
+# Use a string identifier.
+gpt2_lm.compile(sampler="top_k")
+output = gpt2_lm.generate("I like basketball", max_length=200)
+print("\nGPT-2 output:")
+print(output)
+
+# Use a `Sampler` instance. `GreedySampler` tends to repeat itself.
+greedy_sampler = keras_hub.samplers.GreedySampler()
+gpt2_lm.compile(sampler=greedy_sampler)
+
+output = gpt2_lm.generate("I like basketball", max_length=200)
+print("\nGPT-2 output:")
+print(output)
+
+"""
+For more details on the KerasHub `Sampler` class, you can check the code
+[here](https://github.com/keras-team/keras-hub/tree/master/keras_hub/samplers).
+"""
+
+"""
+## Finetune on Chinese Poem Dataset
+
+We can also finetune GPT2 on non-English datasets. For readers who know Chinese,
+this part illustrates how to fine-tune GPT2 on a Chinese poem dataset to teach our
+model to become a poet!
+
+Because GPT2 uses a byte-pair encoder, and the original pretraining dataset
+contains some Chinese characters, we can use the original vocab to finetune on
+a Chinese dataset.
+"""
+
+"""shell
+# Load the Chinese poetry dataset.
+git clone https://github.com/chinese-poetry/chinese-poetry.git
+"""
+
+"""
+Load text from the json files. We only use《全唐诗》(the "Complete Tang Poems"
+collection) for demo purposes.
+"""
+
+import os
+import json
+
+poem_collection = []
+for file in os.listdir("chinese-poetry/全唐诗"):
+    if ".json" not in file or "poet" not in file:
+        continue
+    full_filename = "%s/%s" % ("chinese-poetry/全唐诗", file)
+    with open(full_filename, "r") as f:
+        content = json.load(f)
+    poem_collection.extend(content)
+
+paragraphs = ["".join(data["paragraphs"]) for data in poem_collection]
+
+"""
+Let's take a look at sample data.
+"""
+
+print(paragraphs[0])
+
+"""
+Similar to the Reddit example, we convert the data to a TF dataset, and only use
+partial data to train.
+"""
+
+train_ds = (
+    tf.data.Dataset.from_tensor_slices(paragraphs)
+    .batch(16)
+    .cache()
+    .prefetch(tf.data.AUTOTUNE)
+)
+
+# Running through the whole dataset takes a long time, so we only take `500` samples
+# and run 1 epoch for demo purposes.
+train_ds = train_ds.take(500)
+num_epochs = 1
+
+learning_rate = keras.optimizers.schedules.PolynomialDecay(
+    5e-4,
+    decay_steps=train_ds.cardinality() * num_epochs,
+    end_learning_rate=0.0,
+)
+loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True)
+gpt2_lm.compile(
+    optimizer=keras.optimizers.Adam(learning_rate),
+    loss=loss,
+    weighted_metrics=["accuracy"],
+)
+
+gpt2_lm.fit(train_ds, epochs=num_epochs)
+
+"""
+Let's check the result!
+"""
+
+output = gpt2_lm.generate("昨夜雨疏风骤", max_length=200)
+print(output)
+
+"""
+Not bad 😀
+"""
diff --git a/knowledge_base/generative/lstm_character_level_text_generation.py b/knowledge_base/generative/lstm_character_level_text_generation.py
new file mode 100644
index 0000000000000000000000000000000000000000..66b6401350f12f726a4cd9f6c8cd3e1379a5c093
--- /dev/null
+++ b/knowledge_base/generative/lstm_character_level_text_generation.py
@@ -0,0 +1,132 @@
+"""
+Title: Character-level text generation with LSTM
+Author: [fchollet](https://twitter.com/fchollet)
+Date created: 2015/06/15
+Last modified: 2020/04/30
+Description: Generate text from Nietzsche's writings with a character-level LSTM.
+Accelerator: GPU
+"""
+
+"""
+## Introduction
+
+This example demonstrates how to use an LSTM model to generate
+text character-by-character.
+
+At least 20 epochs are required before the generated text
+starts sounding locally coherent.
+
+It is recommended to run this script on GPU, as recurrent
+networks are quite computationally intensive.
+
+If you try this script on new data, make sure your corpus
+has at least ~100k characters. ~1M is better.
+""" + +""" +## Setup +""" +import keras +from keras import layers + +import numpy as np +import random +import io + +""" +## Prepare the data +""" + +path = keras.utils.get_file( + "nietzsche.txt", + origin="https://s3.amazonaws.com/text-datasets/nietzsche.txt", +) +with io.open(path, encoding="utf-8") as f: + text = f.read().lower() +text = text.replace("\n", " ") # We remove newlines chars for nicer display +print("Corpus length:", len(text)) + +chars = sorted(list(set(text))) +print("Total chars:", len(chars)) +char_indices = dict((c, i) for i, c in enumerate(chars)) +indices_char = dict((i, c) for i, c in enumerate(chars)) + +# cut the text in semi-redundant sequences of maxlen characters +maxlen = 40 +step = 3 +sentences = [] +next_chars = [] +for i in range(0, len(text) - maxlen, step): + sentences.append(text[i : i + maxlen]) + next_chars.append(text[i + maxlen]) +print("Number of sequences:", len(sentences)) + +x = np.zeros((len(sentences), maxlen, len(chars)), dtype="bool") +y = np.zeros((len(sentences), len(chars)), dtype="bool") +for i, sentence in enumerate(sentences): + for t, char in enumerate(sentence): + x[i, t, char_indices[char]] = 1 + y[i, char_indices[next_chars[i]]] = 1 + + +""" +## Build the model: a single LSTM layer +""" + +model = keras.Sequential( + [ + keras.Input(shape=(maxlen, len(chars))), + layers.LSTM(128), + layers.Dense(len(chars), activation="softmax"), + ] +) +optimizer = keras.optimizers.RMSprop(learning_rate=0.01) +model.compile(loss="categorical_crossentropy", optimizer=optimizer) + +""" +## Prepare the text sampling function +""" + + +def sample(preds, temperature=1.0): + # helper function to sample an index from a probability array + preds = np.asarray(preds).astype("float64") + preds = np.log(preds) / temperature + exp_preds = np.exp(preds) + preds = exp_preds / np.sum(exp_preds) + probas = np.random.multinomial(1, preds, 1) + return np.argmax(probas) + + +""" +## Train the model +""" + +epochs = 40 +batch_size = 128 + +for epoch in range(epochs): + model.fit(x, y, batch_size=batch_size, epochs=1) + print() + print("Generating text after epoch: %d" % epoch) + + start_index = random.randint(0, len(text) - maxlen - 1) + for diversity in [0.2, 0.5, 1.0, 1.2]: + print("...Diversity:", diversity) + + generated = "" + sentence = text[start_index : start_index + maxlen] + print('...Generating with seed: "' + sentence + '"') + + for i in range(400): + x_pred = np.zeros((1, maxlen, len(chars))) + for t, char in enumerate(sentence): + x_pred[0, t, char_indices[char]] = 1.0 + preds = model.predict(x_pred, verbose=0)[0] + next_index = sample(preds, diversity) + next_char = indices_char[next_index] + sentence = sentence[1:] + next_char + generated += next_char + + print("...Generated: ", generated) + print("-") diff --git a/knowledge_base/generative/midi_generation_with_transformer.py b/knowledge_base/generative/midi_generation_with_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..1527dc86da693ca1d7b4bc6ebd6521d477cf4e3c --- /dev/null +++ b/knowledge_base/generative/midi_generation_with_transformer.py @@ -0,0 +1,722 @@ +""" +Title: Music Generation with Transformer Models +Author: [Joaquin Jimenez](https://github.com/johacks/) +Date created: 2024/11/22 +Last modified: 2024/11/26 +Description: Use a Transformer model to train on MIDI data and generate music sequences. +Accelerator: GPU +""" + +""" +## Introduction + +In this tutorial, we learn how to build a music generation model using a +Transformer decode-only architecture. 
+The model is trained on the [Maestro dataset](https://magenta.tensorflow.org/datasets/maestro) +and implemented using keras 3. +In the process, we explore MIDI tokenization, and relative global attention mechanisms. + +This example is based on the paper "Music Transformer" by Huang et al. (2018). +Check out the original [paper](https://arxiv.org/abs/1809.04281) and +[code](https://github.com/jason9693/MusicTransformer-tensorflow2.0). +""" + +""" +## Setup + +Before we start, let's import and install all the libraries we need. +""" + +"""shell +pip install -qq midi_neural_processor +pip install -qq keras_hub +pip install -qq "keras>=3.6.0" # Allows use of keras.utils.Config. +""" + +""" +### Optional dependencies + +To hear the audio, install the following additional dependencies: +""" + +"""shell +sudo apt-get -qq install -y fluidsynth 2> /dev/null +pip install -qq pyfluidsynth scipy +""" + +import os +import random +import tempfile + +import keras +import midi_neural_processor.processor as midi_tokenizer +import numpy as np +from keras import callbacks, layers, ops, optimizers, utils +from keras_hub import layers as hub_layers +from os import path + +""" +## Configuration + +Lets define the configuration for the model and the dataset to be used in this example. +""" +event_range = midi_tokenizer.RANGE_NOTE_ON +event_range += midi_tokenizer.RANGE_NOTE_OFF +event_range += midi_tokenizer.RANGE_TIME_SHIFT +event_range += midi_tokenizer.RANGE_VEL +CONFIG = utils.Config( + max_sequence_len=2048, + embedding_dim=256, + num_transformer_blocks=6, + batch_size=6, + token_pad=event_range, + token_start_of_sentence=event_range + 1, + token_end_of_sentence=event_range + 2, + vocabulary_size=event_range + 3, + model_out="tmp/music_transformer.keras", + seed=42, +) +utils.set_random_seed(CONFIG.seed) + + +""" +## Maestro dataset + +The Maestro dataset contains MIDI files for piano performances. + +### Download the dataset + +We now download and extract the dataset, then move the MIDI files to a new directory. +""" + + +def download_maestro(output_dir=None): + """Download the Maestro MIDI dataset. + Extracted from: https://magenta.tensorflow.org/datasets/maestro + """ + # Ensure the output directory exists + output_dir = tempfile.mkdtemp() if output_dir is None else output_dir + os.makedirs(output_dir, exist_ok=True) + + # Download and extract zip file + dir = utils.get_file( + origin="https://storage.googleapis.com/magentadata/datasets/maestro/v3.0.0/maestro-v3.0.0-midi.zip", + extract=True, + ) + + # Gather all MIDI files + midi_files, file_paths = set(), list() + for root, _, files in os.walk(dir): + for file in files: + if file.lower().endswith(".midi") or file.lower().endswith(".mid"): + midi_files.add(path.join(root, file)) + + # Move the files to the output directory + for file in sorted(midi_files): + file_paths.append(new_path := path.join(output_dir, path.basename(file))) + os.rename(file, new_path) + return file_paths + + +paths = list(sorted(download_maestro(output_dir="datasets/maestro"))) +output_dir = path.dirname(paths[0]) + + +""" +### Split the dataset + +We can now split the dataset into training and validation sets. +""" + +indices = np.random.permutation(len(paths)) +split = int(len(paths) * 0.1) +train_paths = [paths[i] for i in indices[split:]] +val_paths = [paths[i] for i in indices[:split]] + +""" +### Hear a MIDI file + +We use the pretty_midi library and fluidsynth to convert MIDI files into waveform audio. 
+This allows us to listen to the data samples before and after processing. + +The following dependencies are required to play the audio: +- fluidsynth: `sudo apt install -y fluidsynth` +- pyfluidsynth, scipy: `pip install pyfluidsynth scipy` +""" + + +def visualize_midi(midi_path, sampling_rate=16000, seconds=15, out_dir=None): + import pretty_midi + from scipy.io.wavfile import write as write_wav + from IPython.display import Audio + + # Create the audio waveform + pretty_midi_file = pretty_midi.PrettyMIDI(midi_path) + waveform = pretty_midi_file.fluidsynth(fs=sampling_rate)[: seconds * sampling_rate] + + # Display the audio if no path is provided + if out_dir is None: + # IPython display + return Audio(waveform, rate=sampling_rate) + + # Save the audio to a file + os.makedirs(out_dir, exist_ok=True) + audio_path = path.join(out_dir, path.basename(midi_path).split(".")[0] + ".wav") + write_wav(audio_path, sampling_rate, (waveform * 32767).astype(np.int16)) + return audio_path + + +print(visualize_midi(train_paths[0], out_dir="tmp/")) # Saved audio path +visualize_midi(train_paths[0]) # Display the audio if in a Jupyter notebook + + +""" +### Tokenize the data + +We now preprocess the MIDI files into a tokenized format for training. +""" + + +def encode_midi_task(midi_path): + """Define a task that tokenizes a MIDI file.""" + import midi_neural_processor.processor as midi_tokenizer + + return midi_tokenizer.encode_midi(midi_path) + + +def preprocess_midi_files(file_paths, save_dir=None): + """Preprocess a list of MIDI files and save the notes to a file.""" + from multiprocessing import Pool, cpu_count + + # Assume all files are in the same directory and save to the same directory + save_dir = path.dirname(file_paths[0]) if save_dir is None else save_dir + os.makedirs(save_dir, exist_ok=True) + + # Check if the notes have already been preprocessed + output_file = path.join(save_dir, "notes.npz") + if path.exists(output_file): + npz_file = np.load(output_file) + return [npz_file[key] for key in npz_file.keys()] + + # Preprocess the MIDI files in parallel + progbar = utils.Progbar(len(file_paths), unit_name="MIDI_file", interval=5) + pool = Pool(cpu_count() - 1) + all_notes = [] + for notes in pool.imap_unordered(encode_midi_task, file_paths): + progbar.add(1) + all_notes.append(np.array(notes)) + + # Save the notes to a file + np.savez(output_file, *all_notes) + return all_notes + + +train_midis = preprocess_midi_files(train_paths, path.join(output_dir, "train")) +val_midis = preprocess_midi_files(val_paths, path.join(output_dir, "val")) + + +""" +### Dataset objects + +We now define a dataset class that yields batches of input sequences and target sequences. +""" + + +class MidiDataset(utils.PyDataset): + """A dataset for MIDI files that yields batches of input sequences and target sequences.""" + + def __init__( + self, + encoded_midis, + batch_size=CONFIG.batch_size, + max_sequence_len=CONFIG.max_sequence_len, + ): + super(MidiDataset, self).__init__() + self.batch_size = batch_size + self.max_sequence_len = max_sequence_len + self.encoded_midis = encoded_midis + batches, last_batch_size = divmod(len(encoded_midis), batch_size) + self._num_batches = batches + int(last_batch_size > 0) + + def __len__(self): + """Get the number of batches.""" + return self._num_batches + + def __getitem__(self, idx): + """Generate random inputs and corresponding targets for the model.""" + # Same as in the original paper, we always get a random batch. 
+ # See: https://github.com/jason9693/MusicTransformer-tensorflow2.0/blob/f7c06c0cb2e9cdddcbf6db779cb39cd650282778/data.py + batch = random.sample(self.encoded_midis, k=self.batch_size) + + # Convert the batch to sequences + batch_data = [ + self._get_sequence(midi, self.max_sequence_len + 1) for midi in batch + ] + batch_data = np.array(batch_data) + + # Split the data into input and target sequences + return batch_data[:, :-1], batch_data[:, 1:] + + def _get_sequence(self, data, max_length): + """Get a random sequence of notes from a file.""" + # Truncate or pad the sequence + if len(data) > max_length: + start = random.randrange(0, len(data) - max_length) + data = data[start : start + max_length] + elif len(data) < max_length: + data = np.append(data, CONFIG.token_end_of_sentence) + + # Pad the sequence if necessary + if len(data) < max_length: + data = np.concatenate( + (data, np.full(max_length - len(data), CONFIG.token_pad)) + ) + return np.asanyarray(data, dtype="int32") + + +train_dataset, val_dataset = MidiDataset(train_midis), MidiDataset(val_midis) + + +""" +## Model definition + +It is time to define the model architecture. We use a Transformer decoder +architecture with a custom attention mechanism, relative global attention. + +### Relative Global Attention + +The following code implements the Relative Global Attention layer. It is used +in place of the standard multi-head attention layer in the Transformer decoder. +The main difference is that it includes a relative positional encoding that +allows the model to learn relative positional information between tokens. +""" + + +@keras.utils.register_keras_serializable() +class RelativeGlobalAttention(layers.Layer): + """ + From Music Transformer (Huang et al., 2018) + https://arxiv.org/abs/1809.04281 + """ + + def __init__(self, num_heads, embedding_dim, max_sequence_len, **kwargs): + super().__init__(**kwargs) + self.key_length = None + self.max_sequence_len = max_sequence_len + self.relative_embedding = None + self.num_heads = num_heads + self.embedding_dim = embedding_dim + self.head_dim = embedding_dim // num_heads + self.query_dense = layers.Dense(int(self.embedding_dim)) + self.key_dense = layers.Dense(int(self.embedding_dim)) + self.value_dense = layers.Dense(int(self.embedding_dim)) + self.output_dense = layers.Dense(embedding_dim, name="output") + + def build(self, input_shape): + self.query_length = input_shape[0][1] + self.key_length = input_shape[1][1] + self.relative_embedding = self.add_weight( + (self.max_sequence_len, int(self.head_dim)), name="relative_embedding" + ) + + def _apply_dense_layer_and_split_heads(self, inputs, dense_layer): + # Apply linear transformation + inputs = dense_layer(inputs) + new_shape = ops.shape(inputs) + # Reshape to split by attention heads + reshaped = ops.reshape(inputs, (new_shape[0], new_shape[1], self.num_heads, -1)) + # Transpose for head-first format + return ops.transpose(reshaped, (0, 2, 1, 3)) + + def call(self, inputs, mask=None): + # Compute Q, K, V: Batch, head, sequence, features + query = self._apply_dense_layer_and_split_heads(inputs[0], self.query_dense) + key = self._apply_dense_layer_and_split_heads(inputs[1], self.key_dense) + value = self._apply_dense_layer_and_split_heads(inputs[2], self.value_dense) + + # Compute scaled dot-product attention scores + attention_scores = ops.matmul(query, ops.transpose(key, [0, 1, 3, 2])) + + # Compute relative positional encoding and combine with attention scores + start_idx = max(0, self.max_sequence_len - ops.shape(query)[2]) 
+ relative_embedding = self.relative_embedding[start_idx:, :] + attention_scores += self._compute_attention_scores(query, relative_embedding) + logits = attention_scores / ops.sqrt(self.head_dim) + + # Apply mask if provided + if mask is not None: + logits += ops.cast(mask, "float32") * -1e9 + + # Compute attention weights + attention_weights = ops.nn.softmax(logits, axis=-1) + attention_output = ops.matmul(attention_weights, value) + + # Merge heads and apply final linear transformation + merged_attention = ops.transpose(attention_output, (0, 2, 1, 3)) + merged_attention = ops.reshape( + merged_attention, (ops.shape(merged_attention)[0], -1, self.embedding_dim) + ) + output = self.output_dense(merged_attention) + + return output, attention_weights + + def _compute_attention_scores(self, query, relative_embedding): + """ + Compute relative attention scores using positional encodings. + """ + relative_scores = ops.einsum("bhld, md->bhlm", query, relative_embedding) + relative_scores = self._apply_mask_to_relative_scores(relative_scores) + return self._skew_attention_scores(relative_scores) + + def _apply_mask_to_relative_scores(self, scores): + """ + Apply masking to relative positional scores to ignore future positions. + """ + mask = ops.flip( + ops.tri(scores.shape[-2], scores.shape[-1], dtype="float32"), axis=1 + ) + return mask * scores + + def _skew_attention_scores(self, scores): + """ + Perform skewing operation to align relative attention scores with the sequence. + """ + padded_scores = ops.pad(scores, ((0, 0), (0, 0), (0, 0), (1, 0))) + padded_shape = ops.shape(padded_scores) + reshaped_scores = ops.reshape( + padded_scores, (-1, padded_shape[1], padded_shape[-1], padded_shape[-2]) + ) + skewed_scores = reshaped_scores[:, :, 1:, :] + + if self.key_length > self.query_length: + size_diff = self.key_length - self.query_length + return ops.pad(skewed_scores, [[0, 0], [0, 0], [0, 0], [0, size_diff]]) + else: + return skewed_scores[:, :, :, : self.key_length] + + +""" +### Decoder Layer + +Using the RelativeGlobalAttention layer, we can define the DecoderLayer. It is mostly like +the standard Transformer decoder layer but with the custom attention mechanism. +""" + + +@keras.utils.register_keras_serializable() +class DecoderLayer(layers.Layer): + def __init__(self, embedding_dim, num_heads, max_sequence_len, dropout=0.1): + super(DecoderLayer, self).__init__() + + # Initialize attributes + self.embedding_dim = embedding_dim + self.num_heads = num_heads + self.max_sequence_len = max_sequence_len + + # Initialize layers + self.relative_global_attention_1 = RelativeGlobalAttention( + num_heads, embedding_dim, max_sequence_len + ) + + self.feed_forward_network_pre = layers.Dense(self.embedding_dim // 2, "relu") + self.feed_forward_network_pos = layers.Dense(self.embedding_dim) + + self.layer_normalization_1 = layers.LayerNormalization(epsilon=1e-6) + self.layer_normalization_2 = layers.LayerNormalization(epsilon=1e-6) + + self.dropout_1 = layers.Dropout(dropout) + self.dropout_2 = layers.Dropout(dropout) + + def call(self, inputs, mask=None, training=False): + # Attention block. 
Inputs are (query, key, value) + attention_out, attention_weights = self.relative_global_attention_1( + (inputs, inputs, inputs), mask=mask + ) + attention_out = self.dropout_1(attention_out, training=training) + attention_out_normalized = self.layer_normalization_1(attention_out + inputs) + + ffn_out = self.feed_forward_network_pre(attention_out) + ffn_out = self.feed_forward_network_pos(ffn_out) + ffn_out = self.dropout_2(ffn_out, training=training) + out = self.layer_normalization_2(attention_out_normalized + ffn_out) + + return out, attention_weights + + +""" +### Decoder + +The Decoder layer is composed of multiple DecoderLayer blocks. It also includes +an embedding layer that converts our tokenized input into an embedding representation. +""" + + +@keras.utils.register_keras_serializable() +class Decoder(layers.Layer): + def __init__( + self, embedding_dim, vocabulary_size, max_sequence_len, num_blocks, dropout + ): + super(Decoder, self).__init__() + + self.embedding_dim = embedding_dim + self.num_blocks = num_blocks + + self.embedding = layers.Embedding(vocabulary_size, self.embedding_dim) + self.positional_encoding = hub_layers.SinePositionEncoding() + + self.decode_layers = [ + DecoderLayer( + embedding_dim, embedding_dim // 64, max_sequence_len, dropout=dropout + ) + for _ in range(num_blocks) + ] + self.dropout = layers.Dropout(dropout) + + def call(self, inputs, mask=None, training=False, return_attention_weights=False): + weights = [] + + # Adding embedding and position encoding. + x = self.embedding(inputs) + x = x * ops.sqrt(ops.cast(self.embedding_dim, "float32")) + x = x + self.positional_encoding(x) + x = self.dropout(x, training=training) + + # Passing through the transformer blocks. + for i in range(self.num_blocks): + x, w = self.decode_layers[i](x, mask=mask, training=training) + weights.append(w) + if return_attention_weights: + return x, weights + return x + + +""" +### Music Transformer Decoder + +With the above layers defined, we can now define the MusicTransformerDecoder model. It applies +a linear transformation to the output of the decoder to get the logits for each token. 
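+
+For orientation, once the class below is defined, a quick shape check can confirm
+the input/output contract: token ids of shape `(batch, sequence_length)` go in, and
+one logit per vocabulary entry comes out at every position. This is an illustrative
+sketch, not part of the original example:
+
+```python
+# Illustrative only: run random token ids through an untrained model.
+demo_model = MusicTransformerDecoder()
+tokens = np.random.randint(0, CONFIG.vocabulary_size, size=(1, 128))
+logits = demo_model(tokens)  # -> shape (1, 128, CONFIG.vocabulary_size)
+```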
+""" + + +@keras.utils.register_keras_serializable() +class MusicTransformerDecoder(keras.Model): + def __init__( + self, + embedding_dim=CONFIG.embedding_dim, + vocabulary_size=CONFIG.vocabulary_size, + num_blocks=CONFIG.num_transformer_blocks, + max_sequence_len=CONFIG.max_sequence_len, + dropout=0.2, + ): + # Initialize attributes + super(MusicTransformerDecoder, self).__init__() + self.embedding_dim = embedding_dim + self.vocabulary_size = vocabulary_size + self.num_blocks = num_blocks + self.max_sequence_len = max_sequence_len + + # Initialize layers + # Transformer decoder + self.decoder = Decoder( + embedding_dim, vocabulary_size, max_sequence_len, num_blocks, dropout + ) + # Output layer + self.fc = layers.Dense(self.vocabulary_size, activation=None, name="output") + + @staticmethod + def get_look_ahead_mask(max_sequence_len, inputs): + sequence_length = min(max_sequence_len, inputs.shape[1]) + sequence_mask = ops.logical_not( + ops.tri(sequence_length, sequence_length, dtype="bool") + ) + + inputs = ops.cast(inputs[:, None, None, :], "int32") + output_pad_tensor = ops.ones_like(inputs) * CONFIG.token_pad + decoder_output_mask = ops.equal(inputs, output_pad_tensor) + return ops.cast(ops.logical_or(decoder_output_mask, sequence_mask), "int32") + + def call(self, inputs, training=False): + mask = self.get_look_ahead_mask(self.max_sequence_len, inputs) + decoding = self.decoder( + inputs, mask=mask, training=training, return_attention_weights=False + ) + return self.fc(decoding) + + # --- Sequence generation methods + + def generate(self, inputs: list, length=CONFIG.max_sequence_len, top_k=5): + inputs = ops.convert_to_tensor([inputs]) + + # Generate a new token using output distribution at given index + def generate_token(inputs, end_idx): + distribution = ops.stop_gradient(self.call(inputs)[0, end_idx]) + + # Select the top-k tokens and their probabilities + top_k_distribution, top_k_indices = ops.top_k(distribution, k=top_k) + + # Sample from the top-k probabilities + new_token_idx = keras.random.categorical(top_k_distribution[None, :], 1) + return ops.take(top_k_indices, new_token_idx[0]) + + # Compute the number of tokens to add + added_tokens = min(length, self.max_sequence_len - inputs.shape[1]) + progbar = utils.Progbar(added_tokens, unit_name="token", interval=5) + + # Pad the input sequence that will be filled with generated tokens + out = ops.pad(inputs, ((0, 0), (0, added_tokens)), "constant", CONFIG.token_pad) + + # Generate tokens using top-k sampling + for token_idx in range(inputs.shape[1] - 1, inputs.shape[1] - 1 + added_tokens): + token = ops.cast(generate_token(out, end_idx=token_idx), out.dtype) + out = ops.scatter_update(out, ((0, token_idx + 1),), token) + progbar.add(1) + + return ops.convert_to_numpy(out[0]) + + # --- Serialization methods + + def get_config(self): + atts = ["embedding_dim", "vocabulary_size", "num_blocks", "max_sequence_len"] + return {a: getattr(self, a) for a in atts} + + @classmethod + def from_config(cls, config): + return cls(**config) + + +""" +### Loss function + +We define a custom loss function that computes the categorical cross-entropy +loss for the model. It is computed only for non-padding tokens and uses +`from_logits=True` since the model outputs logits. 
+""" + + +@keras.utils.register_keras_serializable() +def train_loss(y_true, y_pred): + mask = ops.cast(ops.logical_not(ops.equal(y_true, CONFIG.token_pad)), "float32") + y_true = ops.one_hot(ops.cast(y_true, "int32"), CONFIG.vocabulary_size) + return ops.categorical_crossentropy(y_true, y_pred, from_logits=True) * mask + + +""" +### Learning rate schedule + +Following the Music Transformer paper, we define an adapted exponential decay +learning rate schedule that takes into account the embedding dimension. +""" + + +@keras.utils.register_keras_serializable() +class CustomSchedule(optimizers.schedules.LearningRateSchedule): + def __init__(self, embedding_dim, warmup_steps=4000): + super(CustomSchedule, self).__init__() + + self.embedding_dim = embedding_dim + self.warmup_steps = warmup_steps + + self._embedding_dim = ops.cast(self.embedding_dim, "float32") + # Numerical stability adjustment on torch, which is less precise + self._lr_adjust = 0.1 if keras.backend.backend() == "torch" else 1.0 + + def get_config(self): + return {"embedding_dim": self.embedding_dim, "warmup_steps": self.warmup_steps} + + def __call__(self, step): + step_rsqrt = ops.rsqrt(ops.cast(step, "float32")) + warmup_adjust = step * (self.warmup_steps**-1.5) + output = ops.rsqrt(self._embedding_dim) * ops.minimum(step_rsqrt, warmup_adjust) + return self._lr_adjust * output + + +""" +## Training the model + +We can now train the model on the Maestro dataset. First, we define a training +function. This function compiles the model, trains it, and saves the best model +checkpoint. This way, we can continue training from the best model checkpoint +if needed. +""" + + +def train_model(model, train_ds, val_ds, epochs=15): + # Configure optimizer + learning_rate = CustomSchedule(CONFIG.embedding_dim) + optimizer = optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9) + + # Compile the model + model.compile(optimizer=optimizer, loss=train_loss) + + # Train the model + save_cb = callbacks.ModelCheckpoint(CONFIG.model_out, save_best_only=True) + model.fit( + train_ds, validation_data=val_ds, epochs=epochs, callbacks=[save_cb], verbose=2 + ) + return model + + +""" +We can now train the model on the Maestro dataset. If a model checkpoint exists, +we can load it and continue training. +""" +if path.exists(CONFIG.model_out): + model = keras.models.load_model(CONFIG.model_out) + # Comment out to continue model training from the checkpoint + # train_model(model, train_dataset, val_dataset, epochs=10) +else: + # Train the model + model = train_model(MusicTransformerDecoder(), train_dataset, val_dataset) + + +""" +## Generate music + +We can now generate music using the trained model. We use an existing MIDI file +as a seed and generate a new sequence. 
+""" + + +def generate_music(model, seed_path, length=1024, out_dir=None, top_k=None): + # Ensure the output directory exists + out_dir = out_dir if out_dir is not None else tempfile.mkdtemp() + os.makedirs(out_dir, exist_ok=True) + + # Get some tokens from the MIDI file + inputs = midi_tokenizer.encode_midi(seed_path)[100:125] + print(f"Seed tokens: {inputs}") + + # Generate music that follows the input tokens until the maximum length + result = model.generate(inputs, length=length, top_k=top_k) + + output_path = path.join(out_dir, path.basename(seed_path).split(".")[0] + ".mid") + midi_tokenizer.decode_midi(result, output_path) + return output_path + + +output_file = generate_music(model, val_paths[-1], out_dir="tmp/", top_k=15) +print(visualize_midi(output_file, out_dir="tmp/")) # Saved audio path +visualize_midi(output_file) # Display the audio if in a Jupyter notebook + +""" +## Conclusion + +In this example, we learned how to build a music generation model using a custom +Transformer decoder architecture. + +We did it following the Music Transformer paper by Huang et al. (2018). +To do so we had to: + +- Define a custom loss function and learning rate schedule. +- Define a custom attention mechanism. +- Preprocess MIDI files into a tokenized format. + +After training the model on the Maestro dataset, we generated music sequences +using a seed MIDI file. + +### Next steps + +We could further improve inference times by caching attention weights during the +forward pass, in a similar way as `keras_hub` `CausalLM` models, which use the +`CachedMultiHeadAttention` layer. +""" diff --git a/knowledge_base/generative/molecule_generation.py b/knowledge_base/generative/molecule_generation.py new file mode 100644 index 0000000000000000000000000000000000000000..56ef4b8f285a20df2ac4baac34f913d6aae21cc4 --- /dev/null +++ b/knowledge_base/generative/molecule_generation.py @@ -0,0 +1,623 @@ +""" +Title: Drug Molecule Generation with VAE +Author: [Victor Basu](https://www.linkedin.com/in/victor-basu-520958147) +Date created: 2022/03/10 +Last modified: 2024/12/17 +Description: Implementing a Convolutional Variational AutoEncoder (VAE) for Drug Discovery. +Accelerator: GPU +""" + +""" +## Introduction + +In this example, we use a Variational Autoencoder to generate molecules for drug discovery. +We use the research papers +[Automatic chemical design using a data-driven continuous representation of molecules](https://arxiv.org/abs/1610.02415) +and [MolGAN: An implicit generative model for small molecular graphs](https://arxiv.org/abs/1805.11973) +as a reference. + +The model described in the paper **Automatic chemical design using a data-driven +continuous representation of molecules** generates new molecules via efficient exploration +of open-ended spaces of chemical compounds. The model consists of +three components: Encoder, Decoder and Predictor. The Encoder converts the discrete +representation of a molecule into a real-valued continuous vector, and the Decoder +converts these continuous vectors back to discrete molecule representations. The +Predictor estimates chemical properties from the latent continuous vector representation +of the molecule. Continuous representations allow the use of gradient-based +optimization to efficiently guide the search for optimized functional compounds. + +![intro](https://bit.ly/3CtPMzM) + +**Figure (a)** - A diagram of the autoencoder used for molecule design, including the +joint property prediction model. 
Starting from a discrete molecule representation, such +as a SMILES string, the encoder network converts each molecule into a vector in the +latent space, which is effectively a continuous molecule representation. Given a point +in the latent space, the decoder network produces a corresponding SMILES string. A +multilayer perceptron network estimates the value of target properties associated with +each molecule. + +**Figure (b)** - Gradient-based optimization in continuous latent space. After training a +surrogate model `f(z)` to predict the properties of molecules based on their latent +representation `z`, we can optimize `f(z)` with respect to `z` to find new latent +representations expected to match specific desired properties. These new latent +representations can then be decoded into SMILES strings, at which point their properties +can be tested empirically. + +For an explanation and implementation of MolGAN, please refer to the Keras Example +[**WGAN-GP with R-GCN for the generation of small molecular graphs**](https://bit.ly/3pU6zXK) by +Alexander Kensert. Many of the functions used in the present example are from the above Keras example. +""" + +""" +## Setup + +RDKit is an open source toolkit for cheminformatics and machine learning. This toolkit come in handy +if one is into drug discovery domain. In this example, RDKit is used to conveniently +and efficiently transform SMILES to molecule objects, and then from those obtain sets of atoms +and bonds. + +Quoting from +[WGAN-GP with R-GCN for the generation of small molecular graphs](https://keras.io/examples/generative/wgan-graphs/)): + +**"SMILES expresses the structure of a given molecule in the form of an ASCII string. +The SMILES string is a compact encoding which, for smaller molecules, is relatively human-readable. +Encoding molecules as a string both alleviates and facilitates database and/or web searching +of a given molecule. RDKit uses algorithms to accurately transform a given SMILES to +a molecule object, which can then be used to compute a great number of molecular properties/features."** +""" + +"""shell +pip -q install rdkit-pypi==2021.9.4 +""" + +import os + +os.environ["KERAS_BACKEND"] = "tensorflow" + +import ast + +import pandas as pd +import numpy as np + +import tensorflow as tf +import keras +from keras import layers +from keras import ops + +import matplotlib.pyplot as plt +from rdkit import Chem, RDLogger +from rdkit.Chem import BondType +from rdkit.Chem.Draw import MolsToGridImage + +RDLogger.DisableLog("rdApp.*") + +""" +## Dataset + +We use the [**ZINC โ€“ A Free Database of Commercially Available Compounds for +Virtual Screening**](https://bit.ly/3IVBI4x) dataset. The dataset comes with molecule +formula in SMILE representation along with their respective molecular properties such as +**logP** (waterโ€“octanal partition coefficient), **SAS** (synthetic +accessibility score) and **QED** (Qualitative Estimate of Drug-likeness). 
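+
+As a quick sanity check of what QED measures, RDKit can compute it directly for any
+molecule. For example (illustrative only, not needed for the rest of this example):
+
+```python
+from rdkit import Chem
+from rdkit.Chem import QED
+
+mol = Chem.MolFromSmiles("CCO")  # ethanol
+print(QED.qed(mol))              # drug-likeness score between 0 and 1
+```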
+ +""" + +csv_path = keras.utils.get_file( + "250k_rndm_zinc_drugs_clean_3.csv", + "https://raw.githubusercontent.com/aspuru-guzik-group/chemical_vae/master/models/zinc_properties/250k_rndm_zinc_drugs_clean_3.csv", +) + +df = pd.read_csv(csv_path) +df["smiles"] = df["smiles"].apply(lambda s: s.replace("\n", "")) +df.head() + +""" +## Hyperparameters +""" + +SMILE_CHARSET = '["C", "B", "F", "I", "H", "O", "N", "S", "P", "Cl", "Br"]' + +bond_mapping = {"SINGLE": 0, "DOUBLE": 1, "TRIPLE": 2, "AROMATIC": 3} +bond_mapping.update( + {0: BondType.SINGLE, 1: BondType.DOUBLE, 2: BondType.TRIPLE, 3: BondType.AROMATIC} +) +SMILE_CHARSET = ast.literal_eval(SMILE_CHARSET) + +MAX_MOLSIZE = max(df["smiles"].str.len()) +SMILE_to_index = dict((c, i) for i, c in enumerate(SMILE_CHARSET)) +index_to_SMILE = dict((i, c) for i, c in enumerate(SMILE_CHARSET)) +atom_mapping = dict(SMILE_to_index) +atom_mapping.update(index_to_SMILE) + +BATCH_SIZE = 100 +EPOCHS = 10 + +VAE_LR = 5e-4 +NUM_ATOMS = 120 # Maximum number of atoms + +ATOM_DIM = len(SMILE_CHARSET) # Number of atom types +BOND_DIM = 4 + 1 # Number of bond types +LATENT_DIM = 435 # Size of the latent space + + +def smiles_to_graph(smiles): + # Converts SMILES to molecule object + molecule = Chem.MolFromSmiles(smiles) + + # Initialize adjacency and feature tensor + adjacency = np.zeros((BOND_DIM, NUM_ATOMS, NUM_ATOMS), "float32") + features = np.zeros((NUM_ATOMS, ATOM_DIM), "float32") + + # loop over each atom in molecule + for atom in molecule.GetAtoms(): + i = atom.GetIdx() + atom_type = atom_mapping[atom.GetSymbol()] + features[i] = np.eye(ATOM_DIM)[atom_type] + # loop over one-hop neighbors + for neighbor in atom.GetNeighbors(): + j = neighbor.GetIdx() + bond = molecule.GetBondBetweenAtoms(i, j) + bond_type_idx = bond_mapping[bond.GetBondType().name] + adjacency[bond_type_idx, [i, j], [j, i]] = 1 + + # Where no bond, add 1 to last channel (indicating "non-bond") + # Notice: channels-first + adjacency[-1, np.sum(adjacency, axis=0) == 0] = 1 + + # Where no atom, add 1 to last column (indicating "non-atom") + features[np.where(np.sum(features, axis=1) == 0)[0], -1] = 1 + + return adjacency, features + + +def graph_to_molecule(graph): + # Unpack graph + adjacency, features = graph + + # RWMol is a molecule object intended to be edited + molecule = Chem.RWMol() + + # Remove "no atoms" & atoms with no bonds + keep_idx = np.where( + (np.argmax(features, axis=1) != ATOM_DIM - 1) + & (np.sum(adjacency[:-1], axis=(0, 1)) != 0) + )[0] + features = features[keep_idx] + adjacency = adjacency[:, keep_idx, :][:, :, keep_idx] + + # Add atoms to molecule + for atom_type_idx in np.argmax(features, axis=1): + atom = Chem.Atom(atom_mapping[atom_type_idx]) + _ = molecule.AddAtom(atom) + + # Add bonds between atoms in molecule; based on the upper triangles + # of the [symmetric] adjacency tensor + (bonds_ij, atoms_i, atoms_j) = np.where(np.triu(adjacency) == 1) + for bond_ij, atom_i, atom_j in zip(bonds_ij, atoms_i, atoms_j): + if atom_i == atom_j or bond_ij == BOND_DIM - 1: + continue + bond_type = bond_mapping[bond_ij] + molecule.AddBond(int(atom_i), int(atom_j), bond_type) + + # Sanitize the molecule; for more information on sanitization, see + # https://www.rdkit.org/docs/RDKit_Book.html#molecular-sanitization + flag = Chem.SanitizeMol(molecule, catchErrors=True) + # Let's be strict. 
If sanitization fails, return None + if flag != Chem.SanitizeFlags.SANITIZE_NONE: + return None + + return molecule + + +""" +## Generate training set +""" + +train_df = df.sample(frac=0.75, random_state=42) # random state is a seed value +train_df.reset_index(drop=True, inplace=True) + +adjacency_tensor, feature_tensor, qed_tensor = [], [], [] +for idx in range(8000): + adjacency, features = smiles_to_graph(train_df.loc[idx]["smiles"]) + qed = train_df.loc[idx]["qed"] + adjacency_tensor.append(adjacency) + feature_tensor.append(features) + qed_tensor.append(qed) + +adjacency_tensor = np.array(adjacency_tensor) +feature_tensor = np.array(feature_tensor) +qed_tensor = np.array(qed_tensor) + + +class RelationalGraphConvLayer(keras.layers.Layer): + def __init__( + self, + units=128, + activation="relu", + use_bias=False, + kernel_initializer="glorot_uniform", + bias_initializer="zeros", + kernel_regularizer=None, + bias_regularizer=None, + **kwargs + ): + super().__init__(**kwargs) + + self.units = units + self.activation = keras.activations.get(activation) + self.use_bias = use_bias + self.kernel_initializer = keras.initializers.get(kernel_initializer) + self.bias_initializer = keras.initializers.get(bias_initializer) + self.kernel_regularizer = keras.regularizers.get(kernel_regularizer) + self.bias_regularizer = keras.regularizers.get(bias_regularizer) + + def build(self, input_shape): + bond_dim = input_shape[0][1] + atom_dim = input_shape[1][2] + + self.kernel = self.add_weight( + shape=(bond_dim, atom_dim, self.units), + initializer=self.kernel_initializer, + regularizer=self.kernel_regularizer, + trainable=True, + name="W", + dtype="float32", + ) + + if self.use_bias: + self.bias = self.add_weight( + shape=(bond_dim, 1, self.units), + initializer=self.bias_initializer, + regularizer=self.bias_regularizer, + trainable=True, + name="b", + dtype="float32", + ) + + self.built = True + + def call(self, inputs, training=False): + adjacency, features = inputs + # Aggregate information from neighbors + x = ops.matmul(adjacency, features[:, None]) + # Apply linear transformation + x = ops.matmul(x, self.kernel) + if self.use_bias: + x += self.bias + # Reduce bond types dim + x_reduced = ops.sum(x, axis=1) + # Apply non-linear transformation + return self.activation(x_reduced) + + +""" +## Build the Encoder and Decoder + +The Encoder takes as input a molecule's graph adjacency matrix and feature matrix. +These features are processed via a Graph Convolution layer, then are flattened and +processed by several Dense layers to derive `z_mean` and `log_var`, the +latent-space representation of the molecule. + +**Graph Convolution layer**: The relational graph convolution layer implements +non-linearly transformed neighbourhood aggregations. We can define these layers as +follows: + +`H_hat**(l+1) = ฯƒ(D_hat**(-1) * A_hat * H_hat**(l+1) * W**(l))` + +Where `ฯƒ` denotes the non-linear transformation (commonly a ReLU activation), `A` the +adjacency tensor, `H_hat**(l)` the feature tensor at the `l-th` layer, `D_hat**(-1)` the +inverse diagonal degree tensor of `A_hat`, and `W_hat**(l)` the trainable weight tensor +at the `l-th` layer. Specifically, for each bond type (relation), the degree tensor +expresses, in the diagonal, the number of bonds attached to each atom. 
+ +Source: +[WGAN-GP with R-GCN for the generation of small molecular graphs](https://keras.io/examples/generative/wgan-graphs/)) + +The Decoder takes as input the latent-space representation and predicts +the graph adjacency matrix and feature matrix of the corresponding molecules. +""" + + +def get_encoder( + gconv_units, latent_dim, adjacency_shape, feature_shape, dense_units, dropout_rate +): + adjacency = layers.Input(shape=adjacency_shape) + features = layers.Input(shape=feature_shape) + + # Propagate through one or more graph convolutional layers + features_transformed = features + for units in gconv_units: + features_transformed = RelationalGraphConvLayer(units)( + [adjacency, features_transformed] + ) + # Reduce 2-D representation of molecule to 1-D + x = layers.GlobalAveragePooling1D()(features_transformed) + + # Propagate through one or more densely connected layers + for units in dense_units: + x = layers.Dense(units, activation="relu")(x) + x = layers.Dropout(dropout_rate)(x) + + z_mean = layers.Dense(latent_dim, dtype="float32", name="z_mean")(x) + log_var = layers.Dense(latent_dim, dtype="float32", name="log_var")(x) + + encoder = keras.Model([adjacency, features], [z_mean, log_var], name="encoder") + + return encoder + + +def get_decoder(dense_units, dropout_rate, latent_dim, adjacency_shape, feature_shape): + latent_inputs = keras.Input(shape=(latent_dim,)) + + x = latent_inputs + for units in dense_units: + x = layers.Dense(units, activation="tanh")(x) + x = layers.Dropout(dropout_rate)(x) + + # Map outputs of previous layer (x) to [continuous] adjacency tensors (x_adjacency) + x_adjacency = layers.Dense(np.prod(adjacency_shape))(x) + x_adjacency = layers.Reshape(adjacency_shape)(x_adjacency) + # Symmetrify tensors in the last two dimensions + x_adjacency = (x_adjacency + ops.transpose(x_adjacency, (0, 1, 3, 2))) / 2 + x_adjacency = layers.Softmax(axis=1)(x_adjacency) + + # Map outputs of previous layer (x) to [continuous] feature tensors (x_features) + x_features = layers.Dense(np.prod(feature_shape))(x) + x_features = layers.Reshape(feature_shape)(x_features) + x_features = layers.Softmax(axis=2)(x_features) + + decoder = keras.Model( + latent_inputs, outputs=[x_adjacency, x_features], name="decoder" + ) + + return decoder + + +""" +## Build the Sampling layer +""" + + +class Sampling(layers.Layer): + def __init__(self, seed=None, **kwargs): + super().__init__(**kwargs) + self.seed_generator = keras.random.SeedGenerator(seed) + + def call(self, inputs): + z_mean, z_log_var = inputs + batch, dim = ops.shape(z_log_var) + epsilon = keras.random.normal(shape=(batch, dim), seed=self.seed_generator) + return z_mean + ops.exp(0.5 * z_log_var) * epsilon + + +""" +## Build the VAE + +This model is trained to optimize four losses: + +* Categorical crossentropy +* KL divergence loss +* Property prediction loss +* Graph loss (gradient penalty) + +The categorical crossentropy loss function measures the model's +reconstruction accuracy. The Property prediction loss estimates the mean squared +error between predicted and actual properties after running the latent representation +through a property prediction model. The property +prediction of the model is optimized via binary crossentropy. The gradient +penalty is further guided by the model's property (QED) prediction. 
+ +A gradient penalty is an alternative soft constraint on the +1-Lipschitz continuity as an improvement upon the gradient clipping scheme from the +original neural network +("1-Lipschitz continuity" means that the norm of the gradient is at most 1 at every single +point of the function). +It adds a regularization term to the loss function. +""" + + +class MoleculeGenerator(keras.Model): + def __init__(self, encoder, decoder, max_len, seed=None, **kwargs): + super().__init__(**kwargs) + self.encoder = encoder + self.decoder = decoder + self.property_prediction_layer = layers.Dense(1) + self.max_len = max_len + self.seed_generator = keras.random.SeedGenerator(seed) + self.sampling_layer = Sampling(seed=seed) + + self.train_total_loss_tracker = keras.metrics.Mean(name="train_total_loss") + self.val_total_loss_tracker = keras.metrics.Mean(name="val_total_loss") + + def train_step(self, data): + adjacency_tensor, feature_tensor, qed_tensor = data[0] + graph_real = [adjacency_tensor, feature_tensor] + self.batch_size = ops.shape(qed_tensor)[0] + with tf.GradientTape() as tape: + z_mean, z_log_var, qed_pred, gen_adjacency, gen_features = self( + graph_real, training=True + ) + graph_generated = [gen_adjacency, gen_features] + total_loss = self._compute_loss( + z_log_var, z_mean, qed_tensor, qed_pred, graph_real, graph_generated + ) + + grads = tape.gradient(total_loss, self.trainable_weights) + self.optimizer.apply_gradients(zip(grads, self.trainable_weights)) + + self.train_total_loss_tracker.update_state(total_loss) + return {"loss": self.train_total_loss_tracker.result()} + + def _compute_loss( + self, z_log_var, z_mean, qed_true, qed_pred, graph_real, graph_generated + ): + adjacency_real, features_real = graph_real + adjacency_gen, features_gen = graph_generated + + adjacency_loss = ops.mean( + ops.sum( + keras.losses.categorical_crossentropy( + adjacency_real, adjacency_gen, axis=1 + ), + axis=(1, 2), + ) + ) + features_loss = ops.mean( + ops.sum( + keras.losses.categorical_crossentropy(features_real, features_gen), + axis=(1), + ) + ) + kl_loss = -0.5 * ops.sum( + 1 + z_log_var - z_mean**2 - ops.minimum(ops.exp(z_log_var), 1e6), 1 + ) + kl_loss = ops.mean(kl_loss) + + property_loss = ops.mean( + keras.losses.binary_crossentropy(qed_true, ops.squeeze(qed_pred, axis=1)) + ) + + graph_loss = self._gradient_penalty(graph_real, graph_generated) + + return kl_loss + property_loss + graph_loss + adjacency_loss + features_loss + + def _gradient_penalty(self, graph_real, graph_generated): + # Unpack graphs + adjacency_real, features_real = graph_real + adjacency_generated, features_generated = graph_generated + + # Generate interpolated graphs (adjacency_interp and features_interp) + alpha = keras.random.uniform(shape=(self.batch_size,), seed=self.seed_generator) + alpha = ops.reshape(alpha, (self.batch_size, 1, 1, 1)) + adjacency_interp = (adjacency_real * alpha) + ( + 1.0 - alpha + ) * adjacency_generated + alpha = ops.reshape(alpha, (self.batch_size, 1, 1)) + features_interp = (features_real * alpha) + (1.0 - alpha) * features_generated + + # Compute the logits of interpolated graphs + with tf.GradientTape() as tape: + tape.watch(adjacency_interp) + tape.watch(features_interp) + _, _, logits, _, _ = self( + [adjacency_interp, features_interp], training=True + ) + + # Compute the gradients with respect to the interpolated graphs + grads = tape.gradient(logits, [adjacency_interp, features_interp]) + # Compute the gradient penalty + grads_adjacency_penalty = (1 - ops.norm(grads[0], axis=1)) ** 2 + 
grads_features_penalty = (1 - ops.norm(grads[1], axis=2)) ** 2 + return ops.mean( + ops.mean(grads_adjacency_penalty, axis=(-2, -1)) + + ops.mean(grads_features_penalty, axis=(-1)) + ) + + def inference(self, batch_size): + z = keras.random.normal( + shape=(batch_size, LATENT_DIM), seed=self.seed_generator + ) + reconstruction_adjacency, reconstruction_features = model.decoder.predict(z) + # obtain one-hot encoded adjacency tensor + adjacency = ops.argmax(reconstruction_adjacency, axis=1) + adjacency = ops.one_hot(adjacency, num_classes=BOND_DIM, axis=1) + # Remove potential self-loops from adjacency + adjacency = adjacency * (1.0 - ops.eye(NUM_ATOMS, dtype="float32")[None, None]) + # obtain one-hot encoded feature tensor + features = ops.argmax(reconstruction_features, axis=2) + features = ops.one_hot(features, num_classes=ATOM_DIM, axis=2) + return [ + graph_to_molecule([adjacency[i].numpy(), features[i].numpy()]) + for i in range(batch_size) + ] + + def call(self, inputs): + z_mean, log_var = self.encoder(inputs) + z = self.sampling_layer([z_mean, log_var]) + + gen_adjacency, gen_features = self.decoder(z) + + property_pred = self.property_prediction_layer(z_mean) + + return z_mean, log_var, property_pred, gen_adjacency, gen_features + + +""" +## Train the model +""" + +vae_optimizer = keras.optimizers.Adam(learning_rate=VAE_LR) + +encoder = get_encoder( + gconv_units=[9], + adjacency_shape=(BOND_DIM, NUM_ATOMS, NUM_ATOMS), + feature_shape=(NUM_ATOMS, ATOM_DIM), + latent_dim=LATENT_DIM, + dense_units=[512], + dropout_rate=0.0, +) +decoder = get_decoder( + dense_units=[128, 256, 512], + dropout_rate=0.2, + latent_dim=LATENT_DIM, + adjacency_shape=(BOND_DIM, NUM_ATOMS, NUM_ATOMS), + feature_shape=(NUM_ATOMS, ATOM_DIM), +) + +model = MoleculeGenerator(encoder, decoder, MAX_MOLSIZE) + +model.compile(vae_optimizer) +history = model.fit([adjacency_tensor, feature_tensor, qed_tensor], epochs=EPOCHS) + +""" +## Inference + +We use our model to generate new valid molecules from different points of the latent space. +""" + +""" +### Generate unique Molecules with the model +""" + +molecules = model.inference(1000) + +MolsToGridImage( + [m for m in molecules if m is not None][:1000], molsPerRow=5, subImgSize=(260, 160) +) + +""" +### Display latent space clusters with respect to molecular properties (QAE) +""" + + +def plot_latent(vae, data, labels): + # display a 2D plot of the property in the latent space + z_mean, _ = vae.encoder.predict(data) + plt.figure(figsize=(12, 10)) + plt.scatter(z_mean[:, 0], z_mean[:, 1], c=labels) + plt.colorbar() + plt.xlabel("z[0]") + plt.ylabel("z[1]") + plt.show() + + +plot_latent(model, [adjacency_tensor[:8000], feature_tensor[:8000]], qed_tensor[:8000]) + +""" +## Conclusion + +In this example, we combined model architectures from two papers, +"Automatic chemical design using a data-driven continuous representation of +molecules" from 2016 and the "MolGAN" paper from 2018. The former paper +treats SMILES inputs as strings and seeks to generate molecule strings in SMILES format, +while the later paper considers SMILES inputs as graphs (a combination of adjacency +matrices and feature matrices) and seeks to generate molecules as graphs. + +This hybrid approach enables a new type of directed gradient-based search through chemical space. 
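+
+As a rough sketch of what such a directed search could look like (illustrative only;
+it assumes the trained `model` and the `LATENT_DIM` constant from this example, and
+simply maximizes the property predictor's output before decoding):
+
+```python
+import keras
+import tensorflow as tf
+
+# Gradient ascent on the predicted property with respect to a latent vector z.
+z = tf.Variable(keras.random.normal(shape=(1, LATENT_DIM)))
+opt = keras.optimizers.Adam(learning_rate=0.1)
+for _ in range(200):
+    with tf.GradientTape() as tape:
+        loss = -tf.reduce_mean(model.property_prediction_layer(z))
+    opt.apply_gradients([(tape.gradient(loss, z), z)])
+
+# Decode the optimized point; post-process as in `model.inference` to get a molecule.
+adjacency_logits, feature_logits = model.decoder.predict(z.numpy())
+```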
+ +Example available on HuggingFace + +| Trained Model | Demo | +| :--: | :--: | +| [![Generic badge](https://img.shields.io/badge/%F0%9F%A4%97%20Model-molecule%20generation%20with%20VAE-black.svg)](https://huggingface.co/keras-io/drug-molecule-generation-with-VAE) | [![Generic badge](https://img.shields.io/badge/%F0%9F%A4%97%20Spaces-molecule%20generation%20with%20VAE-black.svg)](https://huggingface.co/spaces/keras-io/generating-drug-molecule-with-VAE) | +""" diff --git a/knowledge_base/generative/neural_style_transfer.py b/knowledge_base/generative/neural_style_transfer.py new file mode 100644 index 0000000000000000000000000000000000000000..8b1586f8470a40f9ad21ab561fe47658485a906e --- /dev/null +++ b/knowledge_base/generative/neural_style_transfer.py @@ -0,0 +1,273 @@ +""" +Title: Neural style transfer +Author: [fchollet](https://twitter.com/fchollet) +Date created: 2016/01/11 +Last modified: 2020/05/02 +Description: Transferring the style of a reference image to target image using gradient descent. +Accelerator: GPU +""" + +""" +## Introduction + +Style transfer consists in generating an image +with the same "content" as a base image, but with the +"style" of a different picture (typically artistic). +This is achieved through the optimization of a loss function +that has 3 components: "style loss", "content loss", +and "total variation loss": + +- The total variation loss imposes local spatial continuity between +the pixels of the combination image, giving it visual coherence. +- The style loss is where the deep learning keeps in --that one is defined +using a deep convolutional neural network. Precisely, it consists in a sum of +L2 distances between the Gram matrices of the representations of +the base image and the style reference image, extracted from +different layers of a convnet (trained on ImageNet). The general idea +is to capture color/texture information at different spatial +scales (fairly large scales --defined by the depth of the layer considered). +- The content loss is a L2 distance between the features of the base +image (extracted from a deep layer) and the features of the combination image, +keeping the generated image close enough to the original one. + +**Reference:** [A Neural Algorithm of Artistic Style]( + http://arxiv.org/abs/1508.06576) +""" + +""" +## Setup +""" +import os + +os.environ["KERAS_BACKEND"] = "tensorflow" + +import keras +import numpy as np +import tensorflow as tf +from keras.applications import vgg19 + +base_image_path = keras.utils.get_file("paris.jpg", "https://i.imgur.com/F28w3Ac.jpg") +style_reference_image_path = keras.utils.get_file( + "starry_night.jpg", "https://i.imgur.com/9ooB60I.jpg" +) +result_prefix = "paris_generated" + +# Weights of the different loss components +total_variation_weight = 1e-6 +style_weight = 1e-6 +content_weight = 2.5e-8 + +# Dimensions of the generated picture. 
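+# The generated image uses a fixed number of rows (img_nrows = 400); the number
+# of columns is derived from it so the base image's aspect ratio is preserved.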
+width, height = keras.utils.load_img(base_image_path).size +img_nrows = 400 +img_ncols = int(width * img_nrows / height) + +""" +## Let's take a look at our base (content) image and our style reference image +""" + +from IPython.display import Image, display + +display(Image(base_image_path)) +display(Image(style_reference_image_path)) + +""" +## Image preprocessing / deprocessing utilities +""" + + +def preprocess_image(image_path): + # Util function to open, resize and format pictures into appropriate tensors + img = keras.utils.load_img(image_path, target_size=(img_nrows, img_ncols)) + img = keras.utils.img_to_array(img) + img = np.expand_dims(img, axis=0) + img = vgg19.preprocess_input(img) + return tf.convert_to_tensor(img) + + +def deprocess_image(x): + # Util function to convert a tensor into a valid image + x = x.reshape((img_nrows, img_ncols, 3)) + # Remove zero-center by mean pixel + x[:, :, 0] += 103.939 + x[:, :, 1] += 116.779 + x[:, :, 2] += 123.68 + # 'BGR'->'RGB' + x = x[:, :, ::-1] + x = np.clip(x, 0, 255).astype("uint8") + return x + + +""" +## Compute the style transfer loss + +First, we need to define 4 utility functions: + +- `gram_matrix` (used to compute the style loss) +- The `style_loss` function, which keeps the generated image close to the local textures +of the style reference image +- The `content_loss` function, which keeps the high-level representation of the +generated image close to that of the base image +- The `total_variation_loss` function, a regularization loss which keeps the generated +image locally-coherent +""" + +# The gram matrix of an image tensor (feature-wise outer product) + + +def gram_matrix(x): + x = tf.transpose(x, (2, 0, 1)) + features = tf.reshape(x, (tf.shape(x)[0], -1)) + gram = tf.matmul(features, tf.transpose(features)) + return gram + + +# The "style loss" is designed to maintain +# the style of the reference image in the generated image. +# It is based on the gram matrices (which capture style) of +# feature maps from the style reference image +# and from the generated image + + +def style_loss(style, combination): + S = gram_matrix(style) + C = gram_matrix(combination) + channels = 3 + size = img_nrows * img_ncols + return tf.reduce_sum(tf.square(S - C)) / (4.0 * (channels**2) * (size**2)) + + +# An auxiliary loss function +# designed to maintain the "content" of the +# base image in the generated image + + +def content_loss(base, combination): + return tf.reduce_sum(tf.square(combination - base)) + + +# The 3rd loss function, total variation loss, +# designed to keep the generated image locally coherent + + +def total_variation_loss(x): + a = tf.square( + x[:, : img_nrows - 1, : img_ncols - 1, :] - x[:, 1:, : img_ncols - 1, :] + ) + b = tf.square( + x[:, : img_nrows - 1, : img_ncols - 1, :] - x[:, : img_nrows - 1, 1:, :] + ) + return tf.reduce_sum(tf.pow(a + b, 1.25)) + + +""" +Next, let's create a feature extraction model that retrieves the intermediate activations +of VGG19 (as a dict, by name). +""" + +# Build a VGG19 model loaded with pre-trained ImageNet weights +model = vgg19.VGG19(weights="imagenet", include_top=False) + +# Get the symbolic outputs of each "key" layer (we gave them unique names). +outputs_dict = dict([(layer.name, layer.output) for layer in model.layers]) + +# Set up a model that returns the activation values for every layer in +# VGG19 (as a dict). +feature_extractor = keras.Model(inputs=model.inputs, outputs=outputs_dict) + +""" +Finally, here's the code that computes the style transfer loss. 
+""" + +# List of layers to use for the style loss. +style_layer_names = [ + "block1_conv1", + "block2_conv1", + "block3_conv1", + "block4_conv1", + "block5_conv1", +] +# The layer to use for the content loss. +content_layer_name = "block5_conv2" + + +def compute_loss(combination_image, base_image, style_reference_image): + input_tensor = tf.concat( + [base_image, style_reference_image, combination_image], axis=0 + ) + features = feature_extractor(input_tensor) + + # Initialize the loss + loss = tf.zeros(shape=()) + + # Add content loss + layer_features = features[content_layer_name] + base_image_features = layer_features[0, :, :, :] + combination_features = layer_features[2, :, :, :] + loss = loss + content_weight * content_loss( + base_image_features, combination_features + ) + # Add style loss + for layer_name in style_layer_names: + layer_features = features[layer_name] + style_reference_features = layer_features[1, :, :, :] + combination_features = layer_features[2, :, :, :] + sl = style_loss(style_reference_features, combination_features) + loss += (style_weight / len(style_layer_names)) * sl + + # Add total variation loss + loss += total_variation_weight * total_variation_loss(combination_image) + return loss + + +""" +## Add a tf.function decorator to loss & gradient computation + +To compile it, and thus make it fast. +""" + + +@tf.function +def compute_loss_and_grads(combination_image, base_image, style_reference_image): + with tf.GradientTape() as tape: + loss = compute_loss(combination_image, base_image, style_reference_image) + grads = tape.gradient(loss, combination_image) + return loss, grads + + +""" +## The training loop + +Repeatedly run vanilla gradient descent steps to minimize the loss, and save the +resulting image every 100 iterations. + +We decay the learning rate by 0.96 every 100 steps. +""" + +optimizer = keras.optimizers.SGD( + keras.optimizers.schedules.ExponentialDecay( + initial_learning_rate=100.0, decay_steps=100, decay_rate=0.96 + ) +) + +base_image = preprocess_image(base_image_path) +style_reference_image = preprocess_image(style_reference_image_path) +combination_image = tf.Variable(preprocess_image(base_image_path)) + +iterations = 4000 +for i in range(1, iterations + 1): + loss, grads = compute_loss_and_grads( + combination_image, base_image, style_reference_image + ) + optimizer.apply_gradients([(grads, combination_image)]) + if i % 100 == 0: + print("Iteration %d: loss=%.2f" % (i, loss)) + img = deprocess_image(combination_image.numpy()) + fname = result_prefix + "_at_iteration_%d.png" % i + keras.utils.save_img(fname, img) + +""" +After 4000 iterations, you get the following result: +""" + +display(Image(result_prefix + "_at_iteration_4000.png")) diff --git a/knowledge_base/generative/pixelcnn.py b/knowledge_base/generative/pixelcnn.py new file mode 100644 index 0000000000000000000000000000000000000000..5c08a09fc246aae181ed384e3fa149ef102b676d --- /dev/null +++ b/knowledge_base/generative/pixelcnn.py @@ -0,0 +1,187 @@ +""" +Title: PixelCNN +Author: [ADMoreau](https://github.com/ADMoreau) +Date created: 2020/05/17 +Last modified: 2020/05/23 +Description: PixelCNN implemented in Keras. +Accelerator: GPU +""" + +""" +## Introduction + +PixelCNN is a generative model proposed in 2016 by van den Oord et al. +(reference: [Conditional Image Generation with PixelCNN Decoders](https://arxiv.org/abs/1606.05328)). 
+It is designed to generate images (or other data types) iteratively +from an input vector where the probability distribution of prior elements dictates the +probability distribution of later elements. In the following example, images are generated +in this fashion, pixel-by-pixel, via a masked convolution kernel that only looks at data +from previously generated pixels (origin at the top left) to generate later pixels. +During inference, the output of the network is used as a probability distribution +from which new pixel values are sampled to generate a new image +(here, with MNIST, the pixels values are either black or white). +""" + +import numpy as np +import keras +from keras import layers +from keras import ops +from tqdm import tqdm + +""" +## Getting the Data +""" + +# Model / data parameters +num_classes = 10 +input_shape = (28, 28, 1) +n_residual_blocks = 5 +# The data, split between train and test sets +(x, _), (y, _) = keras.datasets.mnist.load_data() +# Concatenate all the images together +data = np.concatenate((x, y), axis=0) +# Round all pixel values less than 33% of the max 256 value to 0 +# anything above this value gets rounded up to 1 so that all values are either +# 0 or 1 +data = np.where(data < (0.33 * 256), 0, 1) +data = data.astype(np.float32) + +""" +## Create two classes for the requisite Layers for the model +""" + + +# The first layer is the PixelCNN layer. This layer simply +# builds on the 2D convolutional layer, but includes masking. +class PixelConvLayer(layers.Layer): + def __init__(self, mask_type, **kwargs): + super().__init__() + self.mask_type = mask_type + self.conv = layers.Conv2D(**kwargs) + + def build(self, input_shape): + # Build the conv2d layer to initialize kernel variables + self.conv.build(input_shape) + # Use the initialized kernel to create the mask + kernel_shape = ops.shape(self.conv.kernel) + self.mask = np.zeros(shape=kernel_shape) + self.mask[: kernel_shape[0] // 2, ...] = 1.0 + self.mask[kernel_shape[0] // 2, : kernel_shape[1] // 2, ...] = 1.0 + if self.mask_type == "B": + self.mask[kernel_shape[0] // 2, kernel_shape[1] // 2, ...] = 1.0 + + def call(self, inputs): + self.conv.kernel.assign(self.conv.kernel * self.mask) + return self.conv(inputs) + + +# Next, we build our residual block layer. +# This is just a normal residual block, but based on the PixelConvLayer. 
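+# It stacks a 1x1 convolution, a masked 3x3 PixelConvLayer, and another 1x1
+# convolution, with an additive skip connection around the block.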
+class ResidualBlock(keras.layers.Layer): + def __init__(self, filters, **kwargs): + super().__init__(**kwargs) + self.conv1 = keras.layers.Conv2D( + filters=filters, kernel_size=1, activation="relu" + ) + self.pixel_conv = PixelConvLayer( + mask_type="B", + filters=filters // 2, + kernel_size=3, + activation="relu", + padding="same", + ) + self.conv2 = keras.layers.Conv2D( + filters=filters, kernel_size=1, activation="relu" + ) + + def call(self, inputs): + x = self.conv1(inputs) + x = self.pixel_conv(x) + x = self.conv2(x) + return keras.layers.add([inputs, x]) + + +""" +## Build the model based on the original paper +""" + +inputs = keras.Input(shape=input_shape, batch_size=128) +x = PixelConvLayer( + mask_type="A", filters=128, kernel_size=7, activation="relu", padding="same" +)(inputs) + +for _ in range(n_residual_blocks): + x = ResidualBlock(filters=128)(x) + +for _ in range(2): + x = PixelConvLayer( + mask_type="B", + filters=128, + kernel_size=1, + strides=1, + activation="relu", + padding="valid", + )(x) + +out = keras.layers.Conv2D( + filters=1, kernel_size=1, strides=1, activation="sigmoid", padding="valid" +)(x) + +pixel_cnn = keras.Model(inputs, out) +adam = keras.optimizers.Adam(learning_rate=0.0005) +pixel_cnn.compile(optimizer=adam, loss="binary_crossentropy") + +pixel_cnn.summary() +pixel_cnn.fit( + x=data, y=data, batch_size=128, epochs=50, validation_split=0.1, verbose=2 +) + +""" +## Demonstration + +The PixelCNN cannot generate the full image at once. Instead, it must generate each pixel in +order, append the last generated pixel to the current image, and feed the image back into the +model to repeat the process. +""" + +from IPython.display import Image, display + +# Create an empty array of pixels. +batch = 4 +pixels = np.zeros(shape=(batch,) + (pixel_cnn.input_shape)[1:]) +batch, rows, cols, channels = pixels.shape + +# Iterate over the pixels because generation has to be done sequentially pixel by pixel. +for row in tqdm(range(rows)): + for col in range(cols): + for channel in range(channels): + # Feed the whole array and retrieving the pixel value probabilities for the next + # pixel. + probs = pixel_cnn.predict(pixels, verbose=0)[:, row, col, channel] + # Use the probabilities to pick pixel values and append the values to the image + # frame. + pixels[:, row, col, channel] = ops.ceil( + probs - keras.random.uniform(probs.shape) + ) + + +def deprocess_image(x): + # Stack the single channeled black and white image to rgb values. + x = np.stack((x, x, x), 2) + # Undo preprocessing + x *= 255.0 + # Convert to uint8 and clip to the valid range [0, 255] + x = np.clip(x, 0, 255).astype("uint8") + return x + + +# Iterate over the generated images and plot them with matplotlib. 
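+# (Each sample is squeezed from shape (28, 28, 1) to (28, 28), turned into a
+# 3-channel uint8 array by `deprocess_image`, written to disk with
+# `keras.utils.save_img`, and then displayed inline below.)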
+for i, pic in enumerate(pixels): + keras.utils.save_img( + "generated_image_{}.png".format(i), deprocess_image(np.squeeze(pic, -1)) + ) + +display(Image("generated_image_0.png")) +display(Image("generated_image_1.png")) +display(Image("generated_image_2.png")) +display(Image("generated_image_3.png")) diff --git a/knowledge_base/generative/random_walks_with_stable_diffusion.py b/knowledge_base/generative/random_walks_with_stable_diffusion.py new file mode 100644 index 0000000000000000000000000000000000000000..ba3d48603e597762cb918727b86f6b2e728ce335 --- /dev/null +++ b/knowledge_base/generative/random_walks_with_stable_diffusion.py @@ -0,0 +1,387 @@ +""" +Title: A walk through latent space with Stable Diffusion +Authors: Ian Stenbit, [fchollet](https://twitter.com/fchollet), [lukewood](https://twitter.com/luke_wood_ml) +Date created: 2022/09/28 +Last modified: 2022/09/28 +Description: Explore the latent manifold of Stable Diffusion. +Accelerator: GPU +""" + +""" +## Overview + +Generative image models learn a "latent manifold" of the visual world: +a low-dimensional vector space where each point maps to an image. +Going from such a point on the manifold back to a displayable image +is called "decoding" -- in the Stable Diffusion model, this is handled by +the "decoder" model. + +![The Stable Diffusion architecture](https://i.imgur.com/2uC8rYJ.png) + +This latent manifold of images is continuous and interpolative, meaning that: + +1. Moving a little on the manifold only changes the corresponding image a little (continuity). +2. For any two points A and B on the manifold (i.e. any two images), it is possible +to move from A to B via a path where each intermediate point is also on the manifold (i.e. +is also a valid image). Intermediate points would be called "interpolations" between +the two starting images. + +Stable Diffusion isn't just an image model, though, it's also a natural language model. +It has two latent spaces: the image representation space learned by the +encoder used during training, and the prompt latent space +which is learned using a combination of pretraining and training-time +fine-tuning. + +_Latent space walking_, or _latent space exploration_, is the process of +sampling a point in latent space and incrementally changing the latent +representation. Its most common application is generating animations +where each sampled point is fed to the decoder and is stored as a +frame in the final animation. +For high-quality latent representations, this produces coherent-looking +animations. These animations can provide insight into the feature map of the +latent space, and can ultimately lead to improvements in the training +process. One such GIF is displayed below: + +![Panda to Plane](/img/examples/generative/random_walks_with_stable_diffusion/panda2plane.gif) + +In this guide, we will show how to take advantage of the Stable Diffusion API +in KerasCV to perform prompt interpolation and circular walks through +Stable Diffusion's visual latent manifold, as well as through +the text encoder's latent manifold. + +This guide assumes the reader has a +high-level understanding of Stable Diffusion. +If you haven't already, you should start +by reading the [Stable Diffusion Tutorial](https://keras.io/guides/keras_cv/generate_images_with_stable_diffusion/). 
+ +To start, we import KerasCV and load up a Stable Diffusion model using the +optimizations discussed in the tutorial +[Generate images with Stable Diffusion](https://keras.io/guides/keras_cv/generate_images_with_stable_diffusion/). +Note that if you are running with a M1 Mac GPU you should not enable mixed precision. +""" + +"""shell +pip install keras-cv --upgrade --quiet +""" + +import keras_cv +import keras +import matplotlib.pyplot as plt +from keras import ops +import numpy as np +import math +from PIL import Image + +# Enable mixed precision +# (only do this if you have a recent NVIDIA GPU) +keras.mixed_precision.set_global_policy("mixed_float16") + +# Instantiate the Stable Diffusion model +model = keras_cv.models.StableDiffusion(jit_compile=True) + +""" +## Interpolating between text prompts + +In Stable Diffusion, a text prompt is first encoded into a vector, +and that encoding is used to guide the diffusion process. +The latent encoding vector has shape +77x768 (that's huge!), and when we give Stable Diffusion a text prompt, we're +generating images from just one such point on the latent manifold. + +To explore more of this manifold, we can interpolate between two text encodings +and generate images at those interpolated points: +""" + +prompt_1 = "A watercolor painting of a Golden Retriever at the beach" +prompt_2 = "A still life DSLR photo of a bowl of fruit" +interpolation_steps = 5 + +encoding_1 = ops.squeeze(model.encode_text(prompt_1)) +encoding_2 = ops.squeeze(model.encode_text(prompt_2)) + +interpolated_encodings = ops.linspace(encoding_1, encoding_2, interpolation_steps) + +# Show the size of the latent manifold +print(f"Encoding shape: {encoding_1.shape}") + +""" +Once we've interpolated the encodings, we can generate images from each point. +Note that in order to maintain some stability between the resulting images we +keep the diffusion noise constant between images. +""" + +seed = 12345 +noise = keras.random.normal((512 // 8, 512 // 8, 4), seed=seed) + +images = model.generate_image( + interpolated_encodings, + batch_size=interpolation_steps, + diffusion_noise=noise, +) + +""" +Now that we've generated some interpolated images, let's take a look at them! + +Throughout this tutorial, we're going to export sequences of images as gifs so +that they can be easily viewed with some temporal context. For sequences of +images where the first and last images don't match conceptually, we rubber-band +the gif. + +If you're running in Colab, you can view your own GIFs by running: + +``` +from IPython.display import Image as IImage +IImage("doggo-and-fruit-5.gif") +``` +""" + + +def export_as_gif(filename, images, frames_per_second=10, rubber_band=False): + if rubber_band: + images += images[2:-1][::-1] + images[0].save( + filename, + save_all=True, + append_images=images[1:], + duration=1000 // frames_per_second, + loop=0, + ) + + +export_as_gif( + "doggo-and-fruit-5.gif", + [Image.fromarray(img) for img in images], + frames_per_second=2, + rubber_band=True, +) + +""" +![Dog to Fruit 5](https://i.imgur.com/4ZCxZY4.gif) + +The results may seem surprising. Generally, interpolating between prompts +produces coherent looking images, and often demonstrates a progressive concept +shift between the contents of the two prompts. This is indicative of a high +quality representation space, that closely mirrors the natural structure +of the visual world. + +To best visualize this, we should do a much more fine-grained interpolation, +using hundreds of steps. 
In order to keep batch size small (so that we don't +OOM our GPU), this requires manually batching our interpolated +encodings. +""" + +interpolation_steps = 150 +batch_size = 3 +batches = interpolation_steps // batch_size + +interpolated_encodings = ops.linspace(encoding_1, encoding_2, interpolation_steps) +batched_encodings = ops.split(interpolated_encodings, batches) + +images = [] +for batch in range(batches): + images += [ + Image.fromarray(img) + for img in model.generate_image( + batched_encodings[batch], + batch_size=batch_size, + num_steps=25, + diffusion_noise=noise, + ) + ] + +export_as_gif("doggo-and-fruit-150.gif", images, rubber_band=True) + +""" +![Dog to Fruit 150](/img/examples/generative/random_walks_with_stable_diffusion/dog2fruit150.gif) + +The resulting gif shows a much clearer and more coherent shift between the two +prompts. Try out some prompts of your own and experiment! + +We can even extend this concept for more than one image. For example, we can +interpolate between four prompts: +""" + +prompt_1 = "A watercolor painting of a Golden Retriever at the beach" +prompt_2 = "A still life DSLR photo of a bowl of fruit" +prompt_3 = "The eiffel tower in the style of starry night" +prompt_4 = "An architectural sketch of a skyscraper" + +interpolation_steps = 6 +batch_size = 3 +batches = (interpolation_steps**2) // batch_size + +encoding_1 = ops.squeeze(model.encode_text(prompt_1)) +encoding_2 = ops.squeeze(model.encode_text(prompt_2)) +encoding_3 = ops.squeeze(model.encode_text(prompt_3)) +encoding_4 = ops.squeeze(model.encode_text(prompt_4)) + +interpolated_encodings = ops.linspace( + ops.linspace(encoding_1, encoding_2, interpolation_steps), + ops.linspace(encoding_3, encoding_4, interpolation_steps), + interpolation_steps, +) +interpolated_encodings = ops.reshape( + interpolated_encodings, (interpolation_steps**2, 77, 768) +) +batched_encodings = ops.split(interpolated_encodings, batches) + +images = [] +for batch in range(batches): + images.append( + model.generate_image( + batched_encodings[batch], + batch_size=batch_size, + diffusion_noise=noise, + ) + ) + + +def plot_grid(images, path, grid_size, scale=2): + fig, axs = plt.subplots( + grid_size, grid_size, figsize=(grid_size * scale, grid_size * scale) + ) + fig.tight_layout() + plt.subplots_adjust(wspace=0, hspace=0) + plt.axis("off") + for ax in axs.flat: + ax.axis("off") + + images = images.astype(int) + for i in range(min(grid_size * grid_size, len(images))): + ax = axs.flat[i] + ax.imshow(images[i].astype("uint8")) + ax.axis("off") + + for i in range(len(images), grid_size * grid_size): + axs.flat[i].axis("off") + axs.flat[i].remove() + + plt.savefig( + fname=path, + pad_inches=0, + bbox_inches="tight", + transparent=False, + dpi=60, + ) + + +images = np.concatenate(images) +plot_grid(images, "4-way-interpolation.jpg", interpolation_steps) + +""" +We can also interpolate while allowing diffusion noise to vary by dropping +the `diffusion_noise` parameter: +""" + +images = [] +for batch in range(batches): + images.append(model.generate_image(batched_encodings[batch], batch_size=batch_size)) + +images = np.concatenate(images) +plot_grid(images, "4-way-interpolation-varying-noise.jpg", interpolation_steps) + +""" +Next up -- let's go for some walks! + +## A walk around a text prompt + +Our next experiment will be to go for a walk around the latent manifold +starting from a point produced by a particular prompt. 
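+
+Concretely, the walk below just nudges the prompt encoding by a small constant
+offset at every step and decodes each intermediate point, i.e. it follows the
+straight line
+
+```
+encoding_k = encoding + k * delta  # delta = step_size * ops.ones_like(encoding)
+```
+
+so the frames show how the scene drifts as we move away from the original
+prompt in a fixed direction.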
+""" + +walk_steps = 150 +batch_size = 3 +batches = walk_steps // batch_size +step_size = 0.005 + +encoding = ops.squeeze( + model.encode_text("The Eiffel Tower in the style of starry night") +) +# Note that (77, 768) is the shape of the text encoding. +delta = ops.ones_like(encoding) * step_size + +walked_encodings = [] +for step_index in range(walk_steps): + walked_encodings.append(encoding) + encoding += delta +walked_encodings = ops.stack(walked_encodings) +batched_encodings = ops.split(walked_encodings, batches) + +images = [] +for batch in range(batches): + images += [ + Image.fromarray(img) + for img in model.generate_image( + batched_encodings[batch], + batch_size=batch_size, + num_steps=25, + diffusion_noise=noise, + ) + ] + +export_as_gif("eiffel-tower-starry-night.gif", images, rubber_band=True) + +""" +![Eiffel tower walk gif](https://i.imgur.com/9MMYtal.gif) + +Perhaps unsurprisingly, walking too far from the encoder's latent manifold +produces images that look incoherent. Try it for yourself by setting +your own prompt, and adjusting `step_size` to increase or decrease the magnitude +of the walk. Note that when the magnitude of the walk gets large, the walk often +leads into areas which produce extremely noisy images. + +## A circular walk through the diffusion noise space for a single prompt + +Our final experiment is to stick to one prompt and explore the variety of images +that the diffusion model can produce from that prompt. We do this by controlling +the noise that is used to seed the diffusion process. + +We create two noise components, `x` and `y`, and do a walk from 0 to 2ฯ€, summing +the cosine of our `x` component and the sin of our `y` component to produce noise. +Using this approach, the end of our walk arrives at the same noise inputs where +we began our walk, so we get a "loopable" result! +""" + +prompt = "An oil paintings of cows in a field next to a windmill in Holland" +encoding = ops.squeeze(model.encode_text(prompt)) +walk_steps = 150 +batch_size = 3 +batches = walk_steps // batch_size + +walk_noise_x = keras.random.normal(noise.shape, dtype="float64") +walk_noise_y = keras.random.normal(noise.shape, dtype="float64") + +walk_scale_x = ops.cos(ops.linspace(0, 2, walk_steps) * math.pi) +walk_scale_y = ops.sin(ops.linspace(0, 2, walk_steps) * math.pi) +noise_x = ops.tensordot(walk_scale_x, walk_noise_x, axes=0) +noise_y = ops.tensordot(walk_scale_y, walk_noise_y, axes=0) +noise = ops.add(noise_x, noise_y) +batched_noise = ops.split(noise, batches) + +images = [] +for batch in range(batches): + images += [ + Image.fromarray(img) + for img in model.generate_image( + encoding, + batch_size=batch_size, + num_steps=25, + diffusion_noise=batched_noise[batch], + ) + ] + +export_as_gif("cows.gif", images) + +""" +![Happy Cows](/img/examples/generative/random_walks_with_stable_diffusion/happycows.gif) + +Experiment with your own prompts and with different values of +`unconditional_guidance_scale`! + +## Conclusion + +Stable Diffusion offers a lot more than just single text-to-image generation. +Exploring the latent manifold of the text encoder and the noise space of the +diffusion model are two fun ways to experience the power of this model, and +KerasCV makes it easy! 
+""" diff --git a/knowledge_base/generative/random_walks_with_stable_diffusion_3.py b/knowledge_base/generative/random_walks_with_stable_diffusion_3.py new file mode 100644 index 0000000000000000000000000000000000000000..52307af0a803aa37d9fa75da90e35899106aaa01 --- /dev/null +++ b/knowledge_base/generative/random_walks_with_stable_diffusion_3.py @@ -0,0 +1,693 @@ +""" +Title: A walk through latent space with Stable Diffusion 3 +Authors: [Hongyu Chiu](https://github.com/james77777778), Ian Stenbit, [fchollet](https://twitter.com/fchollet), [lukewood](https://twitter.com/luke_wood_ml) +Date created: 2024/11/11 +Last modified: 2024/11/11 +Description: Explore the latent manifold of Stable Diffusion 3. +Accelerator: GPU +""" + +""" +## Overview + +Generative image models learn a "latent manifold" of the visual world: a +low-dimensional vector space where each point maps to an image. Going from such +a point on the manifold back to a displayable image is called "decoding" -- in +the Stable Diffusion model, this is handled by the "decoder" model. + +![Stable Diffusion 3 Medium Architecture](/img/examples/generative/random_walks_with_stable_diffusion_3/mmdit.png) + +This latent manifold of images is continuous and interpolative, meaning that: + +1. Moving a little on the manifold only changes the corresponding image a +little (continuity). +2. For any two points A and B on the manifold (i.e. any two images), it is +possible to move from A to B via a path where each intermediate point is also on +the manifold (i.e. is also a valid image). Intermediate points would be called +"interpolations" between the two starting images. + +Stable Diffusion isn't just an image model, though, it's also a natural language +model. It has two latent spaces: the image representation space learned by the +encoder used during training, and the prompt latent space which is learned using +a combination of pretraining and training-time fine-tuning. + +_Latent space walking_, or _latent space exploration_, is the process of +sampling a point in latent space and incrementally changing the latent +representation. Its most common application is generating animations where each +sampled point is fed to the decoder and is stored as a frame in the final +animation. +For high-quality latent representations, this produces coherent-looking +animations. These animations can provide insight into the feature map of the +latent space, and can ultimately lead to improvements in the training process. +One such GIF is displayed below: + +![dog_to_cat_64.gif](/img/examples/generative/random_walks_with_stable_diffusion_3/dog_to_cat_64.gif) + +In this guide, we will show how to take advantage of the TextToImage API in +KerasHub to perform prompt interpolation and circular walks through Stable +Diffusion 3's visual latent manifold, as well as through the text encoder's +latent manifold. + +This guide assumes the reader has a high-level understanding of Stable +Diffusion 3. If you haven't already, you should start by reading the +[Stable Diffusion 3 in KerasHub]( +https://keras.io/guides/keras_hub/stable_diffusion_3_in_keras_hub/). + +It is also worth noting that the preset "stable_diffusion_3_medium" excludes the +T5XXL text encoder, as it requires significantly more GPU memory. The performace +degradation is negligible in most cases. The weights, including T5XXL, will be +available on KerasHub soon. 
+""" + +"""shell +# Use the latest version of KerasHub +!pip install -Uq git+https://github.com/keras-team/keras-hub.git +""" + +import math + +import keras +import keras_hub +import matplotlib.pyplot as plt +from keras import ops +from keras import random +from PIL import Image + +height, width = 512, 512 +num_steps = 28 +guidance_scale = 7.0 +dtype = "float16" + +# Instantiate the Stable Diffusion 3 model and the preprocessor +backbone = keras_hub.models.StableDiffusion3Backbone.from_preset( + "stable_diffusion_3_medium", image_shape=(height, width, 3), dtype=dtype +) +preprocessor = keras_hub.models.StableDiffusion3TextToImagePreprocessor.from_preset( + "stable_diffusion_3_medium" +) + +""" +Let's define some helper functions for this example. +""" + + +def get_text_embeddings(prompt): + """Get the text embeddings for a given prompt.""" + token_ids = preprocessor.generate_preprocess([prompt]) + negative_token_ids = preprocessor.generate_preprocess([""]) + ( + positive_embeddings, + negative_embeddings, + positive_pooled_embeddings, + negative_pooled_embeddings, + ) = backbone.encode_text_step(token_ids, negative_token_ids) + return ( + positive_embeddings, + negative_embeddings, + positive_pooled_embeddings, + negative_pooled_embeddings, + ) + + +def decode_to_images(x, height, width): + """Concatenate and normalize the images to uint8 dtype.""" + x = ops.concatenate(x, axis=0) + x = ops.reshape(x, (-1, height, width, 3)) + x = ops.clip(ops.divide(ops.add(x, 1.0), 2.0), 0.0, 1.0) + return ops.cast(ops.round(ops.multiply(x, 255.0)), "uint8") + + +def generate_with_latents_and_embeddings( + latents, embeddings, num_steps, guidance_scale +): + """Generate images from latents and text embeddings.""" + + def body_fun(step, latents): + return backbone.denoise_step( + latents, + embeddings, + step, + num_steps, + guidance_scale, + ) + + latents = ops.fori_loop(0, num_steps, body_fun, latents) + return backbone.decode_step(latents) + + +def export_as_gif(filename, images, frames_per_second=10, no_rubber_band=False): + if not no_rubber_band: + images += images[2:-1][::-1] # Makes a rubber band: A->B->A + images[0].save( + filename, + save_all=True, + append_images=images[1:], + duration=1000 // frames_per_second, + loop=0, + ) + + +""" +We are going to generate images using custom latents and embeddings, so we need +to implement the `generate_with_latents_and_embeddings` function. Additionally, +it is important to compile this function to speed up the generation process. 
+""" + +if keras.config.backend() == "torch": + import torch + + @torch.no_grad() + def wrapped_function(*args, **kwargs): + return generate_with_latents_and_embeddings(*args, **kwargs) + + generate_function = wrapped_function +elif keras.config.backend() == "tensorflow": + import tensorflow as tf + + generate_function = tf.function( + generate_with_latents_and_embeddings, jit_compile=True + ) +elif keras.config.backend() == "jax": + import itertools + + import jax + + @jax.jit + def compiled_function(state, *args, **kwargs): + (trainable_variables, non_trainable_variables) = state + mapping = itertools.chain( + zip(backbone.trainable_variables, trainable_variables), + zip(backbone.non_trainable_variables, non_trainable_variables), + ) + with keras.StatelessScope(state_mapping=mapping): + return generate_with_latents_and_embeddings(*args, **kwargs) + + def wrapped_function(*args, **kwargs): + state = ( + [v.value for v in backbone.trainable_variables], + [v.value for v in backbone.non_trainable_variables], + ) + return compiled_function(state, *args, **kwargs) + + generate_function = wrapped_function + + +""" +## Interpolating between text prompts + +In Stable Diffusion 3, a text prompt is encoded into multiple vectors, which are +then used to guide the diffusion process. These latent encoding vectors have +shapes of 154x4096 and 2048 for both the positive and negative prompts - quite +large! When we input a text prompt into Stable Diffusion 3, we generate images +from a single point on this latent manifold. + +To explore more of this manifold, we can interpolate between two text encodings +and generate images at those interpolated points: +""" + +prompt_1 = "A cute dog in a beautiful field of lavander colorful flowers " +prompt_1 += "everywhere, perfect lighting, leica summicron 35mm f2.0, kodak " +prompt_1 += "portra 400, film grain" +prompt_2 = prompt_1.replace("dog", "cat") +interpolation_steps = 5 + +encoding_1 = get_text_embeddings(prompt_1) +encoding_2 = get_text_embeddings(prompt_2) + + +# Show the size of the latent manifold +print(f"Positive embeddings shape: {encoding_1[0].shape}") +print(f"Negative embeddings shape: {encoding_1[1].shape}") +print(f"Positive pooled embeddings shape: {encoding_1[2].shape}") +print(f"Negative pooled embeddings shape: {encoding_1[3].shape}") + + +""" +In this example, we want to use Spherical Linear Interpolation (slerp) instead +of simple linear interpolation. Slerp is commonly used in computer graphics to +animate rotations smoothly and can also be applied to interpolate between +high-dimensional data points, such as latent vectors used in generative models. + +The source is from Andrej Karpathy's gist: +[https://gist.github.com/karpathy/00103b0037c5aaea32fe1da1af553355](https://gist.github.com/karpathy/00103b0037c5aaea32fe1da1af553355). + +A more detailed explanation of this method can be found at: +[https://en.wikipedia.org/wiki/Slerp](https://en.wikipedia.org/wiki/Slerp). +""" + + +def slerp(v1, v2, num): + ori_dtype = v1.dtype + # Cast to float32 for numerical stability. 
+ v1 = ops.cast(v1, "float32") + v2 = ops.cast(v2, "float32") + + def interpolation(t, v1, v2, dot_threshold=0.9995): + """helper function to spherically interpolate two arrays.""" + dot = ops.sum( + v1 * v2 / (ops.linalg.norm(ops.ravel(v1)) * ops.linalg.norm(ops.ravel(v2))) + ) + if ops.abs(dot) > dot_threshold: + v2 = (1 - t) * v1 + t * v2 + else: + theta_0 = ops.arccos(dot) + sin_theta_0 = ops.sin(theta_0) + theta_t = theta_0 * t + sin_theta_t = ops.sin(theta_t) + s0 = ops.sin(theta_0 - theta_t) / sin_theta_0 + s1 = sin_theta_t / sin_theta_0 + v2 = s0 * v1 + s1 * v2 + return v2 + + t = ops.linspace(0, 1, num) + interpolated = ops.stack([interpolation(t[i], v1, v2) for i in range(num)], axis=0) + return ops.cast(interpolated, ori_dtype) + + +interpolated_positive_embeddings = slerp( + encoding_1[0], encoding_2[0], interpolation_steps +) +interpolated_positive_pooled_embeddings = slerp( + encoding_1[2], encoding_2[2], interpolation_steps +) +# We don't use negative prompts in this example, so thereโ€™s no need to +# interpolate them. +negative_embeddings = encoding_1[1] +negative_pooled_embeddings = encoding_1[3] + + +""" +Once we've interpolated the encodings, we can generate images from each point. +Note that in order to maintain some stability between the resulting images we +keep the diffusion latents constant between images. +""" + +latents = random.normal((1, height // 8, width // 8, 16), seed=42) + +images = [] +progbar = keras.utils.Progbar(interpolation_steps) +for i in range(interpolation_steps): + images.append( + generate_function( + latents, + ( + interpolated_positive_embeddings[i], + negative_embeddings, + interpolated_positive_pooled_embeddings[i], + negative_pooled_embeddings, + ), + ops.convert_to_tensor(num_steps), + ops.convert_to_tensor(guidance_scale), + ) + ) + progbar.update(i + 1, finalize=i == interpolation_steps - 1) + +""" +Now that we've generated some interpolated images, let's take a look at them! + +Throughout this tutorial, we're going to export sequences of images as gifs so +that they can be easily viewed with some temporal context. For sequences of +images where the first and last images don't match conceptually, we rubber-band +the gif. + +If you're running in Colab, you can view your own GIFs by running: + +``` +from IPython.display import Image as IImage +IImage("dog_to_cat_5.gif") +``` +""" + +images = ops.convert_to_numpy(decode_to_images(images, height, width)) +export_as_gif( + "dog_to_cat_5.gif", + [Image.fromarray(image) for image in images], + frames_per_second=2, +) + +""" +The results may seem surprising. Generally, interpolating between prompts +produces coherent looking images, and often demonstrates a progressive concept +shift between the contents of the two prompts. This is indicative of a high +quality representation space, that closely mirrors the natural structure of the +visual world. + +To best visualize this, we should do a much more fine-grained interpolation, +using more steps. 
+""" + +interpolation_steps = 64 +batch_size = 4 +batches = interpolation_steps // batch_size + +interpolated_positive_embeddings = slerp( + encoding_1[0], encoding_2[0], interpolation_steps +) +interpolated_positive_pooled_embeddings = slerp( + encoding_1[2], encoding_2[2], interpolation_steps +) +positive_embeddings_shape = ops.shape(encoding_1[0]) +positive_pooled_embeddings_shape = ops.shape(encoding_1[2]) +interpolated_positive_embeddings = ops.reshape( + interpolated_positive_embeddings, + ( + batches, + batch_size, + positive_embeddings_shape[-2], + positive_embeddings_shape[-1], + ), +) +interpolated_positive_pooled_embeddings = ops.reshape( + interpolated_positive_pooled_embeddings, + (batches, batch_size, positive_pooled_embeddings_shape[-1]), +) +negative_embeddings = ops.tile(encoding_1[1], (batch_size, 1, 1)) +negative_pooled_embeddings = ops.tile(encoding_1[3], (batch_size, 1)) + +latents = random.normal((1, height // 8, width // 8, 16), seed=42) +latents = ops.tile(latents, (batch_size, 1, 1, 1)) + +images = [] +progbar = keras.utils.Progbar(batches) +for i in range(batches): + images.append( + generate_function( + latents, + ( + interpolated_positive_embeddings[i], + negative_embeddings, + interpolated_positive_pooled_embeddings[i], + negative_pooled_embeddings, + ), + ops.convert_to_tensor(num_steps), + ops.convert_to_tensor(guidance_scale), + ) + ) + progbar.update(i + 1, finalize=i == batches - 1) + +images = ops.convert_to_numpy(decode_to_images(images, height, width)) +export_as_gif( + "dog_to_cat_64.gif", + [Image.fromarray(image) for image in images], + frames_per_second=2, +) + +""" +The resulting gif shows a much clearer and more coherent shift between the two +prompts. Try out some prompts of your own and experiment! + +We can even extend this concept for more than one image. 
For example, we can +interpolate between four prompts: +""" + +prompt_1 = "A watercolor painting of a Golden Retriever at the beach" +prompt_2 = "A still life DSLR photo of a bowl of fruit" +prompt_3 = "The eiffel tower in the style of starry night" +prompt_4 = "An architectural sketch of a skyscraper" + +interpolation_steps = 8 +batch_size = 4 +batches = (interpolation_steps**2) // batch_size + +encoding_1 = get_text_embeddings(prompt_1) +encoding_2 = get_text_embeddings(prompt_2) +encoding_3 = get_text_embeddings(prompt_3) +encoding_4 = get_text_embeddings(prompt_4) + +positive_embeddings_shape = ops.shape(encoding_1[0]) +positive_pooled_embeddings_shape = ops.shape(encoding_1[2]) +interpolated_positive_embeddings_12 = slerp( + encoding_1[0], encoding_2[0], interpolation_steps +) +interpolated_positive_embeddings_34 = slerp( + encoding_3[0], encoding_4[0], interpolation_steps +) +interpolated_positive_embeddings = slerp( + interpolated_positive_embeddings_12, + interpolated_positive_embeddings_34, + interpolation_steps, +) +interpolated_positive_embeddings = ops.reshape( + interpolated_positive_embeddings, + ( + batches, + batch_size, + positive_embeddings_shape[-2], + positive_embeddings_shape[-1], + ), +) +interpolated_positive_pooled_embeddings_12 = slerp( + encoding_1[2], encoding_2[2], interpolation_steps +) +interpolated_positive_pooled_embeddings_34 = slerp( + encoding_3[2], encoding_4[2], interpolation_steps +) +interpolated_positive_pooled_embeddings = slerp( + interpolated_positive_pooled_embeddings_12, + interpolated_positive_pooled_embeddings_34, + interpolation_steps, +) +interpolated_positive_pooled_embeddings = ops.reshape( + interpolated_positive_pooled_embeddings, + (batches, batch_size, positive_pooled_embeddings_shape[-1]), +) +negative_embeddings = ops.tile(encoding_1[1], (batch_size, 1, 1)) +negative_pooled_embeddings = ops.tile(encoding_1[3], (batch_size, 1)) + +latents = random.normal((1, height // 8, width // 8, 16), seed=42) +latents = ops.tile(latents, (batch_size, 1, 1, 1)) + +images = [] +progbar = keras.utils.Progbar(batches) +for i in range(batches): + images.append( + generate_function( + latents, + ( + interpolated_positive_embeddings[i], + negative_embeddings, + interpolated_positive_pooled_embeddings[i], + negative_pooled_embeddings, + ), + ops.convert_to_tensor(num_steps), + ops.convert_to_tensor(guidance_scale), + ) + ) + progbar.update(i + 1, finalize=i == batches - 1) + + +""" +Let's display the resulting images in a grid to make them easier to interpret. +""" + + +def plot_grid(images, path, grid_size, scale=2): + fig, axs = plt.subplots( + grid_size, grid_size, figsize=(grid_size * scale, grid_size * scale) + ) + fig.tight_layout() + plt.subplots_adjust(wspace=0, hspace=0) + plt.axis("off") + for ax in axs.flat: + ax.axis("off") + + for i in range(min(grid_size * grid_size, len(images))): + ax = axs.flat[i] + ax.imshow(images[i]) + ax.axis("off") + + for i in range(len(images), grid_size * grid_size): + axs.flat[i].axis("off") + axs.flat[i].remove() + + plt.savefig( + fname=path, + pad_inches=0, + bbox_inches="tight", + transparent=False, + dpi=60, + ) + + +images = ops.convert_to_numpy(decode_to_images(images, height, width)) +plot_grid(images, "4-way-interpolation.jpg", interpolation_steps) + +""" +We can also interpolate while allowing diffusion latents to vary by dropping +the `seed` parameter: +""" + +images = [] +progbar = keras.utils.Progbar(batches) +for i in range(batches): + # Vary diffusion latents for each input. 
+ latents = random.normal((batch_size, height // 8, width // 8, 16)) + images.append( + generate_function( + latents, + ( + interpolated_positive_embeddings[i], + negative_embeddings, + interpolated_positive_pooled_embeddings[i], + negative_pooled_embeddings, + ), + ops.convert_to_tensor(num_steps), + ops.convert_to_tensor(guidance_scale), + ) + ) + progbar.update(i + 1, finalize=i == batches - 1) + +images = ops.convert_to_numpy(decode_to_images(images, height, width)) +plot_grid(images, "4-way-interpolation-varying-latent.jpg", interpolation_steps) + +""" +Next up -- let's go for some walks! + +## A walk around a text prompt + +Our next experiment will be to go for a walk around the latent manifold +starting from a point produced by a particular prompt. +""" + +walk_steps = 64 +batch_size = 4 +batches = walk_steps // batch_size +step_size = 0.01 +prompt = "The eiffel tower in the style of starry night" +encoding = get_text_embeddings(prompt) + +positive_embeddings = encoding[0] +positive_pooled_embeddings = encoding[2] +negative_embeddings = encoding[1] +negative_pooled_embeddings = encoding[3] + +# The shape of `positive_embeddings`: (1, 154, 4096) +# The shape of `positive_pooled_embeddings`: (1, 2048) +positive_embeddings_delta = ops.ones_like(positive_embeddings) * step_size +positive_pooled_embeddings_delta = ops.ones_like(positive_pooled_embeddings) * step_size +positive_embeddings_shape = ops.shape(positive_embeddings) +positive_pooled_embeddings_shape = ops.shape(positive_pooled_embeddings) + +walked_positive_embeddings = [] +walked_positive_pooled_embeddings = [] +for step_index in range(walk_steps): + walked_positive_embeddings.append(positive_embeddings) + walked_positive_pooled_embeddings.append(positive_pooled_embeddings) + positive_embeddings += positive_embeddings_delta + positive_pooled_embeddings += positive_pooled_embeddings_delta +walked_positive_embeddings = ops.stack(walked_positive_embeddings, axis=0) +walked_positive_pooled_embeddings = ops.stack(walked_positive_pooled_embeddings, axis=0) +walked_positive_embeddings = ops.reshape( + walked_positive_embeddings, + ( + batches, + batch_size, + positive_embeddings_shape[-2], + positive_embeddings_shape[-1], + ), +) +walked_positive_pooled_embeddings = ops.reshape( + walked_positive_pooled_embeddings, + (batches, batch_size, positive_pooled_embeddings_shape[-1]), +) +negative_embeddings = ops.tile(encoding_1[1], (batch_size, 1, 1)) +negative_pooled_embeddings = ops.tile(encoding_1[3], (batch_size, 1)) + +latents = random.normal((1, height // 8, width // 8, 16), seed=42) +latents = ops.tile(latents, (batch_size, 1, 1, 1)) + +images = [] +progbar = keras.utils.Progbar(batches) +for i in range(batches): + images.append( + generate_function( + latents, + ( + walked_positive_embeddings[i], + negative_embeddings, + walked_positive_pooled_embeddings[i], + negative_pooled_embeddings, + ), + ops.convert_to_tensor(num_steps), + ops.convert_to_tensor(guidance_scale), + ) + ) + progbar.update(i + 1, finalize=i == batches - 1) + +images = ops.convert_to_numpy(decode_to_images(images, height, width)) +export_as_gif( + "eiffel-tower-starry-night.gif", + [Image.fromarray(image) for image in images], + frames_per_second=2, +) + +""" +Perhaps unsurprisingly, walking too far from the encoder's latent manifold +produces images that look incoherent. Try it for yourself by setting your own +prompt, and adjusting `step_size` to increase or decrease the magnitude +of the walk. 
Note that when the magnitude of the walk gets large, the walk often +leads into areas which produce extremely noisy images. + +## A circular walk through the diffusion latent space for a single prompt + +Our final experiment is to stick to one prompt and explore the variety of images +that the diffusion model can produce from that prompt. We do this by controlling +the noise that is used to seed the diffusion process. + +We create two noise components, `x` and `y`, and do a walk from 0 to 2ฯ€, summing +the cosine of our `x` component and the sin of our `y` component to produce +noise. Using this approach, the end of our walk arrives at the same noise inputs +where we began our walk, so we get a "loopable" result! +""" + +walk_steps = 64 +batch_size = 4 +batches = walk_steps // batch_size +prompt = "An oil paintings of cows in a field next to a windmill in Holland" +encoding = get_text_embeddings(prompt) + +walk_latent_x = random.normal((1, height // 8, width // 8, 16)) +walk_latent_y = random.normal((1, height // 8, width // 8, 16)) +walk_scale_x = ops.cos(ops.linspace(0.0, 2.0, walk_steps) * math.pi) +walk_scale_y = ops.sin(ops.linspace(0.0, 2.0, walk_steps) * math.pi) +latent_x = ops.tensordot(walk_scale_x, walk_latent_x, axes=0) +latent_y = ops.tensordot(walk_scale_y, walk_latent_y, axes=0) +latents = ops.add(latent_x, latent_y) +latents = ops.reshape(latents, (batches, batch_size, height // 8, width // 8, 16)) + +images = [] +progbar = keras.utils.Progbar(batches) +for i in range(batches): + images.append( + generate_function( + latents[i], + ( + ops.tile(encoding[0], (batch_size, 1, 1)), + ops.tile(encoding[1], (batch_size, 1, 1)), + ops.tile(encoding[2], (batch_size, 1)), + ops.tile(encoding[3], (batch_size, 1)), + ), + ops.convert_to_tensor(num_steps), + ops.convert_to_tensor(guidance_scale), + ) + ) + progbar.update(i + 1, finalize=i == batches - 1) + +images = ops.convert_to_numpy(decode_to_images(images, height, width)) +export_as_gif( + "cows.gif", + [Image.fromarray(image) for image in images], + frames_per_second=4, + no_rubber_band=True, +) + +""" +Experiment with your own prompts and with different values of the parameters! + +## Conclusion + +Stable Diffusion 3 offers a lot more than just single text-to-image generation. +Exploring the latent manifold of the text encoder and the latent space of the +diffusion model are two fun ways to experience the power of this model, and +KerasHub makes it easy! +""" diff --git a/knowledge_base/generative/real_nvp.py b/knowledge_base/generative/real_nvp.py new file mode 100644 index 0000000000000000000000000000000000000000..aaf6769d476dd4a6c0055b84980309400678d2f0 --- /dev/null +++ b/knowledge_base/generative/real_nvp.py @@ -0,0 +1,223 @@ +""" +Title: Density estimation using Real NVP +Authors: [Mandolini Giorgio Maria](https://www.linkedin.com/in/giorgio-maria-mandolini-a2a1b71b4/), [Sanna Daniele](https://www.linkedin.com/in/daniele-sanna-338629bb/), [Zannini Quirini Giorgio](https://www.linkedin.com/in/giorgio-zannini-quirini-16ab181a0/) +Date created: 2020/08/10 +Last modified: 2020/08/10 +Description: Estimating the density distribution of the "double moon" dataset. +Accelerator: GPU +""" + +""" +## Introduction + +The aim of this work is to map a simple distribution - which is easy to sample +and whose density is simple to estimate - to a more complex one learned from the data. +This kind of generative model is also known as "normalizing flow". 
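+
+In a normalizing flow, an invertible map `f` sends a data point `x` to a latent
+point `z = f(x)` whose density is simple (here, a standard Gaussian). The data
+log-density then follows from the change-of-variables formula, which is exactly
+what the `log_loss` method of the model defined below computes; schematically
+(the names here are only illustrative):
+
+```
+log_likelihood = latent_distribution.log_prob(f(x)) + log_det_jacobian
+loss = -mean(log_likelihood)
+```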
+ +In order to do this, the model is trained via the maximum +likelihood principle, using the "change of variable" formula. + +We will use an affine coupling function. We create it such that its inverse, as well as +the determinant of the Jacobian, are easy to obtain (more details in the referenced paper). + +**Requirements:** + +* Tensorflow 2.9.1 +* Tensorflow probability 0.17.0 + +**Reference:** + +[Density estimation using Real NVP](https://arxiv.org/abs/1605.08803) +""" + +""" +## Setup + +""" +import tensorflow as tf +from tensorflow import keras +from tensorflow.keras import layers +from tensorflow.keras import regularizers +from sklearn.datasets import make_moons +import numpy as np +import matplotlib.pyplot as plt +import tensorflow_probability as tfp + +""" +## Load the data +""" + +data = make_moons(3000, noise=0.05)[0].astype("float32") +norm = layers.Normalization() +norm.adapt(data) +normalized_data = norm(data) + +""" +## Affine coupling layer +""" + +# Creating a custom layer with keras API. +output_dim = 256 +reg = 0.01 + + +def Coupling(input_shape): + input = keras.layers.Input(shape=input_shape) + + t_layer_1 = keras.layers.Dense( + output_dim, activation="relu", kernel_regularizer=regularizers.l2(reg) + )(input) + t_layer_2 = keras.layers.Dense( + output_dim, activation="relu", kernel_regularizer=regularizers.l2(reg) + )(t_layer_1) + t_layer_3 = keras.layers.Dense( + output_dim, activation="relu", kernel_regularizer=regularizers.l2(reg) + )(t_layer_2) + t_layer_4 = keras.layers.Dense( + output_dim, activation="relu", kernel_regularizer=regularizers.l2(reg) + )(t_layer_3) + t_layer_5 = keras.layers.Dense( + input_shape, activation="linear", kernel_regularizer=regularizers.l2(reg) + )(t_layer_4) + + s_layer_1 = keras.layers.Dense( + output_dim, activation="relu", kernel_regularizer=regularizers.l2(reg) + )(input) + s_layer_2 = keras.layers.Dense( + output_dim, activation="relu", kernel_regularizer=regularizers.l2(reg) + )(s_layer_1) + s_layer_3 = keras.layers.Dense( + output_dim, activation="relu", kernel_regularizer=regularizers.l2(reg) + )(s_layer_2) + s_layer_4 = keras.layers.Dense( + output_dim, activation="relu", kernel_regularizer=regularizers.l2(reg) + )(s_layer_3) + s_layer_5 = keras.layers.Dense( + input_shape, activation="tanh", kernel_regularizer=regularizers.l2(reg) + )(s_layer_4) + + return keras.Model(inputs=input, outputs=[s_layer_5, t_layer_5]) + + +""" +## Real NVP +""" + + +class RealNVP(keras.Model): + def __init__(self, num_coupling_layers): + super().__init__() + + self.num_coupling_layers = num_coupling_layers + + # Distribution of the latent space. + self.distribution = tfp.distributions.MultivariateNormalDiag( + loc=[0.0, 0.0], scale_diag=[1.0, 1.0] + ) + self.masks = np.array( + [[0, 1], [1, 0]] * (num_coupling_layers // 2), dtype="float32" + ) + self.loss_tracker = keras.metrics.Mean(name="loss") + self.layers_list = [Coupling(2) for i in range(num_coupling_layers)] + + @property + def metrics(self): + """List of the model's metrics. + + We make sure the loss tracker is listed as part of `model.metrics` + so that `fit()` and `evaluate()` are able to `reset()` the loss tracker + at the start of each epoch and at the start of an `evaluate()` call. 
+ """ + return [self.loss_tracker] + + def call(self, x, training=True): + log_det_inv = 0 + direction = 1 + if training: + direction = -1 + for i in range(self.num_coupling_layers)[::direction]: + x_masked = x * self.masks[i] + reversed_mask = 1 - self.masks[i] + s, t = self.layers_list[i](x_masked) + s *= reversed_mask + t *= reversed_mask + gate = (direction - 1) / 2 + x = ( + reversed_mask + * (x * tf.exp(direction * s) + direction * t * tf.exp(gate * s)) + + x_masked + ) + log_det_inv += gate * tf.reduce_sum(s, [1]) + + return x, log_det_inv + + # Log likelihood of the normal distribution plus the log determinant of the jacobian. + + def log_loss(self, x): + y, logdet = self(x) + log_likelihood = self.distribution.log_prob(y) + logdet + return -tf.reduce_mean(log_likelihood) + + def train_step(self, data): + with tf.GradientTape() as tape: + loss = self.log_loss(data) + + g = tape.gradient(loss, self.trainable_variables) + self.optimizer.apply_gradients(zip(g, self.trainable_variables)) + self.loss_tracker.update_state(loss) + + return {"loss": self.loss_tracker.result()} + + def test_step(self, data): + loss = self.log_loss(data) + self.loss_tracker.update_state(loss) + + return {"loss": self.loss_tracker.result()} + + +""" +## Model training +""" + +model = RealNVP(num_coupling_layers=6) + +model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.0001)) + +history = model.fit( + normalized_data, batch_size=256, epochs=300, verbose=2, validation_split=0.2 +) + +""" +## Performance evaluation +""" + +plt.figure(figsize=(15, 10)) +plt.plot(history.history["loss"]) +plt.plot(history.history["val_loss"]) +plt.title("model loss") +plt.legend(["train", "validation"], loc="upper right") +plt.ylabel("loss") +plt.xlabel("epoch") + +# From data to latent space. +z, _ = model(normalized_data) + +# From latent space to data. +samples = model.distribution.sample(3000) +x, _ = model.predict(samples) + +f, axes = plt.subplots(2, 2) +f.set_size_inches(20, 15) + +axes[0, 0].scatter(normalized_data[:, 0], normalized_data[:, 1], color="r") +axes[0, 0].set(title="Inference data space X", xlabel="x", ylabel="y") +axes[0, 1].scatter(z[:, 0], z[:, 1], color="r") +axes[0, 1].set(title="Inference latent space Z", xlabel="x", ylabel="y") +axes[0, 1].set_xlim([-3.5, 4]) +axes[0, 1].set_ylim([-4, 4]) +axes[1, 0].scatter(samples[:, 0], samples[:, 1], color="g") +axes[1, 0].set(title="Generated latent space Z", xlabel="x", ylabel="y") +axes[1, 1].scatter(x[:, 0], x[:, 1], color="g") +axes[1, 1].set(title="Generated data space X", label="x", ylabel="y") +axes[1, 1].set_xlim([-2, 2]) +axes[1, 1].set_ylim([-2, 2]) diff --git a/knowledge_base/generative/stylegan.py b/knowledge_base/generative/stylegan.py new file mode 100644 index 0000000000000000000000000000000000000000..11854f2f0b1d4f526b4df3821462f4a231e8df15 --- /dev/null +++ b/knowledge_base/generative/stylegan.py @@ -0,0 +1,772 @@ +""" +Title: Face image generation with StyleGAN +Author: [Soon-Yau Cheong](https://www.linkedin.com/in/soonyau/) +Date created: 2021/07/01 +Last modified: 2021/07/01 +Description: Implementation of StyleGAN for image generation. +Accelerator: GPU +""" + +""" +## Introduction + +The key idea of StyleGAN is to progressively increase the resolution of the generated +images and to incorporate style features in the generative process.This +[StyleGAN](https://arxiv.org/abs/1812.04948) implementation is based on the book +[Hands-on Image Generation with TensorFlow](https://www.amazon.com/dp/1838826785). 
+The code from the book's +[GitHub repository](https://github.com/PacktPublishing/Hands-On-Image-Generation-with-TensorFlow-2.0/tree/master/Chapter07) +was refactored to leverage a custom `train_step()` to enable +faster training time via compilation and distribution. +""" + +""" +## Setup +""" + +""" +### Install latest TFA +""" +"""shell +pip install tensorflow_addons +""" + +import os +import numpy as np +import matplotlib.pyplot as plt + +from functools import partial + +import tensorflow as tf +from tensorflow import keras +from tensorflow.keras import layers +from tensorflow.keras.models import Sequential +from tensorflow_addons.layers import InstanceNormalization + +import gdown +from zipfile import ZipFile + +""" +## Prepare the dataset + +In this example, we will train using the CelebA from the project GDrive. +""" + + +def log2(x): + return int(np.log2(x)) + + +# we use different batch size for different resolution, so larger image size +# could fit into GPU memory. The keys is image resolution in log2 +batch_sizes = {2: 16, 3: 16, 4: 16, 5: 16, 6: 16, 7: 8, 8: 4, 9: 2, 10: 1} +# We adjust the train step accordingly +train_step_ratio = {k: batch_sizes[2] / v for k, v in batch_sizes.items()} + + +os.makedirs("celeba_gan") + +url = "https://drive.google.com/uc?id=1O7m1010EJjLE5QxLZiM9Fpjs7Oj6e684" +output = "celeba_gan/data.zip" +gdown.download(url, output, quiet=True) + +with ZipFile("celeba_gan/data.zip", "r") as zipobj: + zipobj.extractall("celeba_gan") + +# Create a dataset from our folder, and rescale the images to the [0-1] range: + +ds_train = keras.utils.image_dataset_from_directory( + "celeba_gan", label_mode=None, image_size=(64, 64), batch_size=32 +) + + +def resize_image(res, image): + # only downsampling, so use nearest neighbor that is faster to run + image = tf.image.resize( + image, (res, res), method=tf.image.ResizeMethod.NEAREST_NEIGHBOR + ) + image = tf.cast(image, tf.float32) / 127.5 - 1.0 + return image + + +def create_dataloader(res): + batch_size = batch_sizes[log2(res)] + # NOTE: we unbatch the dataset so we can `batch()` it again with the `drop_remainder=True` option + # since the model only supports a single batch size + dl = ds_train.map( + partial(resize_image, res), num_parallel_calls=tf.data.AUTOTUNE + ).unbatch() + dl = dl.shuffle(200).batch(batch_size, drop_remainder=True).prefetch(1).repeat() + return dl + + +""" +## Utility function to display images after each epoch +""" + + +def plot_images(images, log2_res, fname=""): + scales = {2: 0.5, 3: 1, 4: 2, 5: 3, 6: 4, 7: 5, 8: 6, 9: 7, 10: 8} + scale = scales[log2_res] + + grid_col = min(images.shape[0], int(32 // scale)) + grid_row = 1 + + f, axarr = plt.subplots( + grid_row, grid_col, figsize=(grid_col * scale, grid_row * scale) + ) + + for row in range(grid_row): + ax = axarr if grid_row == 1 else axarr[row] + for col in range(grid_col): + ax[col].imshow(images[row * grid_col + col]) + ax[col].axis("off") + plt.show() + if fname: + f.savefig(fname) + + +""" +## Custom Layers + +The following are building blocks that will be used to construct the generators and +discriminators of the StyleGAN model. 
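+
+A few of these are worth calling out. `fade_in(alpha, a, b)` blends the output
+of a newly added high-resolution block (`a`) with the upsampled output of the
+previous stage (`b`) while `alpha` ramps from 0 to 1 during the transition phase
+of progressive growing, for example:
+
+```
+fade_in(0.25, new_rgb, old_rgb)  # == 0.25 * new_rgb + 0.75 * old_rgb
+```
+
+`pixel_norm` and `minibatch_std` are the per-pixel feature normalization and
+minibatch standard deviation tricks used in progressive GANs, and the
+`Equalized*` layers rescale their weights at runtime so that every layer sees a
+comparable effective learning rate.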
+""" + + +def fade_in(alpha, a, b): + return alpha * a + (1.0 - alpha) * b + + +def wasserstein_loss(y_true, y_pred): + return -tf.reduce_mean(y_true * y_pred) + + +def pixel_norm(x, epsilon=1e-8): + return x / tf.math.sqrt(tf.reduce_mean(x**2, axis=-1, keepdims=True) + epsilon) + + +def minibatch_std(input_tensor, epsilon=1e-8): + n, h, w, c = tf.shape(input_tensor) + group_size = tf.minimum(4, n) + x = tf.reshape(input_tensor, [group_size, -1, h, w, c]) + group_mean, group_var = tf.nn.moments(x, axes=(0), keepdims=False) + group_std = tf.sqrt(group_var + epsilon) + avg_std = tf.reduce_mean(group_std, axis=[1, 2, 3], keepdims=True) + x = tf.tile(avg_std, [group_size, h, w, 1]) + return tf.concat([input_tensor, x], axis=-1) + + +class EqualizedConv(layers.Layer): + def __init__(self, out_channels, kernel=3, gain=2, **kwargs): + super().__init__(**kwargs) + self.kernel = kernel + self.out_channels = out_channels + self.gain = gain + self.pad = kernel != 1 + + def build(self, input_shape): + self.in_channels = input_shape[-1] + initializer = keras.initializers.RandomNormal(mean=0.0, stddev=1.0) + self.w = self.add_weight( + shape=[self.kernel, self.kernel, self.in_channels, self.out_channels], + initializer=initializer, + trainable=True, + name="kernel", + ) + self.b = self.add_weight( + shape=(self.out_channels,), initializer="zeros", trainable=True, name="bias" + ) + fan_in = self.kernel * self.kernel * self.in_channels + self.scale = tf.sqrt(self.gain / fan_in) + + def call(self, inputs): + if self.pad: + x = tf.pad(inputs, [[0, 0], [1, 1], [1, 1], [0, 0]], mode="REFLECT") + else: + x = inputs + output = ( + tf.nn.conv2d(x, self.scale * self.w, strides=1, padding="VALID") + self.b + ) + return output + + +class EqualizedDense(layers.Layer): + def __init__(self, units, gain=2, learning_rate_multiplier=1, **kwargs): + super().__init__(**kwargs) + self.units = units + self.gain = gain + self.learning_rate_multiplier = learning_rate_multiplier + + def build(self, input_shape): + self.in_channels = input_shape[-1] + initializer = keras.initializers.RandomNormal( + mean=0.0, stddev=1.0 / self.learning_rate_multiplier + ) + self.w = self.add_weight( + shape=[self.in_channels, self.units], + initializer=initializer, + trainable=True, + name="kernel", + ) + self.b = self.add_weight( + shape=(self.units,), initializer="zeros", trainable=True, name="bias" + ) + fan_in = self.in_channels + self.scale = tf.sqrt(self.gain / fan_in) + + def call(self, inputs): + output = tf.add(tf.matmul(inputs, self.scale * self.w), self.b) + return output * self.learning_rate_multiplier + + +class AddNoise(layers.Layer): + def build(self, input_shape): + n, h, w, c = input_shape[0] + initializer = keras.initializers.RandomNormal(mean=0.0, stddev=1.0) + self.b = self.add_weight( + shape=[1, 1, 1, c], initializer=initializer, trainable=True, name="kernel" + ) + + def call(self, inputs): + x, noise = inputs + output = x + self.b * noise + return output + + +class AdaIN(layers.Layer): + def __init__(self, gain=1, **kwargs): + super().__init__(**kwargs) + self.gain = gain + + def build(self, input_shapes): + x_shape = input_shapes[0] + w_shape = input_shapes[1] + + self.w_channels = w_shape[-1] + self.x_channels = x_shape[-1] + + self.dense_1 = EqualizedDense(self.x_channels, gain=1) + self.dense_2 = EqualizedDense(self.x_channels, gain=1) + + def call(self, inputs): + x, w = inputs + ys = tf.reshape(self.dense_1(w), (-1, 1, 1, self.x_channels)) + yb = tf.reshape(self.dense_2(w), (-1, 1, 1, self.x_channels)) + return ys * 
x + yb + + +""" +Next we build the following: + +- A model mapping to map the random noise into style code +- The generator +- The discriminator + +For the generator, we build generator blocks at multiple resolutions, +e.g. 4x4, 8x8, ...up to 1024x1024. We only use 4x4 in the beginning +and we use progressively larger-resolution blocks as the training proceeds. +Same for the discriminator. +""" + + +def Mapping(num_stages, input_shape=512): + z = layers.Input(shape=(input_shape)) + w = pixel_norm(z) + for i in range(8): + w = EqualizedDense(512, learning_rate_multiplier=0.01)(w) + w = layers.LeakyReLU(0.2)(w) + w = tf.tile(tf.expand_dims(w, 1), (1, num_stages, 1)) + return keras.Model(z, w, name="mapping") + + +class Generator: + def __init__(self, start_res_log2, target_res_log2): + self.start_res_log2 = start_res_log2 + self.target_res_log2 = target_res_log2 + self.num_stages = target_res_log2 - start_res_log2 + 1 + # list of generator blocks at increasing resolution + self.g_blocks = [] + # list of layers to convert g_block activation to RGB + self.to_rgb = [] + # list of noise input of different resolutions into g_blocks + self.noise_inputs = [] + # filter size to use at each stage, keys are log2(resolution) + self.filter_nums = { + 0: 512, + 1: 512, + 2: 512, # 4x4 + 3: 512, # 8x8 + 4: 512, # 16x16 + 5: 512, # 32x32 + 6: 256, # 64x64 + 7: 128, # 128x128 + 8: 64, # 256x256 + 9: 32, # 512x512 + 10: 16, + } # 1024x1024 + + start_res = 2**start_res_log2 + self.input_shape = (start_res, start_res, self.filter_nums[start_res_log2]) + self.g_input = layers.Input(self.input_shape, name="generator_input") + + for i in range(start_res_log2, target_res_log2 + 1): + filter_num = self.filter_nums[i] + res = 2**i + self.noise_inputs.append( + layers.Input(shape=(res, res, 1), name=f"noise_{res}x{res}") + ) + to_rgb = Sequential( + [ + layers.InputLayer(input_shape=(res, res, filter_num)), + EqualizedConv(3, 1, gain=1), + ], + name=f"to_rgb_{res}x{res}", + ) + self.to_rgb.append(to_rgb) + is_base = i == self.start_res_log2 + if is_base: + input_shape = (res, res, self.filter_nums[i - 1]) + else: + input_shape = (2 ** (i - 1), 2 ** (i - 1), self.filter_nums[i - 1]) + g_block = self.build_block( + filter_num, res=res, input_shape=input_shape, is_base=is_base + ) + self.g_blocks.append(g_block) + + def build_block(self, filter_num, res, input_shape, is_base): + input_tensor = layers.Input(shape=input_shape, name=f"g_{res}") + noise = layers.Input(shape=(res, res, 1), name=f"noise_{res}") + w = layers.Input(shape=512) + x = input_tensor + + if not is_base: + x = layers.UpSampling2D((2, 2))(x) + x = EqualizedConv(filter_num, 3)(x) + + x = AddNoise()([x, noise]) + x = layers.LeakyReLU(0.2)(x) + x = InstanceNormalization()(x) + x = AdaIN()([x, w]) + + x = EqualizedConv(filter_num, 3)(x) + x = AddNoise()([x, noise]) + x = layers.LeakyReLU(0.2)(x) + x = InstanceNormalization()(x) + x = AdaIN()([x, w]) + return keras.Model([input_tensor, w, noise], x, name=f"genblock_{res}x{res}") + + def grow(self, res_log2): + res = 2**res_log2 + + num_stages = res_log2 - self.start_res_log2 + 1 + w = layers.Input(shape=(self.num_stages, 512), name="w") + + alpha = layers.Input(shape=(1), name="g_alpha") + x = self.g_blocks[0]([self.g_input, w[:, 0], self.noise_inputs[0]]) + + if num_stages == 1: + rgb = self.to_rgb[0](x) + else: + for i in range(1, num_stages - 1): + x = self.g_blocks[i]([x, w[:, i], self.noise_inputs[i]]) + + old_rgb = self.to_rgb[num_stages - 2](x) + old_rgb = layers.UpSampling2D((2, 2))(old_rgb) + + i = 
num_stages - 1 + x = self.g_blocks[i]([x, w[:, i], self.noise_inputs[i]]) + + new_rgb = self.to_rgb[i](x) + + rgb = fade_in(alpha[0], new_rgb, old_rgb) + + return keras.Model( + [self.g_input, w, self.noise_inputs, alpha], + rgb, + name=f"generator_{res}_x_{res}", + ) + + +class Discriminator: + def __init__(self, start_res_log2, target_res_log2): + self.start_res_log2 = start_res_log2 + self.target_res_log2 = target_res_log2 + self.num_stages = target_res_log2 - start_res_log2 + 1 + # filter size to use at each stage, keys are log2(resolution) + self.filter_nums = { + 0: 512, + 1: 512, + 2: 512, # 4x4 + 3: 512, # 8x8 + 4: 512, # 16x16 + 5: 512, # 32x32 + 6: 256, # 64x64 + 7: 128, # 128x128 + 8: 64, # 256x256 + 9: 32, # 512x512 + 10: 16, + } # 1024x1024 + # list of discriminator blocks at increasing resolution + self.d_blocks = [] + # list of layers to convert RGB into activation for d_blocks inputs + self.from_rgb = [] + + for res_log2 in range(self.start_res_log2, self.target_res_log2 + 1): + res = 2**res_log2 + filter_num = self.filter_nums[res_log2] + from_rgb = Sequential( + [ + layers.InputLayer( + input_shape=(res, res, 3), name=f"from_rgb_input_{res}" + ), + EqualizedConv(filter_num, 1), + layers.LeakyReLU(0.2), + ], + name=f"from_rgb_{res}", + ) + + self.from_rgb.append(from_rgb) + + input_shape = (res, res, filter_num) + if len(self.d_blocks) == 0: + d_block = self.build_base(filter_num, res) + else: + d_block = self.build_block( + filter_num, self.filter_nums[res_log2 - 1], res + ) + + self.d_blocks.append(d_block) + + def build_base(self, filter_num, res): + input_tensor = layers.Input(shape=(res, res, filter_num), name=f"d_{res}") + x = minibatch_std(input_tensor) + x = EqualizedConv(filter_num, 3)(x) + x = layers.LeakyReLU(0.2)(x) + x = layers.Flatten()(x) + x = EqualizedDense(filter_num)(x) + x = layers.LeakyReLU(0.2)(x) + x = EqualizedDense(1)(x) + return keras.Model(input_tensor, x, name=f"d_{res}") + + def build_block(self, filter_num_1, filter_num_2, res): + input_tensor = layers.Input(shape=(res, res, filter_num_1), name=f"d_{res}") + x = EqualizedConv(filter_num_1, 3)(input_tensor) + x = layers.LeakyReLU(0.2)(x) + x = EqualizedConv(filter_num_2)(x) + x = layers.LeakyReLU(0.2)(x) + x = layers.AveragePooling2D((2, 2))(x) + return keras.Model(input_tensor, x, name=f"d_{res}") + + def grow(self, res_log2): + res = 2**res_log2 + idx = res_log2 - self.start_res_log2 + alpha = layers.Input(shape=(1), name="d_alpha") + input_image = layers.Input(shape=(res, res, 3), name="input_image") + x = self.from_rgb[idx](input_image) + x = self.d_blocks[idx](x) + if idx > 0: + idx -= 1 + downsized_image = layers.AveragePooling2D((2, 2))(input_image) + y = self.from_rgb[idx](downsized_image) + x = fade_in(alpha[0], x, y) + + for i in range(idx, -1, -1): + x = self.d_blocks[i](x) + return keras.Model([input_image, alpha], x, name=f"discriminator_{res}_x_{res}") + + +""" +## Build StyleGAN with custom train step +""" + + +class StyleGAN(tf.keras.Model): + def __init__(self, z_dim=512, target_res=64, start_res=4): + super().__init__() + self.z_dim = z_dim + + self.target_res_log2 = log2(target_res) + self.start_res_log2 = log2(start_res) + self.current_res_log2 = self.target_res_log2 + self.num_stages = self.target_res_log2 - self.start_res_log2 + 1 + + self.alpha = tf.Variable(1.0, dtype=tf.float32, trainable=False, name="alpha") + + self.mapping = Mapping(num_stages=self.num_stages) + self.d_builder = Discriminator(self.start_res_log2, self.target_res_log2) + self.g_builder = 
Generator(self.start_res_log2, self.target_res_log2) + self.g_input_shape = self.g_builder.input_shape + + self.phase = None + self.train_step_counter = tf.Variable(0, dtype=tf.int32, trainable=False) + + self.loss_weights = {"gradient_penalty": 10, "drift": 0.001} + + def grow_model(self, res): + tf.keras.backend.clear_session() + res_log2 = log2(res) + self.generator = self.g_builder.grow(res_log2) + self.discriminator = self.d_builder.grow(res_log2) + self.current_res_log2 = res_log2 + print(f"\nModel resolution:{res}x{res}") + + def compile( + self, steps_per_epoch, phase, res, d_optimizer, g_optimizer, *args, **kwargs + ): + self.loss_weights = kwargs.pop("loss_weights", self.loss_weights) + self.steps_per_epoch = steps_per_epoch + if res != 2**self.current_res_log2: + self.grow_model(res) + self.d_optimizer = d_optimizer + self.g_optimizer = g_optimizer + + self.train_step_counter.assign(0) + self.phase = phase + self.d_loss_metric = keras.metrics.Mean(name="d_loss") + self.g_loss_metric = keras.metrics.Mean(name="g_loss") + super().compile(*args, **kwargs) + + @property + def metrics(self): + return [self.d_loss_metric, self.g_loss_metric] + + def generate_noise(self, batch_size): + noise = [ + tf.random.normal((batch_size, 2**res, 2**res, 1)) + for res in range(self.start_res_log2, self.target_res_log2 + 1) + ] + return noise + + def gradient_loss(self, grad): + loss = tf.square(grad) + loss = tf.reduce_sum(loss, axis=tf.range(1, tf.size(tf.shape(loss)))) + loss = tf.sqrt(loss) + loss = tf.reduce_mean(tf.square(loss - 1)) + return loss + + def train_step(self, real_images): + self.train_step_counter.assign_add(1) + + if self.phase == "TRANSITION": + self.alpha.assign( + tf.cast(self.train_step_counter / self.steps_per_epoch, tf.float32) + ) + elif self.phase == "STABLE": + self.alpha.assign(1.0) + else: + raise NotImplementedError + alpha = tf.expand_dims(self.alpha, 0) + batch_size = tf.shape(real_images)[0] + real_labels = tf.ones(batch_size) + fake_labels = -tf.ones(batch_size) + + z = tf.random.normal((batch_size, self.z_dim)) + const_input = tf.ones(tuple([batch_size] + list(self.g_input_shape))) + noise = self.generate_noise(batch_size) + + # generator + with tf.GradientTape() as g_tape: + w = self.mapping(z) + fake_images = self.generator([const_input, w, noise, alpha]) + pred_fake = self.discriminator([fake_images, alpha]) + g_loss = wasserstein_loss(real_labels, pred_fake) + + trainable_weights = ( + self.mapping.trainable_weights + self.generator.trainable_weights + ) + gradients = g_tape.gradient(g_loss, trainable_weights) + self.g_optimizer.apply_gradients(zip(gradients, trainable_weights)) + + # discriminator + with tf.GradientTape() as gradient_tape, tf.GradientTape() as total_tape: + # forward pass + pred_fake = self.discriminator([fake_images, alpha]) + pred_real = self.discriminator([real_images, alpha]) + + epsilon = tf.random.uniform((batch_size, 1, 1, 1)) + interpolates = epsilon * real_images + (1 - epsilon) * fake_images + gradient_tape.watch(interpolates) + pred_fake_grad = self.discriminator([interpolates, alpha]) + + # calculate losses + loss_fake = wasserstein_loss(fake_labels, pred_fake) + loss_real = wasserstein_loss(real_labels, pred_real) + loss_fake_grad = wasserstein_loss(fake_labels, pred_fake_grad) + + # gradient penalty + gradients_fake = gradient_tape.gradient(loss_fake_grad, [interpolates]) + gradient_penalty = self.loss_weights[ + "gradient_penalty" + ] * self.gradient_loss(gradients_fake) + + # drift loss + all_pred = tf.concat([pred_fake, 
pred_real], axis=0) + drift_loss = self.loss_weights["drift"] * tf.reduce_mean(all_pred**2) + + d_loss = loss_fake + loss_real + gradient_penalty + drift_loss + + gradients = total_tape.gradient( + d_loss, self.discriminator.trainable_weights + ) + self.d_optimizer.apply_gradients( + zip(gradients, self.discriminator.trainable_weights) + ) + + # Update metrics + self.d_loss_metric.update_state(d_loss) + self.g_loss_metric.update_state(g_loss) + return { + "d_loss": self.d_loss_metric.result(), + "g_loss": self.g_loss_metric.result(), + } + + def call(self, inputs: dict()): + style_code = inputs.get("style_code", None) + z = inputs.get("z", None) + noise = inputs.get("noise", None) + batch_size = inputs.get("batch_size", 1) + alpha = inputs.get("alpha", 1.0) + alpha = tf.expand_dims(alpha, 0) + if style_code is None: + if z is None: + z = tf.random.normal((batch_size, self.z_dim)) + style_code = self.mapping(z) + + if noise is None: + noise = self.generate_noise(batch_size) + + # self.alpha.assign(alpha) + + const_input = tf.ones(tuple([batch_size] + list(self.g_input_shape))) + images = self.generator([const_input, style_code, noise, alpha]) + images = np.clip((images * 0.5 + 0.5) * 255, 0, 255).astype(np.uint8) + + return images + + +""" +## Training + +We first build the StyleGAN at smallest resolution, such as 4x4 or 8x8. Then we +progressively grow the model to higher resolution by appending new generator and +discriminator blocks. +""" + +START_RES = 4 +TARGET_RES = 128 + +style_gan = StyleGAN(start_res=START_RES, target_res=TARGET_RES) + +""" +The training for each new resolution happens in two phases - "transition" and "stable". +In the transition phase, the features from the previous resolution are mixed with the +current resolution. This allows for a smoother transition when scaling up. We use each +epoch in `model.fit()` as a phase. +""" + + +def train( + start_res=START_RES, + target_res=TARGET_RES, + steps_per_epoch=5000, + display_images=True, +): + opt_cfg = {"learning_rate": 1e-3, "beta_1": 0.0, "beta_2": 0.99, "epsilon": 1e-8} + + val_batch_size = 16 + val_z = tf.random.normal((val_batch_size, style_gan.z_dim)) + val_noise = style_gan.generate_noise(val_batch_size) + + start_res_log2 = int(np.log2(start_res)) + target_res_log2 = int(np.log2(target_res)) + + for res_log2 in range(start_res_log2, target_res_log2 + 1): + res = 2**res_log2 + for phase in ["TRANSITION", "STABLE"]: + if res == start_res and phase == "TRANSITION": + continue + + train_dl = create_dataloader(res) + + steps = int(train_step_ratio[res_log2] * steps_per_epoch) + + style_gan.compile( + d_optimizer=tf.keras.optimizers.legacy.Adam(**opt_cfg), + g_optimizer=tf.keras.optimizers.legacy.Adam(**opt_cfg), + loss_weights={"gradient_penalty": 10, "drift": 0.001}, + steps_per_epoch=steps, + res=res, + phase=phase, + run_eagerly=False, + ) + + prefix = f"res_{res}x{res}_{style_gan.phase}" + + ckpt_cb = keras.callbacks.ModelCheckpoint( + f"checkpoints/stylegan_{res}x{res}.ckpt", + save_weights_only=True, + verbose=0, + ) + print(phase) + style_gan.fit( + train_dl, epochs=1, steps_per_epoch=steps, callbacks=[ckpt_cb] + ) + + if display_images: + images = style_gan({"z": val_z, "noise": val_noise, "alpha": 1.0}) + plot_images(images, res_log2) + + +""" +StyleGAN can take a long time to train, in the code below, a small `steps_per_epoch` +value of 1 is used to sanity-check the code is working alright. In practice, a larger +`steps_per_epoch` value (over 10000) +is required to get decent results. 
+""" + +train(start_res=4, target_res=16, steps_per_epoch=1, display_images=False) + +""" +## Results + +We can now run some inference using pre-trained 64x64 checkpoints. In general, the image +fidelity increases with the resolution. You can try to train this StyleGAN to resolutions +above 128x128 with the CelebA HQ dataset. +""" + +url = "https://github.com/soon-yau/stylegan_keras/releases/download/keras_example_v1.0/stylegan_128x128.ckpt.zip" + +weights_path = keras.utils.get_file( + "stylegan_128x128.ckpt.zip", + url, + extract=True, + cache_dir=os.path.abspath("."), + cache_subdir="pretrained", +) + +style_gan.grow_model(128) +style_gan.load_weights(os.path.join("pretrained/stylegan_128x128.ckpt")) + +tf.random.set_seed(196) +batch_size = 2 +z = tf.random.normal((batch_size, style_gan.z_dim)) +w = style_gan.mapping(z) +noise = style_gan.generate_noise(batch_size=batch_size) +images = style_gan({"style_code": w, "noise": noise, "alpha": 1.0}) +plot_images(images, 5) + +""" +## Style Mixing + +We can also mix styles from two images to create a new image. +""" + +alpha = 0.4 +w_mix = np.expand_dims(alpha * w[0] + (1 - alpha) * w[1], 0) +noise_a = [np.expand_dims(n[0], 0) for n in noise] +mix_images = style_gan({"style_code": w_mix, "noise": noise_a}) +image_row = np.hstack([images[0], images[1], mix_images[0]]) +plt.figure(figsize=(9, 3)) +plt.imshow(image_row) +plt.axis("off") diff --git a/knowledge_base/generative/text_generation_fnet.py b/knowledge_base/generative/text_generation_fnet.py new file mode 100644 index 0000000000000000000000000000000000000000..1eca13c9efc8738771ebcdef6549f2adf52c7bb5 --- /dev/null +++ b/knowledge_base/generative/text_generation_fnet.py @@ -0,0 +1,387 @@ +""" +Title: Text Generation using FNet +Author: [Darshan Deshpande](https://twitter.com/getdarshan) +Date created: 2021/10/05 +Last modified: 2021/10/05 +Description: FNet transformer for text generation in Keras. +Accelerator: GPU +""" + +""" +## Introduction + +The original transformer implementation (Vaswani et al., 2017) was one of the major +breakthroughs in Natural Language Processing, giving rise to important architectures such BERT and GPT. +However, the drawback of these architectures is +that the self-attention mechanism they use is computationally expensive. The FNet +architecture proposes to replace this self-attention attention with a leaner mechanism: +a Fourier transformation-based linear mixer for input tokens. + +The FNet model was able to achieve 92-97% of BERT's accuracy while training 80% faster on +GPUs and almost 70% faster on TPUs. This type of design provides an efficient and small +model size, leading to faster inference times. + +In this example, we will implement and train this architecture on the Cornell Movie +Dialog corpus to show the applicability of this model to text generation. +""" + +""" +## Imports +""" + +import tensorflow as tf +from tensorflow import keras +from tensorflow.keras import layers +import os + +# Defining hyperparameters + +VOCAB_SIZE = 8192 +MAX_SAMPLES = 50000 +BUFFER_SIZE = 20000 +MAX_LENGTH = 40 +EMBED_DIM = 256 +LATENT_DIM = 512 +NUM_HEADS = 8 +BATCH_SIZE = 64 + +""" +## Loading data + +We will be using the Cornell Dialog Corpus. We will parse the movie conversations into +questions and answers sets. 
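+
+Each record in the corpus files uses ` +++$+++ ` as a field separator. As a
+minimal sketch of the parsing step (the sample line below is made up purely for
+illustration; the real files are downloaded and parsed right after):
+"""
+
+sample_line = "L1045 +++$+++ u0 +++$+++ m0 +++$+++ BIANCA +++$+++ They do not!"
+parts = sample_line.split(" +++$+++ ")
+# Field 0 is the line id and field 4 is the utterance text.
+print(parts[0], "->", parts[4])
+
+"""
+We now download the corpus and build the question/answer pairs.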
+""" + +path_to_zip = keras.utils.get_file( + "cornell_movie_dialogs.zip", + origin="http://www.cs.cornell.edu/~cristian/data/cornell_movie_dialogs_corpus.zip", + extract=True, +) + +path_to_dataset = os.path.join( + os.path.dirname(path_to_zip), "cornell movie-dialogs corpus" +) +path_to_movie_lines = os.path.join(path_to_dataset, "movie_lines.txt") +path_to_movie_conversations = os.path.join(path_to_dataset, "movie_conversations.txt") + + +def load_conversations(): + # Helper function for loading the conversation splits + id2line = {} + with open(path_to_movie_lines, errors="ignore") as file: + lines = file.readlines() + for line in lines: + parts = line.replace("\n", "").split(" +++$+++ ") + id2line[parts[0]] = parts[4] + + inputs, outputs = [], [] + with open(path_to_movie_conversations, "r") as file: + lines = file.readlines() + for line in lines: + parts = line.replace("\n", "").split(" +++$+++ ") + # get conversation in a list of line ID + conversation = [line[1:-1] for line in parts[3][1:-1].split(", ")] + for i in range(len(conversation) - 1): + inputs.append(id2line[conversation[i]]) + outputs.append(id2line[conversation[i + 1]]) + if len(inputs) >= MAX_SAMPLES: + return inputs, outputs + return inputs, outputs + + +questions, answers = load_conversations() + +# Splitting training and validation sets + +train_dataset = tf.data.Dataset.from_tensor_slices((questions[:40000], answers[:40000])) +val_dataset = tf.data.Dataset.from_tensor_slices((questions[40000:], answers[40000:])) + +""" +### Preprocessing and Tokenization +""" + + +def preprocess_text(sentence): + sentence = tf.strings.lower(sentence) + # Adding a space between the punctuation and the last word to allow better tokenization + sentence = tf.strings.regex_replace(sentence, r"([?.!,])", r" \1 ") + # Replacing multiple continuous spaces with a single space + sentence = tf.strings.regex_replace(sentence, r"\s\s+", " ") + # Replacing non english words with spaces + sentence = tf.strings.regex_replace(sentence, r"[^a-z?.!,]+", " ") + sentence = tf.strings.strip(sentence) + sentence = tf.strings.join(["[start]", sentence, "[end]"], separator=" ") + return sentence + + +vectorizer = layers.TextVectorization( + VOCAB_SIZE, + standardize=preprocess_text, + output_mode="int", + output_sequence_length=MAX_LENGTH, +) + +# We will adapt the vectorizer to both the questions and answers +# This dataset is batched to parallelize and speed up the process +vectorizer.adapt(tf.data.Dataset.from_tensor_slices((questions + answers)).batch(128)) + +""" +### Tokenizing and padding sentences using `TextVectorization` +""" + + +def vectorize_text(inputs, outputs): + inputs, outputs = vectorizer(inputs), vectorizer(outputs) + # One extra padding token to the right to match the output shape + outputs = tf.pad(outputs, [[0, 1]]) + return ( + {"encoder_inputs": inputs, "decoder_inputs": outputs[:-1]}, + {"outputs": outputs[1:]}, + ) + + +train_dataset = train_dataset.map(vectorize_text, num_parallel_calls=tf.data.AUTOTUNE) +val_dataset = val_dataset.map(vectorize_text, num_parallel_calls=tf.data.AUTOTUNE) + +train_dataset = ( + train_dataset.cache() + .shuffle(BUFFER_SIZE) + .batch(BATCH_SIZE) + .prefetch(tf.data.AUTOTUNE) +) +val_dataset = val_dataset.cache().batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE) + +""" +## Creating the FNet Encoder + +The FNet paper proposes a replacement for the standard attention mechanism used by the +Transformer architecture (Vaswani et al., 2017). 
+ +![Architecture](https://i.imgur.com/rLg47qU.png) + +The outputs of the FFT layer are complex numbers. To avoid dealing with complex layers, +only the real part (the magnitude) is extracted. + +The dense layers that follow the Fourier transformation act as convolutions applied on +the frequency domain. +""" + + +class FNetEncoder(layers.Layer): + def __init__(self, embed_dim, dense_dim, **kwargs): + super().__init__(**kwargs) + self.embed_dim = embed_dim + self.dense_dim = dense_dim + self.dense_proj = keras.Sequential( + [ + layers.Dense(dense_dim, activation="relu"), + layers.Dense(embed_dim), + ] + ) + self.layernorm_1 = layers.LayerNormalization() + self.layernorm_2 = layers.LayerNormalization() + + def call(self, inputs): + # Casting the inputs to complex64 + inp_complex = tf.cast(inputs, tf.complex64) + # Projecting the inputs to the frequency domain using FFT2D and + # extracting the real part of the output + fft = tf.math.real(tf.signal.fft2d(inp_complex)) + proj_input = self.layernorm_1(inputs + fft) + proj_output = self.dense_proj(proj_input) + return self.layernorm_2(proj_input + proj_output) + + +""" +## Creating the Decoder + +The decoder architecture remains the same as the one proposed by (Vaswani et al., 2017) +in the original transformer architecture, consisting of an embedding, positional +encoding, two masked multi-head attention layers and finally the dense output layers. +The architecture that follows is taken from +[Deep Learning with Python, second edition, chapter 11](https://www.manning.com/books/deep-learning-with-python-second-edition). + +""" + + +class PositionalEmbedding(layers.Layer): + def __init__(self, sequence_length, vocab_size, embed_dim, **kwargs): + super().__init__(**kwargs) + self.token_embeddings = layers.Embedding( + input_dim=vocab_size, output_dim=embed_dim + ) + self.position_embeddings = layers.Embedding( + input_dim=sequence_length, output_dim=embed_dim + ) + self.sequence_length = sequence_length + self.vocab_size = vocab_size + self.embed_dim = embed_dim + + def call(self, inputs): + length = tf.shape(inputs)[-1] + positions = tf.range(start=0, limit=length, delta=1) + embedded_tokens = self.token_embeddings(inputs) + embedded_positions = self.position_embeddings(positions) + return embedded_tokens + embedded_positions + + def compute_mask(self, inputs, mask=None): + return tf.math.not_equal(inputs, 0) + + +class FNetDecoder(layers.Layer): + def __init__(self, embed_dim, latent_dim, num_heads, **kwargs): + super().__init__(**kwargs) + self.embed_dim = embed_dim + self.latent_dim = latent_dim + self.num_heads = num_heads + self.attention_1 = layers.MultiHeadAttention( + num_heads=num_heads, key_dim=embed_dim + ) + self.attention_2 = layers.MultiHeadAttention( + num_heads=num_heads, key_dim=embed_dim + ) + self.dense_proj = keras.Sequential( + [ + layers.Dense(latent_dim, activation="relu"), + layers.Dense(embed_dim), + ] + ) + self.layernorm_1 = layers.LayerNormalization() + self.layernorm_2 = layers.LayerNormalization() + self.layernorm_3 = layers.LayerNormalization() + self.supports_masking = True + + def call(self, inputs, encoder_outputs, mask=None): + causal_mask = self.get_causal_attention_mask(inputs) + if mask is not None: + padding_mask = tf.cast(mask[:, tf.newaxis, :], dtype="int32") + padding_mask = tf.minimum(padding_mask, causal_mask) + + attention_output_1 = self.attention_1( + query=inputs, value=inputs, key=inputs, attention_mask=causal_mask + ) + out_1 = self.layernorm_1(inputs + attention_output_1) + + attention_output_2 
= self.attention_2( + query=out_1, + value=encoder_outputs, + key=encoder_outputs, + attention_mask=padding_mask, + ) + out_2 = self.layernorm_2(out_1 + attention_output_2) + + proj_output = self.dense_proj(out_2) + return self.layernorm_3(out_2 + proj_output) + + def get_causal_attention_mask(self, inputs): + input_shape = tf.shape(inputs) + batch_size, sequence_length = input_shape[0], input_shape[1] + i = tf.range(sequence_length)[:, tf.newaxis] + j = tf.range(sequence_length) + mask = tf.cast(i >= j, dtype="int32") + mask = tf.reshape(mask, (1, input_shape[1], input_shape[1])) + mult = tf.concat( + [tf.expand_dims(batch_size, -1), tf.constant([1, 1], dtype=tf.int32)], + axis=0, + ) + return tf.tile(mask, mult) + + +def create_model(): + encoder_inputs = keras.Input(shape=(None,), dtype="int32", name="encoder_inputs") + x = PositionalEmbedding(MAX_LENGTH, VOCAB_SIZE, EMBED_DIM)(encoder_inputs) + encoder_outputs = FNetEncoder(EMBED_DIM, LATENT_DIM)(x) + encoder = keras.Model(encoder_inputs, encoder_outputs) + decoder_inputs = keras.Input(shape=(None,), dtype="int32", name="decoder_inputs") + encoded_seq_inputs = keras.Input( + shape=(None, EMBED_DIM), name="decoder_state_inputs" + ) + x = PositionalEmbedding(MAX_LENGTH, VOCAB_SIZE, EMBED_DIM)(decoder_inputs) + x = FNetDecoder(EMBED_DIM, LATENT_DIM, NUM_HEADS)(x, encoded_seq_inputs) + x = layers.Dropout(0.5)(x) + decoder_outputs = layers.Dense(VOCAB_SIZE, activation="softmax")(x) + decoder = keras.Model( + [decoder_inputs, encoded_seq_inputs], decoder_outputs, name="outputs" + ) + decoder_outputs = decoder([decoder_inputs, encoder_outputs]) + fnet = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs, name="fnet") + return fnet + + +""" +## Creating and Training the model +""" + +fnet = create_model() +fnet.compile("adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]) + +""" +Here, the `epochs` parameter is set to a single epoch, but in practice the model will take around +**20-30 epochs** of training to start outputting comprehensible sentences. Although accuracy +is not a good measure for this task, we will use it just to get a hint of the improvement +of the network. +""" + +fnet.fit(train_dataset, epochs=1, validation_data=val_dataset) + +""" +## Performing inference +""" + +VOCAB = vectorizer.get_vocabulary() + + +def decode_sentence(input_sentence): + # Mapping the input sentence to tokens and adding start and end tokens + tokenized_input_sentence = vectorizer( + tf.constant("[start] " + preprocess_text(input_sentence) + " [end]") + ) + # Initializing the initial sentence consisting of only the start token. 
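+    # Decoding then proceeds greedily: at each step the tokens generated so far
+    # are padded out to MAX_LENGTH, the prediction at the current position is
+    # reduced with argmax, and generation stops once the "[end]" token appears.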
+ tokenized_target_sentence = tf.expand_dims(VOCAB.index("[start]"), 0) + decoded_sentence = "" + + for i in range(MAX_LENGTH): + # Get the predictions + predictions = fnet.predict( + { + "encoder_inputs": tf.expand_dims(tokenized_input_sentence, 0), + "decoder_inputs": tf.expand_dims( + tf.pad( + tokenized_target_sentence, + [[0, MAX_LENGTH - tf.shape(tokenized_target_sentence)[0]]], + ), + 0, + ), + } + ) + # Calculating the token with maximum probability and getting the corresponding word + sampled_token_index = tf.argmax(predictions[0, i, :]) + sampled_token = VOCAB[sampled_token_index.numpy()] + # If sampled token is the end token then stop generating and return the sentence + if tf.equal(sampled_token_index, VOCAB.index("[end]")): + break + decoded_sentence += sampled_token + " " + tokenized_target_sentence = tf.concat( + [tokenized_target_sentence, [sampled_token_index]], 0 + ) + + return decoded_sentence + + +decode_sentence("Where have you been all this time?") + +""" +## Conclusion + +This example shows how to train and perform inference using the FNet model. +For getting insight into the architecture or for further reading, you can refer to: + +1. [FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824v3) +(Lee-Thorp et al., 2021) +2. [Attention Is All You Need](https://arxiv.org/abs/1706.03762v5) (Vaswani et al., +2017) + +Thanks to Franรงois Chollet for his Keras example on +[English-to-Spanish translation with a sequence-to-sequence Transformer](https://keras.io/examples/nlp/neural_machine_translation_with_transformer/) +from which the decoder implementation was extracted. +""" diff --git a/knowledge_base/generative/text_generation_gpt.py b/knowledge_base/generative/text_generation_gpt.py new file mode 100644 index 0000000000000000000000000000000000000000..3b63ab760c1d5319afd1e40142acb67e938be265 --- /dev/null +++ b/knowledge_base/generative/text_generation_gpt.py @@ -0,0 +1,422 @@ +""" +Title: GPT text generation from scratch with KerasHub +Author: [Jesse Chan](https://github.com/jessechancy) +Date created: 2022/07/25 +Last modified: 2022/07/25 +Description: Using KerasHub to train a mini-GPT model for text generation. +Accelerator: GPU +""" + +""" +## Introduction + +In this example, we will use KerasHub to build a scaled down Generative +Pre-Trained (GPT) model. GPT is a Transformer-based model that allows you to generate +sophisticated text from a prompt. + +We will train the model on the [simplebooks-92](https://arxiv.org/abs/1911.12391) corpus, +which is a dataset made from several novels. It is a good dataset for this example since +it has a small vocabulary and high word frequency, which is beneficial when training a +model with few parameters. + +This example combines concepts from +[Text generation with a miniature GPT](https://keras.io/examples/generative/text_generation_with_miniature_gpt/) +with KerasHub abstractions. We will demonstrate how KerasHub tokenization, layers and +metrics simplify the training +process, and then show how to generate output text using the KerasHub sampling utilities. + +Note: If you are running this example on a Colab, +make sure to enable GPU runtime for faster training. + +This example requires KerasHub. You can install it via the following command: +`pip install keras-hub` +""" + +""" +## Setup +""" + +"""shell +pip install -q --upgrade keras-hub +pip install -q --upgrade keras # Upgrade to Keras 3. 
+""" + +import os +import keras_hub +import keras + +import tensorflow.data as tf_data +import tensorflow.strings as tf_strings + +""" +## Settings & hyperparameters +""" + +# Data +BATCH_SIZE = 64 +MIN_STRING_LEN = 512 # Strings shorter than this will be discarded +SEQ_LEN = 128 # Length of training sequences, in tokens + +# Model +EMBED_DIM = 256 +FEED_FORWARD_DIM = 128 +NUM_HEADS = 3 +NUM_LAYERS = 2 +VOCAB_SIZE = 5000 # Limits parameters in model. + +# Training +EPOCHS = 5 + +# Inference +NUM_TOKENS_TO_GENERATE = 80 + +""" +## Load the data + +Now, let's download the dataset! The SimpleBooks dataset consists of 1,573 Gutenberg books, and has +one of the smallest vocabulary size to word-level tokens ratio. It has a vocabulary size of ~98k, +a third of WikiText-103's, with around the same number of tokens (~100M). This makes it easy to fit a small model. +""" + +keras.utils.get_file( + origin="https://dldata-public.s3.us-east-2.amazonaws.com/simplebooks.zip", + extract=True, +) +dir = os.path.expanduser("~/.keras/datasets/simplebooks/") + +# Load simplebooks-92 train set and filter out short lines. +raw_train_ds = ( + tf_data.TextLineDataset(dir + "simplebooks-92-raw/train.txt") + .filter(lambda x: tf_strings.length(x) > MIN_STRING_LEN) + .batch(BATCH_SIZE) + .shuffle(buffer_size=256) +) + +# Load simplebooks-92 validation set and filter out short lines. +raw_val_ds = ( + tf_data.TextLineDataset(dir + "simplebooks-92-raw/valid.txt") + .filter(lambda x: tf_strings.length(x) > MIN_STRING_LEN) + .batch(BATCH_SIZE) +) + +""" +## Train the tokenizer + +We train the tokenizer from the training dataset for a vocabulary size of `VOCAB_SIZE`, +which is a tuned hyperparameter. We want to limit the vocabulary as much as possible, as +we will see later on +that it has a large effect on the number of model parameters. We also don't want to include +*too few* vocabulary terms, or there would be too many out-of-vocabulary (OOV) sub-words. In +addition, three tokens are reserved in the vocabulary: + +- `"[PAD]"` for padding sequences to `SEQ_LEN`. This token has index 0 in both +`reserved_tokens` and `vocab`, since `WordPieceTokenizer` (and other layers) consider +`0`/`vocab[0]` as the default padding. +- `"[UNK]"` for OOV sub-words, which should match the default `oov_token="[UNK]"` in +`WordPieceTokenizer`. +- `"[BOS]"` stands for beginning of sentence, but here technically it is a token +representing the beginning of each line of training data. +""" + +# Train tokenizer vocabulary +vocab = keras_hub.tokenizers.compute_word_piece_vocabulary( + raw_train_ds, + vocabulary_size=VOCAB_SIZE, + lowercase=True, + reserved_tokens=["[PAD]", "[UNK]", "[BOS]"], +) + +""" +## Load tokenizer + +We use the vocabulary data to initialize +`keras_hub.tokenizers.WordPieceTokenizer`. WordPieceTokenizer is an efficient +implementation of the WordPiece algorithm used by BERT and other models. It will strip, +lower-case and do other irreversible preprocessing operations. +""" + +tokenizer = keras_hub.tokenizers.WordPieceTokenizer( + vocabulary=vocab, + sequence_length=SEQ_LEN, + lowercase=True, +) + +""" +## Tokenize data + +We preprocess the dataset by tokenizing and splitting it into `features` and `labels`. 
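+
+Concretely, the packer prepends the `"[BOS]"` token to each tokenized line, so
+the label at position `i` is exactly the token that follows the feature at
+position `i`. A toy sketch of that alignment (the token ids below are made up
+and padding is simplified):
+"""
+
+SEQ = 5  # stands in for SEQ_LEN
+bos, pad = 2, 0  # hypothetical ids for "[BOS]" and "[PAD]"
+toy_tokens = [17, 5, 93, 4]  # hypothetical token ids for one line of text
+toy_labels = (toy_tokens + [pad] * SEQ)[:SEQ]  # what the tokenizer emits
+toy_features = ([bos] + toy_tokens + [pad] * SEQ)[:SEQ]  # what the packer emits
+for i in range(SEQ):
+    print(f"input {toy_features[i]:>2} -> target {toy_labels[i]:>2}")
+
+"""
+The real pipeline below does the same thing with the `WordPieceTokenizer` and
+`StartEndPacker` layers.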
+""" + +# packer adds a start token +start_packer = keras_hub.layers.StartEndPacker( + sequence_length=SEQ_LEN, + start_value=tokenizer.token_to_id("[BOS]"), +) + + +def preprocess(inputs): + outputs = tokenizer(inputs) + features = start_packer(outputs) + labels = outputs + return features, labels + + +# Tokenize and split into train and label sequences. +train_ds = raw_train_ds.map(preprocess, num_parallel_calls=tf_data.AUTOTUNE).prefetch( + tf_data.AUTOTUNE +) +val_ds = raw_val_ds.map(preprocess, num_parallel_calls=tf_data.AUTOTUNE).prefetch( + tf_data.AUTOTUNE +) + +""" +## Build the model + +We create our scaled down GPT model with the following layers: + +- One `keras_hub.layers.TokenAndPositionEmbedding` layer, which combines the embedding +for the token and its position. +- Multiple `keras_hub.layers.TransformerDecoder` layers, with the default causal masking. +The layer has no cross-attention when run with decoder sequence only. +- One final dense linear layer +""" + +inputs = keras.layers.Input(shape=(None,), dtype="int32") +# Embedding. +embedding_layer = keras_hub.layers.TokenAndPositionEmbedding( + vocabulary_size=VOCAB_SIZE, + sequence_length=SEQ_LEN, + embedding_dim=EMBED_DIM, + mask_zero=True, +) +x = embedding_layer(inputs) +# Transformer decoders. +for _ in range(NUM_LAYERS): + decoder_layer = keras_hub.layers.TransformerDecoder( + num_heads=NUM_HEADS, + intermediate_dim=FEED_FORWARD_DIM, + ) + x = decoder_layer(x) # Giving one argument only skips cross-attention. +# Output. +outputs = keras.layers.Dense(VOCAB_SIZE)(x) +model = keras.Model(inputs=inputs, outputs=outputs) +loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True) +perplexity = keras_hub.metrics.Perplexity(from_logits=True, mask_token_id=0) +model.compile(optimizer="adam", loss=loss_fn, metrics=[perplexity]) + +""" +Let's take a look at our model summary - a large majority of the +parameters are in the `token_and_position_embedding` and the output `dense` layer! +This means that the vocabulary size (`VOCAB_SIZE`) has a large effect on the size of the model, +while the number of Transformer decoder layers (`NUM_LAYERS`) doesn't affect it as much. +""" + +model.summary() + +""" +## Training + +Now that we have our model, let's train it with the `fit()` method. +""" + +model.fit(train_ds, validation_data=val_ds, epochs=EPOCHS) + +""" +## Inference + +With our trained model, we can test it out to gauge its performance. To do this +we can seed our model with an input sequence starting with the `"[BOS]"` token, +and progressively sample the model by making predictions for each subsequent +token in a loop. + +To start lets build a prompt with the same shape as our model inputs, containing +only the `"[BOS]"` token. +""" + +# The "packer" layers adds the [BOS] token for us. +prompt_tokens = start_packer(tokenizer([""])) +prompt_tokens + +""" +We will use the `keras_hub.samplers` module for inference, which requires a +callback function wrapping the model we just trained. This wrapper calls +the model and returns the logit predictions for the current token we are +generating. + +Note: There are two pieces of more advanced functionality available when +defining your callback. The first is the ability to take in a `cache` of states +computed in previous generation steps, which can be used to speed up generation. +The second is the ability to output the final dense "hidden state" of each +generated token. 
This is used by `keras_hub.samplers.ContrastiveSampler`, which +avoids repetition by penalizing repeated hidden states. Both are optional, and +we will ignore them for now. +""" + + +def next(prompt, cache, index): + logits = model(prompt)[:, index - 1, :] + # Ignore hidden states for now; only needed for contrastive search. + hidden_states = None + return logits, hidden_states, cache + + +""" +Creating the wrapper function is the most complex part of using these functions. Now that +it's done, let's test out the different utilities, starting with greedy search. +""" + +""" +### Greedy search + +We greedily pick the most probable token at each timestep. In other words, we get the +argmax of the model output. +""" + +sampler = keras_hub.samplers.GreedySampler() +output_tokens = sampler( + next=next, + prompt=prompt_tokens, + index=1, # Start sampling immediately after the [BOS] token. +) +txt = tokenizer.detokenize(output_tokens) +print(f"Greedy search generated text: \n{txt}\n") + +""" +As you can see, greedy search starts out making some sense, but quickly starts repeating +itself. This is a common problem with text generation that can be fixed by some of the +probabilistic text generation utilities shown later on! +""" + +""" +### Beam search + +At a high-level, beam search keeps track of the `num_beams` most probable sequences at +each timestep, and predicts the best next token from all sequences. It is an improvement +over greedy search since it stores more possibilities. However, it is less efficient than +greedy search since it has to compute and store multiple potential sequences. + +**Note:** beam search with `num_beams=1` is identical to greedy search. +""" + +sampler = keras_hub.samplers.BeamSampler(num_beams=10) +output_tokens = sampler( + next=next, + prompt=prompt_tokens, + index=1, +) +txt = tokenizer.detokenize(output_tokens) +print(f"Beam search generated text: \n{txt}\n") + +""" +Similar to greedy search, beam search quickly starts repeating itself, since it is still +a deterministic method. +""" + +""" +### Random search + +Random search is our first probabilistic method. At each time step, it samples the next +token using the softmax probabilities provided by the model. +""" + +sampler = keras_hub.samplers.RandomSampler() +output_tokens = sampler( + next=next, + prompt=prompt_tokens, + index=1, +) +txt = tokenizer.detokenize(output_tokens) +print(f"Random search generated text: \n{txt}\n") + +""" +Voilร , no repetitions! However, with random search, we may see some nonsensical words +appearing since any word in the vocabulary has a chance of appearing with this sampling +method. This is fixed by our next search utility, top-k search. +""" + +""" +### Top-K search + +Similar to random search, we sample the next token from the probability distribution +provided by the model. The only difference is that here, we select out the top `k` most +probable tokens, and distribute the probability mass over them before sampling. This way, +we won't be sampling from low probability tokens, and hence we would have less +nonsensical words! +""" + +sampler = keras_hub.samplers.TopKSampler(k=10) +output_tokens = sampler( + next=next, + prompt=prompt_tokens, + index=1, +) +txt = tokenizer.detokenize(output_tokens) +print(f"Top-K search generated text: \n{txt}\n") + +""" +### Top-P search + +Even with the top-k search, there is something to improve upon. With top-k search, the +number `k` is fixed, which means it selects the same number of tokens for any probability +distribution. 
Consider two scenarios, one where the probability mass is concentrated over +2 words and another where the probability mass is evenly concentrated across 10. Should +we choose `k=2` or `k=10`? There is no one size that fits all `k` here. + +This is where top-p search comes in! Instead of choosing a `k`, we choose a probability +`p` that we want the probabilities of the top tokens to sum up to. This way, we can +dynamically adjust the `k` based on the probability distribution. By setting `p=0.9`, if +90% of the probability mass is concentrated on the top 2 tokens, we can filter out the +top 2 tokens to sample from. If instead the 90% is distributed over 10 tokens, it will +similarly filter out the top 10 tokens to sample from. +""" + +sampler = keras_hub.samplers.TopPSampler(p=0.5) +output_tokens = sampler( + next=next, + prompt=prompt_tokens, + index=1, +) +txt = tokenizer.detokenize(output_tokens) +print(f"Top-P search generated text: \n{txt}\n") + +""" +### Using callbacks for text generation + +We can also wrap the utilities in a callback, which allows you to print out a prediction +sequence for every epoch of the model! Here is an example of a callback for top-k search: +""" + + +class TopKTextGenerator(keras.callbacks.Callback): + """A callback to generate text from a trained model using top-k.""" + + def __init__(self, k): + self.sampler = keras_hub.samplers.TopKSampler(k) + + def on_epoch_end(self, epoch, logs=None): + output_tokens = self.sampler( + next=next, + prompt=prompt_tokens, + index=1, + ) + txt = tokenizer.detokenize(output_tokens) + print(f"Top-K search generated text: \n{txt}\n") + + +text_generation_callback = TopKTextGenerator(k=10) +# Dummy training loop to demonstrate callback. +model.fit(train_ds.take(1), verbose=2, epochs=2, callbacks=[text_generation_callback]) + +""" +## Conclusion + +To recap, in this example, we use KerasHub layers to train a sub-word vocabulary, +tokenize training data, create a miniature GPT model, and perform inference with the +text generation library. + +If you would like to understand how Transformers work, or learn more about training the +full GPT model, here are some further readings: + +- Attention Is All You Need [Vaswani et al., 2017](https://arxiv.org/abs/1706.03762) +- GPT-3 Paper [Brown et al., 2020](https://arxiv.org/abs/2005.14165) +""" diff --git a/knowledge_base/generative/text_generation_with_miniature_gpt.py b/knowledge_base/generative/text_generation_with_miniature_gpt.py new file mode 100644 index 0000000000000000000000000000000000000000..ca34ea03d412f54f365aab8875442a1b41f8fc6e --- /dev/null +++ b/knowledge_base/generative/text_generation_with_miniature_gpt.py @@ -0,0 +1,316 @@ +""" +Title: Text generation with a miniature GPT +Author: [Apoorv Nandan](https://twitter.com/NandanApoorv) +Date created: 2020/05/29 +Last modified: 2020/05/29 +Description: Implement a miniature version of GPT and train it to generate text. +Accelerator: GPU +""" + +""" +## Introduction + +This example demonstrates how to implement an autoregressive language model +using a miniature version of the GPT model. +The model consists of a single Transformer block with causal masking +in its attention layer. +We use the text from the IMDB sentiment classification dataset for training +and generate new movie reviews for a given prompt. +When using this script with your own dataset, make sure it has at least +1 million words. + +This example should be run with `tf-nightly>=2.3.0-dev20200531` or +with TensorFlow 2.3 or higher. 
+ +**References:** + +- [GPT](https://www.semanticscholar.org/paper/Improving-Language-Understanding-by-Generative-Radford/cd18800a0fe0b668a1cc19f2ec95b5003d0a5035) +- [GPT-2](https://www.semanticscholar.org/paper/Language-Models-are-Unsupervised-Multitask-Learners-Radford-Wu/9405cc0d6169988371b2755e573cc28650d14dfe) +- [GPT-3](https://arxiv.org/abs/2005.14165) +""" +""" +## Setup +""" +# We set the backend to TensorFlow. The code works with +# both `tensorflow` and `torch`. It does not work with JAX +# due to the behavior of `jax.numpy.tile` in a jit scope +# (used in `causal_attention_mask()`: `tile` in JAX does +# not support a dynamic `reps` argument. +# You can make the code work in JAX by wrapping the +# inside of the `causal_attention_mask` function in +# a decorator to prevent jit compilation: +# `with jax.ensure_compile_time_eval():`. +import os + +os.environ["KERAS_BACKEND"] = "tensorflow" + +import keras +from keras import layers +from keras import ops +from keras.layers import TextVectorization +import numpy as np +import os +import string +import random +import tensorflow +import tensorflow.data as tf_data +import tensorflow.strings as tf_strings + + +""" +## Implement a Transformer block as a layer +""" + + +def causal_attention_mask(batch_size, n_dest, n_src, dtype): + """ + Mask the upper half of the dot product matrix in self attention. + This prevents flow of information from future tokens to current token. + 1's in the lower triangle, counting from the lower right corner. + """ + i = ops.arange(n_dest)[:, None] + j = ops.arange(n_src) + m = i >= j - n_src + n_dest + mask = ops.cast(m, dtype) + mask = ops.reshape(mask, [1, n_dest, n_src]) + mult = ops.concatenate( + [ops.expand_dims(batch_size, -1), ops.convert_to_tensor([1, 1])], 0 + ) + return ops.tile(mask, mult) + + +class TransformerBlock(layers.Layer): + def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1): + super().__init__() + self.att = layers.MultiHeadAttention(num_heads, embed_dim) + self.ffn = keras.Sequential( + [ + layers.Dense(ff_dim, activation="relu"), + layers.Dense(embed_dim), + ] + ) + self.layernorm1 = layers.LayerNormalization(epsilon=1e-6) + self.layernorm2 = layers.LayerNormalization(epsilon=1e-6) + self.dropout1 = layers.Dropout(rate) + self.dropout2 = layers.Dropout(rate) + + def call(self, inputs): + input_shape = ops.shape(inputs) + batch_size = input_shape[0] + seq_len = input_shape[1] + causal_mask = causal_attention_mask(batch_size, seq_len, seq_len, "bool") + attention_output = self.att(inputs, inputs, attention_mask=causal_mask) + attention_output = self.dropout1(attention_output) + out1 = self.layernorm1(inputs + attention_output) + ffn_output = self.ffn(out1) + ffn_output = self.dropout2(ffn_output) + return self.layernorm2(out1 + ffn_output) + + +""" +## Implement an embedding layer + +Create two separate embedding layers: one for tokens and one for token index +(positions). 
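+
+A quick shape note (illustrative only, using NumPy instead of the model's
+tensors): token embeddings have shape `(batch, seq, embed_dim)` while position
+embeddings have shape `(seq, embed_dim)`, so adding them broadcasts the same
+positional vectors across every sequence in the batch.
+"""
+
+import numpy as np
+
+toy_batch, toy_seq, toy_dim = 2, 4, 8  # toy sizes, not the example's settings
+tok = np.zeros((toy_batch, toy_seq, toy_dim))
+pos = np.ones((toy_seq, toy_dim))
+print((tok + pos).shape)  # (2, 4, 8)
+
+"""
+The layer below implements this with two Keras `Embedding` layers.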
+""" + + +class TokenAndPositionEmbedding(layers.Layer): + def __init__(self, maxlen, vocab_size, embed_dim): + super().__init__() + self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim) + self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim) + + def call(self, x): + maxlen = ops.shape(x)[-1] + positions = ops.arange(0, maxlen, 1) + positions = self.pos_emb(positions) + x = self.token_emb(x) + return x + positions + + +""" +## Implement the miniature GPT model +""" +vocab_size = 20000 # Only consider the top 20k words +maxlen = 80 # Max sequence size +embed_dim = 256 # Embedding size for each token +num_heads = 2 # Number of attention heads +feed_forward_dim = 256 # Hidden layer size in feed forward network inside transformer + + +def create_model(): + inputs = layers.Input(shape=(maxlen,), dtype="int32") + embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim) + x = embedding_layer(inputs) + transformer_block = TransformerBlock(embed_dim, num_heads, feed_forward_dim) + x = transformer_block(x) + outputs = layers.Dense(vocab_size)(x) + model = keras.Model(inputs=inputs, outputs=[outputs, x]) + loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True) + model.compile( + "adam", + loss=[loss_fn, None], + ) # No loss and optimization based on word embeddings from transformer block + return model + + +""" +## Prepare the data for word-level language modelling + +Download the IMDB dataset and combine training and validation sets for a text +generation task. +""" + +"""shell +curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz +tar -xf aclImdb_v1.tar.gz +""" + + +batch_size = 128 + +# The dataset contains each review in a separate text file +# The text files are present in four different folders +# Create a list all files +filenames = [] +directories = [ + "aclImdb/train/pos", + "aclImdb/train/neg", + "aclImdb/test/pos", + "aclImdb/test/neg", +] +for dir in directories: + for f in os.listdir(dir): + filenames.append(os.path.join(dir, f)) + +print(f"{len(filenames)} files") + +# Create a dataset from text files +random.shuffle(filenames) +text_ds = tf_data.TextLineDataset(filenames) +text_ds = text_ds.shuffle(buffer_size=256) +text_ds = text_ds.batch(batch_size) + + +def custom_standardization(input_string): + """Remove html line-break tags and handle punctuation""" + lowercased = tf_strings.lower(input_string) + stripped_html = tf_strings.regex_replace(lowercased, "
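<br />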
", " ") + return tf_strings.regex_replace(stripped_html, f"([{string.punctuation}])", r" \1") + + +# Create a vectorization layer and adapt it to the text +vectorize_layer = TextVectorization( + standardize=custom_standardization, + max_tokens=vocab_size - 1, + output_mode="int", + output_sequence_length=maxlen + 1, +) +vectorize_layer.adapt(text_ds) +vocab = vectorize_layer.get_vocabulary() # To get words back from token indices + + +def prepare_lm_inputs_labels(text): + """ + Shift word sequences by 1 position so that the target for position (i) is + word at position (i+1). The model will use all words up till position (i) + to predict the next word. + """ + text = tensorflow.expand_dims(text, -1) + tokenized_sentences = vectorize_layer(text) + x = tokenized_sentences[:, :-1] + y = tokenized_sentences[:, 1:] + return x, y + + +text_ds = text_ds.map(prepare_lm_inputs_labels, num_parallel_calls=tf_data.AUTOTUNE) +text_ds = text_ds.prefetch(tf_data.AUTOTUNE) + + +""" +## Implement a Keras callback for generating text +""" + + +class TextGenerator(keras.callbacks.Callback): + """A callback to generate text from a trained model. + 1. Feed some starting prompt to the model + 2. Predict probabilities for the next token + 3. Sample the next token and add it to the next input + + Arguments: + max_tokens: Integer, the number of tokens to be generated after prompt. + start_tokens: List of integers, the token indices for the starting prompt. + index_to_word: List of strings, obtained from the TextVectorization layer. + top_k: Integer, sample from the `top_k` token predictions. + print_every: Integer, print after this many epochs. + """ + + def __init__( + self, max_tokens, start_tokens, index_to_word, top_k=10, print_every=1 + ): + self.max_tokens = max_tokens + self.start_tokens = start_tokens + self.index_to_word = index_to_word + self.print_every = print_every + self.k = top_k + + def sample_from(self, logits): + logits, indices = ops.top_k(logits, k=self.k, sorted=True) + indices = np.asarray(indices).astype("int32") + preds = keras.activations.softmax(ops.expand_dims(logits, 0))[0] + preds = np.asarray(preds).astype("float32") + return np.random.choice(indices, p=preds) + + def detokenize(self, number): + return self.index_to_word[number] + + def on_epoch_end(self, epoch, logs=None): + start_tokens = [_ for _ in self.start_tokens] + if (epoch + 1) % self.print_every != 0: + return + num_tokens_generated = 0 + tokens_generated = [] + while num_tokens_generated <= self.max_tokens: + pad_len = maxlen - len(start_tokens) + sample_index = len(start_tokens) - 1 + if pad_len < 0: + x = start_tokens[:maxlen] + sample_index = maxlen - 1 + elif pad_len > 0: + x = start_tokens + [0] * pad_len + else: + x = start_tokens + x = np.array([x]) + y, _ = self.model.predict(x, verbose=0) + sample_token = self.sample_from(y[0][sample_index]) + tokens_generated.append(sample_token) + start_tokens.append(sample_token) + num_tokens_generated = len(tokens_generated) + txt = " ".join( + [self.detokenize(_) for _ in self.start_tokens + tokens_generated] + ) + print(f"generated text:\n{txt}\n") + + +# Tokenize starting prompt +word_to_index = {} +for index, word in enumerate(vocab): + word_to_index[word] = index + +start_prompt = "this movie is" +start_tokens = [word_to_index.get(_, 1) for _ in start_prompt.split()] +num_tokens_generated = 40 +text_gen_callback = TextGenerator(num_tokens_generated, start_tokens, vocab) + + +""" +## Train the model + +Note: This code should preferably be run on GPU. 
+""" + +model = create_model() + +model.fit(text_ds, verbose=2, epochs=25, callbacks=[text_gen_callback]) diff --git a/knowledge_base/generative/vae.py b/knowledge_base/generative/vae.py new file mode 100644 index 0000000000000000000000000000000000000000..9068026f5568a712ffacc404003c8d3570262483 --- /dev/null +++ b/knowledge_base/generative/vae.py @@ -0,0 +1,197 @@ +""" +Title: Variational AutoEncoder +Author: [fchollet](https://twitter.com/fchollet) +Date created: 2020/05/03 +Last modified: 2024/04/24 +Description: Convolutional Variational AutoEncoder (VAE) trained on MNIST digits. +Accelerator: GPU +""" + +""" +## Setup +""" + +import os + +os.environ["KERAS_BACKEND"] = "tensorflow" + +import numpy as np +import tensorflow as tf +import keras +from keras import ops +from keras import layers + +""" +## Create a sampling layer +""" + + +class Sampling(layers.Layer): + """Uses (z_mean, z_log_var) to sample z, the vector encoding a digit.""" + + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.seed_generator = keras.random.SeedGenerator(1337) + + def call(self, inputs): + z_mean, z_log_var = inputs + batch = ops.shape(z_mean)[0] + dim = ops.shape(z_mean)[1] + epsilon = keras.random.normal(shape=(batch, dim), seed=self.seed_generator) + return z_mean + ops.exp(0.5 * z_log_var) * epsilon + + +""" +## Build the encoder +""" + +latent_dim = 2 + +encoder_inputs = keras.Input(shape=(28, 28, 1)) +x = layers.Conv2D(32, 3, activation="relu", strides=2, padding="same")(encoder_inputs) +x = layers.Conv2D(64, 3, activation="relu", strides=2, padding="same")(x) +x = layers.Flatten()(x) +x = layers.Dense(16, activation="relu")(x) +z_mean = layers.Dense(latent_dim, name="z_mean")(x) +z_log_var = layers.Dense(latent_dim, name="z_log_var")(x) +z = Sampling()([z_mean, z_log_var]) +encoder = keras.Model(encoder_inputs, [z_mean, z_log_var, z], name="encoder") +encoder.summary() + +""" +## Build the decoder +""" + +latent_inputs = keras.Input(shape=(latent_dim,)) +x = layers.Dense(7 * 7 * 64, activation="relu")(latent_inputs) +x = layers.Reshape((7, 7, 64))(x) +x = layers.Conv2DTranspose(64, 3, activation="relu", strides=2, padding="same")(x) +x = layers.Conv2DTranspose(32, 3, activation="relu", strides=2, padding="same")(x) +decoder_outputs = layers.Conv2DTranspose(1, 3, activation="sigmoid", padding="same")(x) +decoder = keras.Model(latent_inputs, decoder_outputs, name="decoder") +decoder.summary() + +""" +## Define the VAE as a `Model` with a custom `train_step` +""" + + +class VAE(keras.Model): + def __init__(self, encoder, decoder, **kwargs): + super().__init__(**kwargs) + self.encoder = encoder + self.decoder = decoder + self.total_loss_tracker = keras.metrics.Mean(name="total_loss") + self.reconstruction_loss_tracker = keras.metrics.Mean( + name="reconstruction_loss" + ) + self.kl_loss_tracker = keras.metrics.Mean(name="kl_loss") + + @property + def metrics(self): + return [ + self.total_loss_tracker, + self.reconstruction_loss_tracker, + self.kl_loss_tracker, + ] + + def train_step(self, data): + with tf.GradientTape() as tape: + z_mean, z_log_var, z = self.encoder(data) + reconstruction = self.decoder(z) + reconstruction_loss = ops.mean( + ops.sum( + keras.losses.binary_crossentropy(data, reconstruction), + axis=(1, 2), + ) + ) + kl_loss = -0.5 * (1 + z_log_var - ops.square(z_mean) - ops.exp(z_log_var)) + kl_loss = ops.mean(ops.sum(kl_loss, axis=1)) + total_loss = reconstruction_loss + kl_loss + grads = tape.gradient(total_loss, self.trainable_weights) + 
self.optimizer.apply_gradients(zip(grads, self.trainable_weights)) + self.total_loss_tracker.update_state(total_loss) + self.reconstruction_loss_tracker.update_state(reconstruction_loss) + self.kl_loss_tracker.update_state(kl_loss) + return { + "loss": self.total_loss_tracker.result(), + "reconstruction_loss": self.reconstruction_loss_tracker.result(), + "kl_loss": self.kl_loss_tracker.result(), + } + + +""" +## Train the VAE +""" + +(x_train, _), (x_test, _) = keras.datasets.mnist.load_data() +mnist_digits = np.concatenate([x_train, x_test], axis=0) +mnist_digits = np.expand_dims(mnist_digits, -1).astype("float32") / 255 + +vae = VAE(encoder, decoder) +vae.compile(optimizer=keras.optimizers.Adam()) +vae.fit(mnist_digits, epochs=30, batch_size=128) + +""" +## Display a grid of sampled digits +""" + +import matplotlib.pyplot as plt + + +def plot_latent_space(vae, n=30, figsize=15): + # display a n*n 2D manifold of digits + digit_size = 28 + scale = 1.0 + figure = np.zeros((digit_size * n, digit_size * n)) + # linearly spaced coordinates corresponding to the 2D plot + # of digit classes in the latent space + grid_x = np.linspace(-scale, scale, n) + grid_y = np.linspace(-scale, scale, n)[::-1] + + for i, yi in enumerate(grid_y): + for j, xi in enumerate(grid_x): + z_sample = np.array([[xi, yi]]) + x_decoded = vae.decoder.predict(z_sample, verbose=0) + digit = x_decoded[0].reshape(digit_size, digit_size) + figure[ + i * digit_size : (i + 1) * digit_size, + j * digit_size : (j + 1) * digit_size, + ] = digit + + plt.figure(figsize=(figsize, figsize)) + start_range = digit_size // 2 + end_range = n * digit_size + start_range + pixel_range = np.arange(start_range, end_range, digit_size) + sample_range_x = np.round(grid_x, 1) + sample_range_y = np.round(grid_y, 1) + plt.xticks(pixel_range, sample_range_x) + plt.yticks(pixel_range, sample_range_y) + plt.xlabel("z[0]") + plt.ylabel("z[1]") + plt.imshow(figure, cmap="Greys_r") + plt.show() + + +plot_latent_space(vae) + +""" +## Display how the latent space clusters different digit classes +""" + + +def plot_label_clusters(vae, data, labels): + # display a 2D plot of the digit classes in the latent space + z_mean, _, _ = vae.encoder.predict(data, verbose=0) + plt.figure(figsize=(12, 10)) + plt.scatter(z_mean[:, 0], z_mean[:, 1], c=labels) + plt.colorbar() + plt.xlabel("z[0]") + plt.ylabel("z[1]") + plt.show() + + +(x_train, y_train), _ = keras.datasets.mnist.load_data() +x_train = np.expand_dims(x_train, -1).astype("float32") / 255 + +plot_label_clusters(vae, x_train, y_train) diff --git a/knowledge_base/generative/vq_vae.py b/knowledge_base/generative/vq_vae.py new file mode 100644 index 0000000000000000000000000000000000000000..cccd4037c99466e2f80cb6f1809adf8f1e0249a5 --- /dev/null +++ b/knowledge_base/generative/vq_vae.py @@ -0,0 +1,596 @@ +""" +Title: Vector-Quantized Variational Autoencoders +Author: [Sayak Paul](https://twitter.com/RisingSayak) +Date created: 2021/07/21 +Last modified: 2022/06/27 +Description: Training a VQ-VAE for image reconstruction and codebook sampling for generation. +Accelerator: GPU +""" + +""" +In this example, we develop a Vector Quantized Variational Autoencoder (VQ-VAE). +VQ-VAE was proposed in +[Neural Discrete Representation Learning](https://arxiv.org/abs/1711.00937) +by van der Oord et al. In standard VAEs, the latent space is continuous and is sampled +from a Gaussian distribution. It is generally harder to learn such a continuous +distribution via gradient descent. 
VQ-VAEs, on the other hand, +operate on a discrete latent space, making the optimization problem simpler. It does so +by maintaining a discrete *codebook*. The codebook is developed by +discretizing the distance between continuous embeddings and the encoded +outputs. These discrete code words are then fed to the decoder, which is trained +to generate reconstructed samples. + +For an overview of VQ-VAEs, please refer to the original paper and +[this video explanation](https://www.youtube.com/watch?v=VZFVUrYcig0). +If you need a refresher on VAEs, you can refer to +[this book chapter](https://livebook.manning.com/book/deep-learning-with-python-second-edition/chapter-12/). +VQ-VAEs are one of the main recipes behind [DALL-E](https://openai.com/blog/dall-e/) +and the idea of a codebook is used in [VQ-GANs](https://arxiv.org/abs/2012.09841). +This example uses implementation details from the +[official VQ-VAE tutorial](https://github.com/deepmind/sonnet/blob/master/sonnet/examples/vqvae_example.ipynb) +from DeepMind. + +## Requirements + +To run this example, you will need TensorFlow 2.5 or higher, as well as +TensorFlow Probability, which can be installed using the command below. +""" + +"""shell +pip install -q tensorflow-probability +""" + +""" +## Imports +""" + +import numpy as np +import matplotlib.pyplot as plt + +from tensorflow import keras +from tensorflow.keras import layers +import tensorflow_probability as tfp +import tensorflow as tf + +""" +## `VectorQuantizer` layer + +First, we implement a custom layer for the vector quantizer, which is the layer in between +the encoder and decoder. Consider an output from the encoder, with shape `(batch_size, height, width, +num_filters)`. The vector quantizer will first flatten this output, only keeping the +`num_filters` dimension intact. So, the shape would become `(batch_size * height * width, +num_filters)`. The rationale behind this is to treat the total number of filters as the size for +the latent embeddings. + +An embedding table is then initialized to learn a codebook. We measure the L2-normalized +distance between the flattened encoder outputs and code words of this codebook. We take the +code that yields the minimum distance, and we apply one-hot encoding to achieve quantization. +This way, the code yielding the minimum distance to the corresponding encoder output is +mapped as one and the remaining codes are mapped as zeros. + +Since the quantization process is not differentiable, we apply a +[straight-through estimator](https://www.hassanaskary.com/python/pytorch/deep%20learning/2020/09/19/intuitive-explanation-of-straight-through-estimators.html) +in between the decoder and the encoder, so that the decoder gradients are directly propagated +to the encoder. As the encoder and decoder share the same channel space, the decoder gradients are +still meaningful to the encoder. +""" + + +class VectorQuantizer(layers.Layer): + def __init__(self, num_embeddings, embedding_dim, beta=0.25, **kwargs): + super().__init__(**kwargs) + self.embedding_dim = embedding_dim + self.num_embeddings = num_embeddings + + # The `beta` parameter is best kept between [0.25, 2] as per the paper. + self.beta = beta + + # Initialize the embeddings which we will quantize. 
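+        # The codebook is stored as a trainable matrix of shape
+        # (embedding_dim, num_embeddings); each of the num_embeddings columns
+        # is one code word that encoder outputs get snapped to.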
+ w_init = tf.random_uniform_initializer() + self.embeddings = tf.Variable( + initial_value=w_init( + shape=(self.embedding_dim, self.num_embeddings), dtype="float32" + ), + trainable=True, + name="embeddings_vqvae", + ) + + def call(self, x): + # Calculate the input shape of the inputs and + # then flatten the inputs keeping `embedding_dim` intact. + input_shape = tf.shape(x) + flattened = tf.reshape(x, [-1, self.embedding_dim]) + + # Quantization. + encoding_indices = self.get_code_indices(flattened) + encodings = tf.one_hot(encoding_indices, self.num_embeddings) + quantized = tf.matmul(encodings, self.embeddings, transpose_b=True) + + # Reshape the quantized values back to the original input shape + quantized = tf.reshape(quantized, input_shape) + + # Calculate vector quantization loss and add that to the layer. You can learn more + # about adding losses to different layers here: + # https://keras.io/guides/making_new_layers_and_models_via_subclassing/. Check + # the original paper to get a handle on the formulation of the loss function. + commitment_loss = tf.reduce_mean((tf.stop_gradient(quantized) - x) ** 2) + codebook_loss = tf.reduce_mean((quantized - tf.stop_gradient(x)) ** 2) + self.add_loss(self.beta * commitment_loss + codebook_loss) + + # Straight-through estimator. + quantized = x + tf.stop_gradient(quantized - x) + return quantized + + def get_code_indices(self, flattened_inputs): + # Calculate L2-normalized distance between the inputs and the codes. + similarity = tf.matmul(flattened_inputs, self.embeddings) + distances = ( + tf.reduce_sum(flattened_inputs**2, axis=1, keepdims=True) + + tf.reduce_sum(self.embeddings**2, axis=0) + - 2 * similarity + ) + + # Derive the indices for minimum distances. + encoding_indices = tf.argmin(distances, axis=1) + return encoding_indices + + +""" +**A note on straight-through estimation**: + +This line of code does the straight-through estimation part: `quantized = x + +tf.stop_gradient(quantized - x)`. During backpropagation, `(quantized - x)` won't be +included in the computation graph and the gradients obtained for `quantized` +will be copied for `inputs`. Thanks to [this video](https://youtu.be/VZFVUrYcig0?t=1393) +for helping me understand this technique. +""" + +""" +## Encoder and decoder + +Now for the encoder and the decoder for the VQ-VAE. We will keep them small so +that their capacity is a good fit for the MNIST dataset. The implementation of the encoder and +decoder come from +[this example](https://keras.io/examples/generative/vae). + +Note that activations _other than ReLU_ may not work for the encoder and decoder layers in the +quantization architecture: Leaky ReLU activated layers, for example, have proven difficult to +train, resulting in intermittent loss spikes that the model has trouble recovering from. 
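+Before building them, here is a tiny self-contained check (an addition to this example,
+not part of the original) of the straight-through trick described in the note above: the
+forward value equals the quantized tensor, while the gradient reaches the encoder output
+as if the quantization step were the identity.
+
+```
+import tensorflow as tf
+
+x = tf.Variable([1.0, 2.0])             # stands in for an encoder output
+q = tf.constant([1.5, 1.5])             # stands in for the nearest code words
+with tf.GradientTape() as tape:
+    straight_through = x + tf.stop_gradient(q - x)
+    loss = tf.reduce_sum(straight_through**2)
+print(straight_through.numpy())         # [1.5 1.5] -> forward pass uses q
+print(tape.gradient(loss, x).numpy())   # [3. 3.]   -> d(loss)/d(quantized) is passed to x
+```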
+""" + + +def get_encoder(latent_dim=16): + encoder_inputs = keras.Input(shape=(28, 28, 1)) + x = layers.Conv2D(32, 3, activation="relu", strides=2, padding="same")( + encoder_inputs + ) + x = layers.Conv2D(64, 3, activation="relu", strides=2, padding="same")(x) + encoder_outputs = layers.Conv2D(latent_dim, 1, padding="same")(x) + return keras.Model(encoder_inputs, encoder_outputs, name="encoder") + + +def get_decoder(latent_dim=16): + latent_inputs = keras.Input(shape=get_encoder(latent_dim).output.shape[1:]) + x = layers.Conv2DTranspose(64, 3, activation="relu", strides=2, padding="same")( + latent_inputs + ) + x = layers.Conv2DTranspose(32, 3, activation="relu", strides=2, padding="same")(x) + decoder_outputs = layers.Conv2DTranspose(1, 3, padding="same")(x) + return keras.Model(latent_inputs, decoder_outputs, name="decoder") + + +""" +## Standalone VQ-VAE model +""" + + +def get_vqvae(latent_dim=16, num_embeddings=64): + vq_layer = VectorQuantizer(num_embeddings, latent_dim, name="vector_quantizer") + encoder = get_encoder(latent_dim) + decoder = get_decoder(latent_dim) + inputs = keras.Input(shape=(28, 28, 1)) + encoder_outputs = encoder(inputs) + quantized_latents = vq_layer(encoder_outputs) + reconstructions = decoder(quantized_latents) + return keras.Model(inputs, reconstructions, name="vq_vae") + + +get_vqvae().summary() + +""" +Note that the output channels of the encoder should match the `latent_dim` for the vector +quantizer. +""" + +""" +## Wrapping up the training loop inside `VQVAETrainer` +""" + + +class VQVAETrainer(keras.models.Model): + def __init__(self, train_variance, latent_dim=32, num_embeddings=128, **kwargs): + super().__init__(**kwargs) + self.train_variance = train_variance + self.latent_dim = latent_dim + self.num_embeddings = num_embeddings + + self.vqvae = get_vqvae(self.latent_dim, self.num_embeddings) + + self.total_loss_tracker = keras.metrics.Mean(name="total_loss") + self.reconstruction_loss_tracker = keras.metrics.Mean( + name="reconstruction_loss" + ) + self.vq_loss_tracker = keras.metrics.Mean(name="vq_loss") + + @property + def metrics(self): + return [ + self.total_loss_tracker, + self.reconstruction_loss_tracker, + self.vq_loss_tracker, + ] + + def train_step(self, x): + with tf.GradientTape() as tape: + # Outputs from the VQ-VAE. + reconstructions = self.vqvae(x) + + # Calculate the losses. + reconstruction_loss = ( + tf.reduce_mean((x - reconstructions) ** 2) / self.train_variance + ) + total_loss = reconstruction_loss + sum(self.vqvae.losses) + + # Backpropagation. + grads = tape.gradient(total_loss, self.vqvae.trainable_variables) + self.optimizer.apply_gradients(zip(grads, self.vqvae.trainable_variables)) + + # Loss tracking. + self.total_loss_tracker.update_state(total_loss) + self.reconstruction_loss_tracker.update_state(reconstruction_loss) + self.vq_loss_tracker.update_state(sum(self.vqvae.losses)) + + # Log results. 
+ return { + "loss": self.total_loss_tracker.result(), + "reconstruction_loss": self.reconstruction_loss_tracker.result(), + "vqvae_loss": self.vq_loss_tracker.result(), + } + + +""" +## Load and preprocess the MNIST dataset +""" + +(x_train, _), (x_test, _) = keras.datasets.mnist.load_data() + +x_train = np.expand_dims(x_train, -1) +x_test = np.expand_dims(x_test, -1) +x_train_scaled = (x_train / 255.0) - 0.5 +x_test_scaled = (x_test / 255.0) - 0.5 + +data_variance = np.var(x_train / 255.0) + +""" +## Train the VQ-VAE model +""" + +vqvae_trainer = VQVAETrainer(data_variance, latent_dim=16, num_embeddings=128) +vqvae_trainer.compile(optimizer=keras.optimizers.Adam()) +vqvae_trainer.fit(x_train_scaled, epochs=30, batch_size=128) + +""" +## Reconstruction results on the test set +""" + + +def show_subplot(original, reconstructed): + plt.subplot(1, 2, 1) + plt.imshow(original.squeeze() + 0.5) + plt.title("Original") + plt.axis("off") + + plt.subplot(1, 2, 2) + plt.imshow(reconstructed.squeeze() + 0.5) + plt.title("Reconstructed") + plt.axis("off") + + plt.show() + + +trained_vqvae_model = vqvae_trainer.vqvae +idx = np.random.choice(len(x_test_scaled), 10) +test_images = x_test_scaled[idx] +reconstructions_test = trained_vqvae_model.predict(test_images) + +for test_image, reconstructed_image in zip(test_images, reconstructions_test): + show_subplot(test_image, reconstructed_image) + +""" +These results look decent. You are encouraged to play with different hyperparameters +(especially the number of embeddings and the dimensions of the embeddings) and observe how +they affect the results. +""" + +""" +## Visualizing the discrete codes +""" + +encoder = vqvae_trainer.vqvae.get_layer("encoder") +quantizer = vqvae_trainer.vqvae.get_layer("vector_quantizer") + +encoded_outputs = encoder.predict(test_images) +flat_enc_outputs = encoded_outputs.reshape(-1, encoded_outputs.shape[-1]) +codebook_indices = quantizer.get_code_indices(flat_enc_outputs) +codebook_indices = codebook_indices.numpy().reshape(encoded_outputs.shape[:-1]) + +for i in range(len(test_images)): + plt.subplot(1, 2, 1) + plt.imshow(test_images[i].squeeze() + 0.5) + plt.title("Original") + plt.axis("off") + + plt.subplot(1, 2, 2) + plt.imshow(codebook_indices[i]) + plt.title("Code") + plt.axis("off") + plt.show() + +""" +The figure above shows that the discrete codes have been able to capture some +regularities from the dataset. Now, how do we sample from this codebook to create +novel images? Since these codes are discrete and we imposed a categorical distribution +on them, we cannot use them yet to generate anything meaningful until we can generate likely +sequences of codes that we can give to the decoder. + +The authors use a PixelCNN to train these codes so that they can be used as powerful priors to +generate novel examples. PixelCNN was proposed in +[Conditional Image Generation with PixelCNN Decoders](https://arxiv.org/abs/1606.05328) +by van der Oord et al. We borrow the implementation from +[this PixelCNN example](https://keras.io/examples/generative/pixelcnn/). It's an autoregressive +generative model where the outputs are conditional on the prior ones. In other words, a PixelCNN +generates an image on a pixel-by-pixel basis. For the purpose in this example, however, its task +is to generate code book indices instead of pixels directly. The trained VQ-VAE decoder is used +to map the indices generated by the PixelCNN back into the pixel space. 
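+For reference, mapping a grid of code indices back to images boils down to an embedding
+lookup followed by the decoder. A minimal sketch (an addition to this example; it assumes
+the `quantizer` fetched above and a `decoder` fetched the same way, as done later in the
+sampling section):
+
+```
+def indices_to_images(index_grid, quantizer, decoder, num_embeddings=128):
+    # index_grid: integer array of shape (batch, 7, 7) with values in [0, num_embeddings)
+    onehot = tf.one_hot(tf.cast(index_grid, "int32"), num_embeddings)
+    # (batch, 7, 7, num_embeddings) @ (num_embeddings, latent_dim) -> quantized latents
+    quantized = tf.matmul(onehot, quantizer.embeddings, transpose_b=True)
+    return decoder.predict(quantized)
+```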
+""" + +""" +## PixelCNN hyperparameters +""" + +num_residual_blocks = 2 +num_pixelcnn_layers = 2 +pixelcnn_input_shape = encoded_outputs.shape[1:-1] +print(f"Input shape of the PixelCNN: {pixelcnn_input_shape}") + +""" +This input shape represents the reduction in the resolution performed by the encoder. With "same" padding, +this exactly halves the "resolution" of the output shape for each stride-2 convolution layer. So, with these +two layers, we end up with an encoder output tensor of 7x7 on axes 2 and 3, with the first axis as the batch +size and the last axis being the code book embedding size. Since the quantization layer in the autoencoder +maps these 7x7 tensors to indices of the code book, these output layer axis sizes must be matched by the +PixelCNN as the input shape. The task of the PixelCNN for this architecture is to generate _likely_ 7x7 +arrangements of codebook indices. + +Note that this shape is something to optimize for in larger-sized image domains, along with the code +book sizes. Since the PixelCNN is autoregressive, it needs to pass over each codebook index sequentially +in order to generate novel images from the codebook. Each stride-2 (or rather more correctly a +stride (2, 2)) convolution layer will divide the image generation time by four. Note, however, that there +is probably a lower bound on this part: when the number of codes for the image to reconstruct is too small, +it has insufficient information for the decoder to represent the level of detail in the image, so the +output quality will suffer. This can be amended at least to some extent by using a larger code book. +Since the autoregressive part of the image generation procedure uses codebook indices, there is far less of +a performance penalty on using a larger code book as the lookup time for a larger-sized code from a larger +code book is much smaller in comparison to iterating over a larger sequence of code book indices, although +the size of the code book does impact on the batch size that can pass through the image generation procedure. +Finding the sweet spot for this trade-off can require some architecture tweaking and could very well differ +per dataset. +""" + +""" +## PixelCNN model + +Majority of this comes from +[this example](https://keras.io/examples/generative/pixelcnn/). + +## Notes + +Thanks to [Rein van 't Veer](https://github.com/reinvantveer) for improving this example with +copy-edits and minor code clean-ups. +""" + + +# The first layer is the PixelCNN layer. This layer simply +# builds on the 2D convolutional layer, but includes masking. +class PixelConvLayer(layers.Layer): + def __init__(self, mask_type, **kwargs): + super().__init__() + self.mask_type = mask_type + self.conv = layers.Conv2D(**kwargs) + + def build(self, input_shape): + # Build the conv2d layer to initialize kernel variables + self.conv.build(input_shape) + # Use the initialized kernel to create the mask + kernel_shape = self.conv.kernel.get_shape() + self.mask = np.zeros(shape=kernel_shape) + self.mask[: kernel_shape[0] // 2, ...] = 1.0 + self.mask[kernel_shape[0] // 2, : kernel_shape[1] // 2, ...] = 1.0 + if self.mask_type == "B": + self.mask[kernel_shape[0] // 2, kernel_shape[1] // 2, ...] = 1.0 + + def call(self, inputs): + self.conv.kernel.assign(self.conv.kernel * self.mask) + return self.conv(inputs) + + +# Next, we build our residual block layer. +# This is just a normal residual block, but based on the PixelConvLayer. 
+class ResidualBlock(keras.layers.Layer): + def __init__(self, filters, **kwargs): + super().__init__(**kwargs) + self.conv1 = keras.layers.Conv2D( + filters=filters, kernel_size=1, activation="relu" + ) + self.pixel_conv = PixelConvLayer( + mask_type="B", + filters=filters // 2, + kernel_size=3, + activation="relu", + padding="same", + ) + self.conv2 = keras.layers.Conv2D( + filters=filters, kernel_size=1, activation="relu" + ) + + def call(self, inputs): + x = self.conv1(inputs) + x = self.pixel_conv(x) + x = self.conv2(x) + return keras.layers.add([inputs, x]) + + +pixelcnn_inputs = keras.Input(shape=pixelcnn_input_shape, dtype=tf.int32) +ohe = tf.one_hot(pixelcnn_inputs, vqvae_trainer.num_embeddings) +x = PixelConvLayer( + mask_type="A", filters=128, kernel_size=7, activation="relu", padding="same" +)(ohe) + +for _ in range(num_residual_blocks): + x = ResidualBlock(filters=128)(x) + +for _ in range(num_pixelcnn_layers): + x = PixelConvLayer( + mask_type="B", + filters=128, + kernel_size=1, + strides=1, + activation="relu", + padding="valid", + )(x) + +out = keras.layers.Conv2D( + filters=vqvae_trainer.num_embeddings, kernel_size=1, strides=1, padding="valid" +)(x) + +pixel_cnn = keras.Model(pixelcnn_inputs, out, name="pixel_cnn") +pixel_cnn.summary() + +""" +## Prepare data to train the PixelCNN + +We will train the PixelCNN to learn a categorical distribution of the discrete codes. +First, we will generate code indices using the encoder and vector quantizer we just +trained. Our training objective will be to minimize the crossentropy loss between these +indices and the PixelCNN outputs. Here, the number of categories is equal to the number +of embeddings present in our codebook (128 in our case). The PixelCNN model is +trained to learn a distribution (as opposed to minimizing the L1/L2 loss), which is where +it gets its generative capabilities from. +""" + +# Generate the codebook indices. +encoded_outputs = encoder.predict(x_train_scaled) +flat_enc_outputs = encoded_outputs.reshape(-1, encoded_outputs.shape[-1]) +codebook_indices = quantizer.get_code_indices(flat_enc_outputs) + +codebook_indices = codebook_indices.numpy().reshape(encoded_outputs.shape[:-1]) +print(f"Shape of the training data for PixelCNN: {codebook_indices.shape}") + +""" +## PixelCNN training +""" + +pixel_cnn.compile( + optimizer=keras.optimizers.Adam(3e-4), + loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), + metrics=["accuracy"], +) +pixel_cnn.fit( + x=codebook_indices, + y=codebook_indices, + batch_size=128, + epochs=30, + validation_split=0.1, +) + +""" +We can improve these scores with more training and hyperparameter tuning. +""" + +""" +## Codebook sampling + +Now that our PixelCNN is trained, we can sample distinct codes from its outputs and pass +them to our decoder to generate novel images. +""" + +# Create a mini sampler model. +inputs = layers.Input(shape=pixel_cnn.input_shape[1:]) +outputs = pixel_cnn(inputs, training=False) +categorical_layer = tfp.layers.DistributionLambda(tfp.distributions.Categorical) +outputs = categorical_layer(outputs) +sampler = keras.Model(inputs, outputs) + +""" +We now construct a prior to generate images. Here, we will generate 10 images. +""" + +# Create an empty array of priors. +batch = 10 +priors = np.zeros(shape=(batch,) + (pixel_cnn.input_shape)[1:]) +batch, rows, cols = priors.shape + +# Iterate over the priors because generation has to be done sequentially pixel by pixel. 
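+# (Added note) Despite its name, `probs` below holds *sampled* code indices: the
+# `DistributionLambda(Categorical)` head samples when its output is converted to a tensor
+# by `predict`. Only the entry at the current (row, col) is kept, so the grid is filled
+# in raster order, conditioning each draw on the indices sampled so far.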
+for row in range(rows): + for col in range(cols): + # Feed the whole array and retrieving the pixel value probabilities for the next + # pixel. + probs = sampler.predict(priors) + # Use the probabilities to pick pixel values and append the values to the priors. + priors[:, row, col] = probs[:, row, col] + +print(f"Prior shape: {priors.shape}") + +""" +We can now use our decoder to generate the images. +""" + +# Perform an embedding lookup. +pretrained_embeddings = quantizer.embeddings +priors_ohe = tf.one_hot(priors.astype("int32"), vqvae_trainer.num_embeddings).numpy() +quantized = tf.matmul( + priors_ohe.astype("float32"), pretrained_embeddings, transpose_b=True +) +quantized = tf.reshape(quantized, (-1, *(encoded_outputs.shape[1:]))) + +# Generate novel images. +decoder = vqvae_trainer.vqvae.get_layer("decoder") +generated_samples = decoder.predict(quantized) + +for i in range(batch): + plt.subplot(1, 2, 1) + plt.imshow(priors[i]) + plt.title("Code") + plt.axis("off") + + plt.subplot(1, 2, 2) + plt.imshow(generated_samples[i].squeeze() + 0.5) + plt.title("Generated Sample") + plt.axis("off") + plt.show() + +""" +We can enhance the quality of these generated samples by tweaking the PixelCNN. +""" + +""" +## Additional notes + +* After the VQ-VAE paper was initially released, the authors developed an exponential +moving averaging scheme to update the embeddings inside the quantizer. If you're +interested you can check out +[this snippet](https://github.com/deepmind/sonnet/blob/master/sonnet/python/modules/nets/vqvae.py#L124). +* To further enhance the quality of the generated samples, +[VQ-VAE-2](https://arxiv.org/abs/1906.00446) was proposed that follows a cascaded +approach to learn the codebook and to generate the images. +""" diff --git a/knowledge_base/generative/wgan-graphs.py b/knowledge_base/generative/wgan-graphs.py new file mode 100644 index 0000000000000000000000000000000000000000..6e470393af062b5b4946bb1fd72802fbda6d6692 --- /dev/null +++ b/knowledge_base/generative/wgan-graphs.py @@ -0,0 +1,611 @@ +""" +Title: WGAN-GP with R-GCN for the generation of small molecular graphs +Author: [akensert](https://github.com/akensert) +Date created: 2021/06/30 +Last modified: 2021/06/30 +Description: Complete implementation of WGAN-GP with R-GCN to generate novel molecules. +Accelerator: GPU +""" + +""" +## Introduction + +In this tutorial, we implement a generative model for graphs and use it to generate +novel molecules. + +Motivation: The [development of new drugs](https://en.wikipedia.org/wiki/Drug_development) +(molecules) can be extremely time-consuming and costly. The use of deep learning models +can alleviate the search for good candidate drugs, by predicting properties of known molecules +(e.g., solubility, toxicity, affinity to target protein, etc.). As the number of +possible molecules is astronomical, the space in which we search for/explore molecules is +just a fraction of the entire space. Therefore, it's arguably desirable to implement +generative models that can learn to generate novel molecules (which would otherwise have never been explored). + +### References (implementation) + +The implementation in this tutorial is based on/inspired by the +[MolGAN paper](https://arxiv.org/abs/1805.11973) and DeepChem's +[Basic MolGAN](https://deepchem.readthedocs.io/en/latest/api_reference/models.html#basicmolganmod +el). 
+ +### Further reading (generative models) +Recent implementations of generative models for molecular graphs also include +[Mol-CycleGAN](https://jcheminf.biomedcentral.com/articles/10.1186/s13321-019-0404-1), +[GraphVAE](https://arxiv.org/abs/1802.03480) and +[JT-VAE](https://arxiv.org/abs/1802.04364). For more information on generative +adverserial networks, see [GAN](https://arxiv.org/abs/1406.2661), +[WGAN](https://arxiv.org/abs/1701.07875) and [WGAN-GP](https://arxiv.org/abs/1704.00028). + +""" + +""" +## Setup + +### Install RDKit + +[RDKit](https://www.rdkit.org/) is a collection of cheminformatics and machine-learning +software written in C++ and Python. In this tutorial, RDKit is used to conveniently and +efficiently transform +[SMILES](https://en.wikipedia.org/wiki/Simplified_molecular-input_line-entry_system) to +molecule objects, and then from those obtain sets of atoms and bonds. + +SMILES expresses the structure of a given molecule in the form of an ASCII string. +The SMILES string is a compact encoding which, for smaller molecules, is relatively +human-readable. Encoding molecules as a string both alleviates and facilitates database +and/or web searching of a given molecule. RDKit uses algorithms to +accurately transform a given SMILES to a molecule object, which can then +be used to compute a great number of molecular properties/features. + +Notice, RDKit is commonly installed via [Conda](https://www.rdkit.org/docs/Install.html). +However, thanks to +[rdkit_platform_wheels](https://github.com/kuelumbus/rdkit_platform_wheels), rdkit +can now (for the sake of this tutorial) be installed easily via pip, as follows: +``` +pip -q install rdkit-pypi +``` +And to allow easy visualization of a molecule objects, Pillow needs to be installed: +``` +pip -q install Pillow +``` + +""" + +""" +### Import packages + +""" + +from rdkit import Chem, RDLogger +from rdkit.Chem.Draw import IPythonConsole, MolsToGridImage +import numpy as np +import tensorflow as tf +from tensorflow import keras + +RDLogger.DisableLog("rdApp.*") + +""" +## Dataset + +The dataset used in this tutorial is a +[quantum mechanics dataset](http://quantum-machine.org/datasets/) (QM9), obtained from +[MoleculeNet](http://moleculenet.ai/datasets-1). Although many feature and label columns +come with the dataset, we'll only focus on the +[SMILES](https://en.wikipedia.org/wiki/Simplified_molecular-input_line-entry_system) +column. The QM9 dataset is a good first dataset to work with for generating +graphs, as the maximum number of heavy (non-hydrogen) atoms found in a molecule is only nine. +""" + +csv_path = tf.keras.utils.get_file( + "qm9.csv", "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/qm9.csv" +) + +data = [] +with open(csv_path, "r") as f: + for line in f.readlines()[1:]: + data.append(line.split(",")[1]) + +# Let's look at a molecule of the dataset +smiles = data[1000] +print("SMILES:", smiles) +molecule = Chem.MolFromSmiles(smiles) +print("Num heavy atoms:", molecule.GetNumHeavyAtoms()) +molecule + +""" +### Define helper functions +These helper functions will help convert SMILES to graphs and graphs to molecule objects. + +**Representing a molecular graph**. Molecules can naturally be expressed as undirected +graphs `G = (V, E)`, where `V` is a set of vertices (atoms), and `E` a set of edges +(bonds). 
As for this implementation, each graph (molecule) will be represented as an +adjacency tensor `A`, which encodes existence/non-existence of atom-pairs with their +one-hot encoded bond types stretching an extra dimension, and a feature tensor `H`, which +for each atom, one-hot encodes its atom type. Notice, as hydrogen atoms can be inferred by +RDKit, hydrogen atoms are excluded from `A` and `H` for easier modeling. + +""" + +atom_mapping = { + "C": 0, + 0: "C", + "N": 1, + 1: "N", + "O": 2, + 2: "O", + "F": 3, + 3: "F", +} + +bond_mapping = { + "SINGLE": 0, + 0: Chem.BondType.SINGLE, + "DOUBLE": 1, + 1: Chem.BondType.DOUBLE, + "TRIPLE": 2, + 2: Chem.BondType.TRIPLE, + "AROMATIC": 3, + 3: Chem.BondType.AROMATIC, +} + +NUM_ATOMS = 9 # Maximum number of atoms +ATOM_DIM = 4 + 1 # Number of atom types +BOND_DIM = 4 + 1 # Number of bond types +LATENT_DIM = 64 # Size of the latent space + + +def smiles_to_graph(smiles): + # Converts SMILES to molecule object + molecule = Chem.MolFromSmiles(smiles) + + # Initialize adjacency and feature tensor + adjacency = np.zeros((BOND_DIM, NUM_ATOMS, NUM_ATOMS), "float32") + features = np.zeros((NUM_ATOMS, ATOM_DIM), "float32") + + # loop over each atom in molecule + for atom in molecule.GetAtoms(): + i = atom.GetIdx() + atom_type = atom_mapping[atom.GetSymbol()] + features[i] = np.eye(ATOM_DIM)[atom_type] + # loop over one-hop neighbors + for neighbor in atom.GetNeighbors(): + j = neighbor.GetIdx() + bond = molecule.GetBondBetweenAtoms(i, j) + bond_type_idx = bond_mapping[bond.GetBondType().name] + adjacency[bond_type_idx, [i, j], [j, i]] = 1 + + # Where no bond, add 1 to last channel (indicating "non-bond") + # Notice: channels-first + adjacency[-1, np.sum(adjacency, axis=0) == 0] = 1 + + # Where no atom, add 1 to last column (indicating "non-atom") + features[np.where(np.sum(features, axis=1) == 0)[0], -1] = 1 + + return adjacency, features + + +def graph_to_molecule(graph): + # Unpack graph + adjacency, features = graph + + # RWMol is a molecule object intended to be edited + molecule = Chem.RWMol() + + # Remove "no atoms" & atoms with no bonds + keep_idx = np.where( + (np.argmax(features, axis=1) != ATOM_DIM - 1) + & (np.sum(adjacency[:-1], axis=(0, 1)) != 0) + )[0] + features = features[keep_idx] + adjacency = adjacency[:, keep_idx, :][:, :, keep_idx] + + # Add atoms to molecule + for atom_type_idx in np.argmax(features, axis=1): + atom = Chem.Atom(atom_mapping[atom_type_idx]) + _ = molecule.AddAtom(atom) + + # Add bonds between atoms in molecule; based on the upper triangles + # of the [symmetric] adjacency tensor + (bonds_ij, atoms_i, atoms_j) = np.where(np.triu(adjacency) == 1) + for bond_ij, atom_i, atom_j in zip(bonds_ij, atoms_i, atoms_j): + if atom_i == atom_j or bond_ij == BOND_DIM - 1: + continue + bond_type = bond_mapping[bond_ij] + molecule.AddBond(int(atom_i), int(atom_j), bond_type) + + # Sanitize the molecule; for more information on sanitization, see + # https://www.rdkit.org/docs/RDKit_Book.html#molecular-sanitization + flag = Chem.SanitizeMol(molecule, catchErrors=True) + # Let's be strict. If sanitization fails, return None + if flag != Chem.SanitizeFlags.SANITIZE_NONE: + return None + + return molecule + + +# Test helper functions +graph_to_molecule(smiles_to_graph(smiles)) + +""" +### Generate training set + +To save training time, we'll only use a tenth of the QM9 dataset. 
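+Each graph produced by `smiles_to_graph` is a pair of dense arrays. As a quick shape
+check (an added sketch, reusing the `smiles` string inspected earlier):
+
+```
+adjacency, features = smiles_to_graph(smiles)
+print(adjacency.shape)  # (5, 9, 9) -> (BOND_DIM, NUM_ATOMS, NUM_ATOMS); last channel = "no bond"
+print(features.shape)   # (9, 5)    -> (NUM_ATOMS, ATOM_DIM); last column = "no atom"
+```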
+""" + +adjacency_tensor, feature_tensor = [], [] +for smiles in data[::10]: + adjacency, features = smiles_to_graph(smiles) + adjacency_tensor.append(adjacency) + feature_tensor.append(features) + +adjacency_tensor = np.array(adjacency_tensor) +feature_tensor = np.array(feature_tensor) + +print("adjacency_tensor.shape =", adjacency_tensor.shape) +print("feature_tensor.shape =", feature_tensor.shape) + +""" +## Model + +The idea is to implement a generator network and a discriminator network via WGAN-GP, +that will result in a generator network that can generate small novel molecules +(small graphs). + +The generator network needs to be able to map (for each example in the batch) a vector `z` +to a 3-D adjacency tensor (`A`) and 2-D feature tensor (`H`). For this, `z` will first be +passed through a fully-connected network, for which the output will be further passed +through two separate fully-connected networks. Each of these two fully-connected +networks will then output (for each example in the batch) a tanh-activated vector +followed by a reshape and softmax to match that of a multi-dimensional adjacency/feature +tensor. + +As the discriminator network will receives as input a graph (`A`, `H`) from either the +generator or from the training set, we'll need to implement graph convolutional layers, +which allows us to operate on graphs. This means that input to the discriminator network +will first pass through graph convolutional layers, then an average-pooling layer, +and finally a few fully-connected layers. The final output should be a scalar (for each +example in the batch) which indicates the "realness" of the associated input +(in this case a "fake" or "real" molecule). + + +### Graph generator +""" + + +def GraphGenerator( + dense_units, + dropout_rate, + latent_dim, + adjacency_shape, + feature_shape, +): + z = keras.layers.Input(shape=(LATENT_DIM,)) + # Propagate through one or more densely connected layers + x = z + for units in dense_units: + x = keras.layers.Dense(units, activation="tanh")(x) + x = keras.layers.Dropout(dropout_rate)(x) + + # Map outputs of previous layer (x) to [continuous] adjacency tensors (x_adjacency) + x_adjacency = keras.layers.Dense(tf.math.reduce_prod(adjacency_shape))(x) + x_adjacency = keras.layers.Reshape(adjacency_shape)(x_adjacency) + # Symmetrify tensors in the last two dimensions + x_adjacency = (x_adjacency + tf.transpose(x_adjacency, (0, 1, 3, 2))) / 2 + x_adjacency = keras.layers.Softmax(axis=1)(x_adjacency) + + # Map outputs of previous layer (x) to [continuous] feature tensors (x_features) + x_features = keras.layers.Dense(tf.math.reduce_prod(feature_shape))(x) + x_features = keras.layers.Reshape(feature_shape)(x_features) + x_features = keras.layers.Softmax(axis=2)(x_features) + + return keras.Model(inputs=z, outputs=[x_adjacency, x_features], name="Generator") + + +generator = GraphGenerator( + dense_units=[128, 256, 512], + dropout_rate=0.2, + latent_dim=LATENT_DIM, + adjacency_shape=(BOND_DIM, NUM_ATOMS, NUM_ATOMS), + feature_shape=(NUM_ATOMS, ATOM_DIM), +) +generator.summary() + +""" +### Graph discriminator + + +**Graph convolutional layer**. The +[relational graph convolutional layers](https://arxiv.org/abs/1703.06103) implements non-linearly transformed +neighborhood aggregations. 
We can define these layers as follows:
+
+`H^{l+1} = σ(D^{-1} @ A @ H^{l} @ W^{l})`
+
+
+Where `σ` denotes the non-linear transformation (commonly a ReLU activation), `A` the
+adjacency tensor, `H^{l}` the feature tensor at the `l:th` layer, `D^{-1}` the inverse
+diagonal degree tensor of `A`, and `W^{l}` the trainable weight tensor at the `l:th`
+layer. Specifically, for each bond type (relation), the degree tensor expresses, in the
+diagonal, the number of bonds attached to each atom. Notice, in this tutorial `D^{-1}` is
+omitted, for two reasons: (1) it's not obvious how to apply this normalization on the
+continuous adjacency tensors (generated by the generator), and (2) the WGAN seems to
+work just fine without normalization. Furthermore, in contrast to the
+[original paper](https://arxiv.org/abs/1703.06103), no self-loop is defined, as we don't
+want to train the generator to predict "self-bonding".
+
+
+
+"""
+
+
+class RelationalGraphConvLayer(keras.layers.Layer):
+    def __init__(
+        self,
+        units=128,
+        activation="relu",
+        use_bias=False,
+        kernel_initializer="glorot_uniform",
+        bias_initializer="zeros",
+        kernel_regularizer=None,
+        bias_regularizer=None,
+        **kwargs
+    ):
+        super().__init__(**kwargs)
+
+        self.units = units
+        self.activation = keras.activations.get(activation)
+        self.use_bias = use_bias
+        self.kernel_initializer = keras.initializers.get(kernel_initializer)
+        self.bias_initializer = keras.initializers.get(bias_initializer)
+        self.kernel_regularizer = keras.regularizers.get(kernel_regularizer)
+        self.bias_regularizer = keras.regularizers.get(bias_regularizer)
+
+    def build(self, input_shape):
+        bond_dim = input_shape[0][1]
+        atom_dim = input_shape[1][2]
+
+        self.kernel = self.add_weight(
+            shape=(bond_dim, atom_dim, self.units),
+            initializer=self.kernel_initializer,
+            regularizer=self.kernel_regularizer,
+            trainable=True,
+            name="W",
+            dtype=tf.float32,
+        )
+
+        if self.use_bias:
+            self.bias = self.add_weight(
+                shape=(bond_dim, 1, self.units),
+                initializer=self.bias_initializer,
+                regularizer=self.bias_regularizer,
+                trainable=True,
+                name="b",
+                dtype=tf.float32,
+            )
+
+        self.built = True
+
+    def call(self, inputs, training=False):
+        adjacency, features = inputs
+        # Aggregate information from neighbors
+        x = tf.matmul(adjacency, features[:, None, :, :])
+        # Apply linear transformation
+        x = tf.matmul(x, self.kernel)
+        if self.use_bias:
+            x += self.bias
+        # Reduce bond types dim
+        x_reduced = tf.reduce_sum(x, axis=1)
+        # Apply non-linear transformation
+        return self.activation(x_reduced)
+
+
+def GraphDiscriminator(
+    gconv_units, dense_units, dropout_rate, adjacency_shape, feature_shape
+):
+    adjacency = keras.layers.Input(shape=adjacency_shape)
+    features = keras.layers.Input(shape=feature_shape)
+
+    # Propagate through one or more graph convolutional layers
+    features_transformed = features
+    for units in gconv_units:
+        features_transformed = RelationalGraphConvLayer(units)(
+            [adjacency, features_transformed]
+        )
+
+    # Reduce 2-D representation of molecule to 1-D
+    x = keras.layers.GlobalAveragePooling1D()(features_transformed)
+
+    # Propagate through one or more densely connected layers
+    for units in dense_units:
+        x = keras.layers.Dense(units, activation="relu")(x)
+        x = keras.layers.Dropout(dropout_rate)(x)
+
+    # For each molecule, output a single scalar value expressing the
+    # "realness" of the inputted molecule
+    x_out = keras.layers.Dense(1, dtype="float32")(x)
+
+    return keras.Model(inputs=[adjacency, features], outputs=x_out)
+
+
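+# (Added note) Shape walk-through for one RelationalGraphConvLayer call above, with batch
+# size B and N = NUM_ATOMS:
+#   adjacency (B, BOND_DIM, N, N) @ features (B, 1, N, atom_dim) -> (B, BOND_DIM, N, atom_dim)
+#   ... @ kernel (BOND_DIM, atom_dim, units)                     -> (B, BOND_DIM, N, units)
+#   sum over the bond-type axis, then ReLU                       -> (B, N, units)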
+discriminator = GraphDiscriminator( + gconv_units=[128, 128, 128, 128], + dense_units=[512, 512], + dropout_rate=0.2, + adjacency_shape=(BOND_DIM, NUM_ATOMS, NUM_ATOMS), + feature_shape=(NUM_ATOMS, ATOM_DIM), +) +discriminator.summary() + +""" +### WGAN-GP +""" + + +class GraphWGAN(keras.Model): + def __init__( + self, + generator, + discriminator, + discriminator_steps=1, + generator_steps=1, + gp_weight=10, + **kwargs + ): + super().__init__(**kwargs) + self.generator = generator + self.discriminator = discriminator + self.discriminator_steps = discriminator_steps + self.generator_steps = generator_steps + self.gp_weight = gp_weight + self.latent_dim = self.generator.input_shape[-1] + + def compile(self, optimizer_generator, optimizer_discriminator, **kwargs): + super().compile(**kwargs) + self.optimizer_generator = optimizer_generator + self.optimizer_discriminator = optimizer_discriminator + self.metric_generator = keras.metrics.Mean(name="loss_gen") + self.metric_discriminator = keras.metrics.Mean(name="loss_dis") + + def train_step(self, inputs): + if isinstance(inputs[0], tuple): + inputs = inputs[0] + + graph_real = inputs + + self.batch_size = tf.shape(inputs[0])[0] + + # Train the discriminator for one or more steps + for _ in range(self.discriminator_steps): + z = tf.random.normal((self.batch_size, self.latent_dim)) + + with tf.GradientTape() as tape: + graph_generated = self.generator(z, training=True) + loss = self._loss_discriminator(graph_real, graph_generated) + + grads = tape.gradient(loss, self.discriminator.trainable_weights) + self.optimizer_discriminator.apply_gradients( + zip(grads, self.discriminator.trainable_weights) + ) + self.metric_discriminator.update_state(loss) + + # Train the generator for one or more steps + for _ in range(self.generator_steps): + z = tf.random.normal((self.batch_size, self.latent_dim)) + + with tf.GradientTape() as tape: + graph_generated = self.generator(z, training=True) + loss = self._loss_generator(graph_generated) + + grads = tape.gradient(loss, self.generator.trainable_weights) + self.optimizer_generator.apply_gradients( + zip(grads, self.generator.trainable_weights) + ) + self.metric_generator.update_state(loss) + + return {m.name: m.result() for m in self.metrics} + + def _loss_discriminator(self, graph_real, graph_generated): + logits_real = self.discriminator(graph_real, training=True) + logits_generated = self.discriminator(graph_generated, training=True) + loss = tf.reduce_mean(logits_generated) - tf.reduce_mean(logits_real) + loss_gp = self._gradient_penalty(graph_real, graph_generated) + return loss + loss_gp * self.gp_weight + + def _loss_generator(self, graph_generated): + logits_generated = self.discriminator(graph_generated, training=True) + return -tf.reduce_mean(logits_generated) + + def _gradient_penalty(self, graph_real, graph_generated): + # Unpack graphs + adjacency_real, features_real = graph_real + adjacency_generated, features_generated = graph_generated + + # Generate interpolated graphs (adjacency_interp and features_interp) + alpha = tf.random.uniform([self.batch_size]) + alpha = tf.reshape(alpha, (self.batch_size, 1, 1, 1)) + adjacency_interp = (adjacency_real * alpha) + (1 - alpha) * adjacency_generated + alpha = tf.reshape(alpha, (self.batch_size, 1, 1)) + features_interp = (features_real * alpha) + (1 - alpha) * features_generated + + # Compute the logits of interpolated graphs + with tf.GradientTape() as tape: + tape.watch(adjacency_interp) + tape.watch(features_interp) + logits = self.discriminator( + 
[adjacency_interp, features_interp], training=True + ) + + # Compute the gradients with respect to the interpolated graphs + grads = tape.gradient(logits, [adjacency_interp, features_interp]) + # Compute the gradient penalty + grads_adjacency_penalty = (1 - tf.norm(grads[0], axis=1)) ** 2 + grads_features_penalty = (1 - tf.norm(grads[1], axis=2)) ** 2 + return tf.reduce_mean( + tf.reduce_mean(grads_adjacency_penalty, axis=(-2, -1)) + + tf.reduce_mean(grads_features_penalty, axis=(-1)) + ) + + +""" +## Train the model + +To save time (if run on a CPU), we'll only train the model for 10 epochs. +""" + +wgan = GraphWGAN(generator, discriminator, discriminator_steps=1) + +wgan.compile( + optimizer_generator=keras.optimizers.Adam(5e-4), + optimizer_discriminator=keras.optimizers.Adam(5e-4), +) + +wgan.fit([adjacency_tensor, feature_tensor], epochs=10, batch_size=16) + +""" +## Sample novel molecules with the generator +""" + + +def sample(generator, batch_size): + z = tf.random.normal((batch_size, LATENT_DIM)) + graph = generator.predict(z) + # obtain one-hot encoded adjacency tensor + adjacency = tf.argmax(graph[0], axis=1) + adjacency = tf.one_hot(adjacency, depth=BOND_DIM, axis=1) + # Remove potential self-loops from adjacency + adjacency = tf.linalg.set_diag(adjacency, tf.zeros(tf.shape(adjacency)[:-1])) + # obtain one-hot encoded feature tensor + features = tf.argmax(graph[1], axis=2) + features = tf.one_hot(features, depth=ATOM_DIM, axis=2) + return [ + graph_to_molecule([adjacency[i].numpy(), features[i].numpy()]) + for i in range(batch_size) + ] + + +molecules = sample(wgan.generator, batch_size=48) + +MolsToGridImage( + [m for m in molecules if m is not None][:25], molsPerRow=5, subImgSize=(150, 150) +) + +""" +## Concluding thoughts + +**Inspecting the results**. Ten epochs of training seemed enough to generate some decent +looking molecules! Notice, in contrast to the +[MolGAN paper](https://arxiv.org/abs/1805.11973), the uniqueness of the generated +molecules in this tutorial seems really high, which is great! + +**What we've learned, and prospects**. In this tutorial, a generative model for molecular +graphs was successfully implemented, which allowed us to generate novel molecules. In the +future, it would be interesting to implement generative models that can modify existing +molecules (for instance, to optimize solubility or protein-binding of an existing +molecule). For that however, a reconstruction loss would likely be needed, which is +tricky to implement as there's no easy and obvious way to compute similarity between two +molecular graphs. + +Example available on HuggingFace + +| Trained Model | Demo | +| :--: | :--: | +| [![Generic badge](https://img.shields.io/badge/%F0%9F%A4%97%20Model-wgan%20graphs-black.svg)](https://huggingface.co/keras-io/wgan-molecular-graphs) | [![Generic badge](https://img.shields.io/badge/%F0%9F%A4%97%20Spaces-wgan%20graphs-black.svg)](https://huggingface.co/spaces/keras-io/Generating-molecular-graphs-by-WGAN-GP) | +""" diff --git a/knowledge_base/generative/wgan_gp.py b/knowledge_base/generative/wgan_gp.py new file mode 100644 index 0000000000000000000000000000000000000000..d81974b5f51f2c5ce5ce480b740885fc480d84a3 --- /dev/null +++ b/knowledge_base/generative/wgan_gp.py @@ -0,0 +1,441 @@ +""" +Title: WGAN-GP overriding `Model.train_step` +Author: [A_K_Nain](https://twitter.com/A_K_Nain) +Date created: 2020/05/9 +Last modified: 2023/08/3 +Description: Implementation of Wasserstein GAN with Gradient Penalty. 
+Accelerator: GPU +""" + +""" +## Wasserstein GAN (WGAN) with Gradient Penalty (GP) + +The original [Wasserstein GAN](https://arxiv.org/abs/1701.07875) leverages the +Wasserstein distance to produce a value function that has better theoretical +properties than the value function used in the original GAN paper. WGAN requires +that the discriminator (aka the critic) lie within the space of 1-Lipschitz +functions. The authors proposed the idea of weight clipping to achieve this +constraint. Though weight clipping works, it can be a problematic way to enforce +1-Lipschitz constraint and can cause undesirable behavior, e.g. a very deep WGAN +discriminator (critic) often fails to converge. + +The [WGAN-GP](https://arxiv.org/abs/1704.00028) method proposes an +alternative to weight clipping to ensure smooth training. Instead of clipping +the weights, the authors proposed a "gradient penalty" by adding a loss term +that keeps the L2 norm of the discriminator gradients close to 1. +""" + +""" +## Setup +""" +import os + +os.environ["KERAS_BACKEND"] = "tensorflow" + +import keras +import tensorflow as tf +from keras import layers + + +""" +## Prepare the Fashion-MNIST data + +To demonstrate how to train WGAN-GP, we will be using the +[Fashion-MNIST](https://github.com/zalandoresearch/fashion-mnist) dataset. Each +sample in this dataset is a 28x28 grayscale image associated with a label from +10 classes (e.g. trouser, pullover, sneaker, etc.) +""" + +IMG_SHAPE = (28, 28, 1) +BATCH_SIZE = 512 + +# Size of the noise vector +noise_dim = 128 + +fashion_mnist = keras.datasets.fashion_mnist +(train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data() +print(f"Number of examples: {len(train_images)}") +print(f"Shape of the images in the dataset: {train_images.shape[1:]}") + +# Reshape each sample to (28, 28, 1) and normalize the pixel values in the [-1, 1] range +train_images = train_images.reshape(train_images.shape[0], *IMG_SHAPE).astype("float32") +train_images = (train_images - 127.5) / 127.5 + +""" +## Create the discriminator (the critic in the original WGAN) + +The samples in the dataset have a (28, 28, 1) shape. Because we will be +using strided convolutions, this can result in a shape with odd dimensions. +For example, +`(28, 28) -> Conv_s2 -> (14, 14) -> Conv_s2 -> (7, 7) -> Conv_s2 ->(3, 3)`. + +While performing upsampling in the generator part of the network, we won't get +the same input shape as the original images if we aren't careful. To avoid this, +we will do something much simpler: +- In the discriminator: "zero pad" the input to change the shape to `(32, 32, 1)` +for each sample; and +- Ihe generator: crop the final output to match the shape with input shape. +""" + + +def conv_block( + x, + filters, + activation, + kernel_size=(3, 3), + strides=(1, 1), + padding="same", + use_bias=True, + use_bn=False, + use_dropout=False, + drop_value=0.5, +): + x = layers.Conv2D( + filters, kernel_size, strides=strides, padding=padding, use_bias=use_bias + )(x) + if use_bn: + x = layers.BatchNormalization()(x) + x = activation(x) + if use_dropout: + x = layers.Dropout(drop_value)(x) + return x + + +def get_discriminator_model(): + img_input = layers.Input(shape=IMG_SHAPE) + # Zero pad the input to make the input images size to (32, 32, 1). 
+ x = layers.ZeroPadding2D((2, 2))(img_input) + x = conv_block( + x, + 64, + kernel_size=(5, 5), + strides=(2, 2), + use_bn=False, + use_bias=True, + activation=layers.LeakyReLU(0.2), + use_dropout=False, + drop_value=0.3, + ) + x = conv_block( + x, + 128, + kernel_size=(5, 5), + strides=(2, 2), + use_bn=False, + activation=layers.LeakyReLU(0.2), + use_bias=True, + use_dropout=True, + drop_value=0.3, + ) + x = conv_block( + x, + 256, + kernel_size=(5, 5), + strides=(2, 2), + use_bn=False, + activation=layers.LeakyReLU(0.2), + use_bias=True, + use_dropout=True, + drop_value=0.3, + ) + x = conv_block( + x, + 512, + kernel_size=(5, 5), + strides=(2, 2), + use_bn=False, + activation=layers.LeakyReLU(0.2), + use_bias=True, + use_dropout=False, + drop_value=0.3, + ) + + x = layers.Flatten()(x) + x = layers.Dropout(0.2)(x) + x = layers.Dense(1)(x) + + d_model = keras.models.Model(img_input, x, name="discriminator") + return d_model + + +d_model = get_discriminator_model() +d_model.summary() + +""" +## Create the generator +""" + + +def upsample_block( + x, + filters, + activation, + kernel_size=(3, 3), + strides=(1, 1), + up_size=(2, 2), + padding="same", + use_bn=False, + use_bias=True, + use_dropout=False, + drop_value=0.3, +): + x = layers.UpSampling2D(up_size)(x) + x = layers.Conv2D( + filters, kernel_size, strides=strides, padding=padding, use_bias=use_bias + )(x) + + if use_bn: + x = layers.BatchNormalization()(x) + + if activation: + x = activation(x) + if use_dropout: + x = layers.Dropout(drop_value)(x) + return x + + +def get_generator_model(): + noise = layers.Input(shape=(noise_dim,)) + x = layers.Dense(4 * 4 * 256, use_bias=False)(noise) + x = layers.BatchNormalization()(x) + x = layers.LeakyReLU(0.2)(x) + + x = layers.Reshape((4, 4, 256))(x) + x = upsample_block( + x, + 128, + layers.LeakyReLU(0.2), + strides=(1, 1), + use_bias=False, + use_bn=True, + padding="same", + use_dropout=False, + ) + x = upsample_block( + x, + 64, + layers.LeakyReLU(0.2), + strides=(1, 1), + use_bias=False, + use_bn=True, + padding="same", + use_dropout=False, + ) + x = upsample_block( + x, 1, layers.Activation("tanh"), strides=(1, 1), use_bias=False, use_bn=True + ) + # At this point, we have an output which has the same shape as the input, (32, 32, 1). + # We will use a Cropping2D layer to make it (28, 28, 1). + x = layers.Cropping2D((2, 2))(x) + + g_model = keras.models.Model(noise, x, name="generator") + return g_model + + +g_model = get_generator_model() +g_model.summary() + +""" +## Create the WGAN-GP model + +Now that we have defined our generator and discriminator, it's time to implement +the WGAN-GP model. We will also override the `train_step` for training. +""" + + +class WGAN(keras.Model): + def __init__( + self, + discriminator, + generator, + latent_dim, + discriminator_extra_steps=3, + gp_weight=10.0, + ): + super().__init__() + self.discriminator = discriminator + self.generator = generator + self.latent_dim = latent_dim + self.d_steps = discriminator_extra_steps + self.gp_weight = gp_weight + + def compile(self, d_optimizer, g_optimizer, d_loss_fn, g_loss_fn): + super().compile() + self.d_optimizer = d_optimizer + self.g_optimizer = g_optimizer + self.d_loss_fn = d_loss_fn + self.g_loss_fn = g_loss_fn + + def gradient_penalty(self, batch_size, real_images, fake_images): + """Calculates the gradient penalty. + + This loss is calculated on an interpolated image + and added to the discriminator loss. 
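+        (Added note) Concretely, with `interpolated = real + alpha * (fake - real)` and
+        `alpha ~ U(0, 1)`, the penalty is `mean((||grad_interpolated D(interpolated)||_2 - 1) ** 2)`;
+        `train_step` scales it by `gp_weight` before adding it to the discriminator loss.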
+ """ + # Get the interpolated image + alpha = tf.random.uniform([batch_size, 1, 1, 1], 0.0, 1.0) + diff = fake_images - real_images + interpolated = real_images + alpha * diff + + with tf.GradientTape() as gp_tape: + gp_tape.watch(interpolated) + # 1. Get the discriminator output for this interpolated image. + pred = self.discriminator(interpolated, training=True) + + # 2. Calculate the gradients w.r.t to this interpolated image. + grads = gp_tape.gradient(pred, [interpolated])[0] + # 3. Calculate the norm of the gradients. + norm = tf.sqrt(tf.reduce_sum(tf.square(grads), axis=[1, 2, 3])) + gp = tf.reduce_mean((norm - 1.0) ** 2) + return gp + + def train_step(self, real_images): + if isinstance(real_images, tuple): + real_images = real_images[0] + + # Get the batch size + batch_size = tf.shape(real_images)[0] + + # For each batch, we are going to perform the + # following steps as laid out in the original paper: + # 1. Train the generator and get the generator loss + # 2. Train the discriminator and get the discriminator loss + # 3. Calculate the gradient penalty + # 4. Multiply this gradient penalty with a constant weight factor + # 5. Add the gradient penalty to the discriminator loss + # 6. Return the generator and discriminator losses as a loss dictionary + + # Train the discriminator first. The original paper recommends training + # the discriminator for `x` more steps (typically 5) as compared to + # one step of the generator. Here we will train it for 3 extra steps + # as compared to 5 to reduce the training time. + for i in range(self.d_steps): + # Get the latent vector + random_latent_vectors = tf.random.normal( + shape=(batch_size, self.latent_dim) + ) + with tf.GradientTape() as tape: + # Generate fake images from the latent vector + fake_images = self.generator(random_latent_vectors, training=True) + # Get the logits for the fake images + fake_logits = self.discriminator(fake_images, training=True) + # Get the logits for the real images + real_logits = self.discriminator(real_images, training=True) + + # Calculate the discriminator loss using the fake and real image logits + d_cost = self.d_loss_fn(real_img=real_logits, fake_img=fake_logits) + # Calculate the gradient penalty + gp = self.gradient_penalty(batch_size, real_images, fake_images) + # Add the gradient penalty to the original discriminator loss + d_loss = d_cost + gp * self.gp_weight + + # Get the gradients w.r.t the discriminator loss + d_gradient = tape.gradient(d_loss, self.discriminator.trainable_variables) + # Update the weights of the discriminator using the discriminator optimizer + self.d_optimizer.apply_gradients( + zip(d_gradient, self.discriminator.trainable_variables) + ) + + # Train the generator + # Get the latent vector + random_latent_vectors = tf.random.normal(shape=(batch_size, self.latent_dim)) + with tf.GradientTape() as tape: + # Generate fake images using the generator + generated_images = self.generator(random_latent_vectors, training=True) + # Get the discriminator logits for fake images + gen_img_logits = self.discriminator(generated_images, training=True) + # Calculate the generator loss + g_loss = self.g_loss_fn(gen_img_logits) + + # Get the gradients w.r.t the generator loss + gen_gradient = tape.gradient(g_loss, self.generator.trainable_variables) + # Update the weights of the generator using the generator optimizer + self.g_optimizer.apply_gradients( + zip(gen_gradient, self.generator.trainable_variables) + ) + return {"d_loss": d_loss, "g_loss": g_loss} + + +""" +## Create a Keras 
callback that periodically saves generated images +""" + + +class GANMonitor(keras.callbacks.Callback): + def __init__(self, num_img=6, latent_dim=128): + self.num_img = num_img + self.latent_dim = latent_dim + + def on_epoch_end(self, epoch, logs=None): + random_latent_vectors = tf.random.normal(shape=(self.num_img, self.latent_dim)) + generated_images = self.model.generator(random_latent_vectors) + generated_images = (generated_images * 127.5) + 127.5 + + for i in range(self.num_img): + img = generated_images[i].numpy() + img = keras.utils.array_to_img(img) + img.save("generated_img_{i}_{epoch}.png".format(i=i, epoch=epoch)) + + +""" +## Train the end-to-end model +""" + +# Instantiate the optimizer for both networks +# (learning_rate=0.0002, beta_1=0.5 are recommended) +generator_optimizer = keras.optimizers.Adam( + learning_rate=0.0002, beta_1=0.5, beta_2=0.9 +) +discriminator_optimizer = keras.optimizers.Adam( + learning_rate=0.0002, beta_1=0.5, beta_2=0.9 +) + + +# Define the loss functions for the discriminator, +# which should be (fake_loss - real_loss). +# We will add the gradient penalty later to this loss function. +def discriminator_loss(real_img, fake_img): + real_loss = tf.reduce_mean(real_img) + fake_loss = tf.reduce_mean(fake_img) + return fake_loss - real_loss + + +# Define the loss functions for the generator. +def generator_loss(fake_img): + return -tf.reduce_mean(fake_img) + + +# Set the number of epochs for training. +epochs = 20 + +# Instantiate the customer `GANMonitor` Keras callback. +cbk = GANMonitor(num_img=3, latent_dim=noise_dim) + +# Get the wgan model +wgan = WGAN( + discriminator=d_model, + generator=g_model, + latent_dim=noise_dim, + discriminator_extra_steps=3, +) + +# Compile the wgan model +wgan.compile( + d_optimizer=discriminator_optimizer, + g_optimizer=generator_optimizer, + g_loss_fn=generator_loss, + d_loss_fn=discriminator_loss, +) + +# Start training +wgan.fit(train_images, batch_size=BATCH_SIZE, epochs=epochs, callbacks=[cbk]) + +""" +Display the last generated images: +""" + +from IPython.display import Image, display + +display(Image("generated_img_0_19.png")) +display(Image("generated_img_1_19.png")) +display(Image("generated_img_2_19.png")) diff --git a/knowledge_base/graph/gat_node_classification.py b/knowledge_base/graph/gat_node_classification.py new file mode 100644 index 0000000000000000000000000000000000000000..701e52718a5e926683e5e6d4dc7ff4dbc4eb1172 --- /dev/null +++ b/knowledge_base/graph/gat_node_classification.py @@ -0,0 +1,398 @@ +""" +Title: Graph attention network (GAT) for node classification +Author: [akensert](https://github.com/akensert) +Date created: 2021/09/13 +Last modified: 2021/12/26 +Description: An implementation of a Graph Attention Network (GAT) for node classification. +Accelerator: GPU +""" + +""" +## Introduction + +[Graph neural networks](https://en.wikipedia.org/wiki/Graph_neural_network) +is the preferred neural network architecture for processing data structured as +graphs (for example, social networks or molecule structures), yielding +better results than fully-connected networks or convolutional networks. + +In this tutorial, we will implement a specific graph neural network known as a +[Graph Attention Network](https://arxiv.org/abs/1710.10903) (GAT) to predict labels of +scientific papers based on what type of papers cite them (using the +[Cora](https://linqs.soe.ucsc.edu/data) dataset). 
+ +### References + +For more information on GAT, see the original paper +[Graph Attention Networks](https://arxiv.org/abs/1710.10903) as well as +[DGL's Graph Attention Networks](https://docs.dgl.ai/en/0.4.x/tutorials/models/1_gnn/9_gat.html) +documentation. +""" + +""" +### Import packages +""" + +import tensorflow as tf +from tensorflow import keras +from tensorflow.keras import layers +import numpy as np +import pandas as pd +import os +import warnings + +warnings.filterwarnings("ignore") +pd.set_option("display.max_columns", 6) +pd.set_option("display.max_rows", 6) +np.random.seed(2) + +""" +## Obtain the dataset + +The preparation of the [Cora dataset](https://linqs.soe.ucsc.edu/data) follows that of the +[Node classification with Graph Neural Networks](https://keras.io/examples/graph/gnn_citations/) +tutorial. Refer to this tutorial for more details on the dataset and exploratory data analysis. +In brief, the Cora dataset consists of two files: `cora.cites` which contains *directed links* (citations) between +papers; and `cora.content` which contains *features* of the corresponding papers and one +of seven labels (the *subject* of the paper). +""" + +zip_file = keras.utils.get_file( + fname="cora.tgz", + origin="https://linqs-data.soe.ucsc.edu/public/lbc/cora.tgz", + extract=True, +) + +data_dir = os.path.join(os.path.dirname(zip_file), "cora") + +citations = pd.read_csv( + os.path.join(data_dir, "cora.cites"), + sep="\t", + header=None, + names=["target", "source"], +) + +papers = pd.read_csv( + os.path.join(data_dir, "cora.content"), + sep="\t", + header=None, + names=["paper_id"] + [f"term_{idx}" for idx in range(1433)] + ["subject"], +) + +class_values = sorted(papers["subject"].unique()) +class_idx = {name: id for id, name in enumerate(class_values)} +paper_idx = {name: idx for idx, name in enumerate(sorted(papers["paper_id"].unique()))} + +papers["paper_id"] = papers["paper_id"].apply(lambda name: paper_idx[name]) +citations["source"] = citations["source"].apply(lambda name: paper_idx[name]) +citations["target"] = citations["target"].apply(lambda name: paper_idx[name]) +papers["subject"] = papers["subject"].apply(lambda value: class_idx[value]) + +print(citations) + +print(papers) + +""" +### Split the dataset +""" + +# Obtain random indices +random_indices = np.random.permutation(range(papers.shape[0])) + +# 50/50 split +train_data = papers.iloc[random_indices[: len(random_indices) // 2]] +test_data = papers.iloc[random_indices[len(random_indices) // 2 :]] + +""" +### Prepare the graph data +""" + +# Obtain paper indices which will be used to gather node states +# from the graph later on when training the model +train_indices = train_data["paper_id"].to_numpy() +test_indices = test_data["paper_id"].to_numpy() + +# Obtain ground truth labels corresponding to each paper_id +train_labels = train_data["subject"].to_numpy() +test_labels = test_data["subject"].to_numpy() + +# Define graph, namely an edge tensor and a node feature tensor +edges = tf.convert_to_tensor(citations[["target", "source"]]) +node_states = tf.convert_to_tensor(papers.sort_values("paper_id").iloc[:, 1:-1]) + +# Print shapes of the graph +print("Edges shape:\t\t", edges.shape) +print("Node features shape:", node_states.shape) + +""" +## Build the model + +GAT takes as input a graph (namely an edge tensor and a node feature tensor) and +outputs \[updated\] node states. 
The node states are, for each target node, neighborhood +aggregated information of *N*-hops (where *N* is decided by the number of layers of the +GAT). Importantly, in contrast to the +[graph convolutional network](https://arxiv.org/abs/1609.02907) (GCN) +the GAT makes use of attention mechanisms +to aggregate information from neighboring nodes (or *source nodes*). In other words, instead of simply +averaging/summing node states from source nodes (*source papers*) to the target node (*target papers*), +GAT first applies normalized attention scores to each source node state and then sums. +""" + +""" +### (Multi-head) graph attention layer + +The GAT model implements multi-head graph attention layers. The `MultiHeadGraphAttention` +layer is simply a concatenation (or averaging) of multiple graph attention layers +(`GraphAttention`), each with separate learnable weights `W`. The `GraphAttention` layer +does the following: + +Consider inputs node states `h^{l}` which are linearly transformed by `W^{l}`, resulting in `z^{l}`. + +For each target node: + +1. Computes pair-wise attention scores `a^{l}^{T}(z^{l}_{i}||z^{l}_{j})` for all `j`, +resulting in `e_{ij}` (for all `j`). +`||` denotes a concatenation, `_{i}` corresponds to the target node, and `_{j}` +corresponds to a given 1-hop neighbor/source node. +2. Normalizes `e_{ij}` via softmax, so as the sum of incoming edges' attention scores +to the target node (`sum_{k}{e_{norm}_{ik}}`) will add up to 1. +3. Applies attention scores `e_{norm}_{ij}` to `z_{j}` +and adds it to the new target node state `h^{l+1}_{i}`, for all `j`. +""" + + +class GraphAttention(layers.Layer): + def __init__( + self, + units, + kernel_initializer="glorot_uniform", + kernel_regularizer=None, + **kwargs, + ): + super().__init__(**kwargs) + self.units = units + self.kernel_initializer = keras.initializers.get(kernel_initializer) + self.kernel_regularizer = keras.regularizers.get(kernel_regularizer) + + def build(self, input_shape): + self.kernel = self.add_weight( + shape=(input_shape[0][-1], self.units), + trainable=True, + initializer=self.kernel_initializer, + regularizer=self.kernel_regularizer, + name="kernel", + ) + self.kernel_attention = self.add_weight( + shape=(self.units * 2, 1), + trainable=True, + initializer=self.kernel_initializer, + regularizer=self.kernel_regularizer, + name="kernel_attention", + ) + self.built = True + + def call(self, inputs): + node_states, edges = inputs + + # Linearly transform node states + node_states_transformed = tf.matmul(node_states, self.kernel) + + # (1) Compute pair-wise attention scores + node_states_expanded = tf.gather(node_states_transformed, edges) + node_states_expanded = tf.reshape( + node_states_expanded, (tf.shape(edges)[0], -1) + ) + attention_scores = tf.nn.leaky_relu( + tf.matmul(node_states_expanded, self.kernel_attention) + ) + attention_scores = tf.squeeze(attention_scores, -1) + + # (2) Normalize attention scores + attention_scores = tf.math.exp(tf.clip_by_value(attention_scores, -2, 2)) + attention_scores_sum = tf.math.unsorted_segment_sum( + data=attention_scores, + segment_ids=edges[:, 0], + num_segments=tf.reduce_max(edges[:, 0]) + 1, + ) + attention_scores_sum = tf.repeat( + attention_scores_sum, tf.math.bincount(tf.cast(edges[:, 0], "int32")) + ) + attention_scores_norm = attention_scores / attention_scores_sum + + # (3) Gather node states of neighbors, apply attention scores and aggregate + node_states_neighbors = tf.gather(node_states_transformed, edges[:, 1]) + out = 
tf.math.unsorted_segment_sum( + data=node_states_neighbors * attention_scores_norm[:, tf.newaxis], + segment_ids=edges[:, 0], + num_segments=tf.shape(node_states)[0], + ) + return out + + +class MultiHeadGraphAttention(layers.Layer): + def __init__(self, units, num_heads=8, merge_type="concat", **kwargs): + super().__init__(**kwargs) + self.num_heads = num_heads + self.merge_type = merge_type + self.attention_layers = [GraphAttention(units) for _ in range(num_heads)] + + def call(self, inputs): + atom_features, pair_indices = inputs + + # Obtain outputs from each attention head + outputs = [ + attention_layer([atom_features, pair_indices]) + for attention_layer in self.attention_layers + ] + # Concatenate or average the node states from each head + if self.merge_type == "concat": + outputs = tf.concat(outputs, axis=-1) + else: + outputs = tf.reduce_mean(tf.stack(outputs, axis=-1), axis=-1) + # Activate and return node states + return tf.nn.relu(outputs) + + +""" +### Implement training logic with custom `train_step`, `test_step`, and `predict_step` methods + +Notice, the GAT model operates on the entire graph (namely, `node_states` and +`edges`) in all phases (training, validation and testing). Hence, `node_states` and +`edges` are passed to the constructor of the `keras.Model` and used as attributes. +The difference between the phases are the indices (and labels), which gathers +certain outputs (`tf.gather(outputs, indices)`). + +""" + + +class GraphAttentionNetwork(keras.Model): + def __init__( + self, + node_states, + edges, + hidden_units, + num_heads, + num_layers, + output_dim, + **kwargs, + ): + super().__init__(**kwargs) + self.node_states = node_states + self.edges = edges + self.preprocess = layers.Dense(hidden_units * num_heads, activation="relu") + self.attention_layers = [ + MultiHeadGraphAttention(hidden_units, num_heads) for _ in range(num_layers) + ] + self.output_layer = layers.Dense(output_dim) + + def call(self, inputs): + node_states, edges = inputs + x = self.preprocess(node_states) + for attention_layer in self.attention_layers: + x = attention_layer([x, edges]) + x + outputs = self.output_layer(x) + return outputs + + def train_step(self, data): + indices, labels = data + + with tf.GradientTape() as tape: + # Forward pass + outputs = self([self.node_states, self.edges]) + # Compute loss + loss = self.compiled_loss(labels, tf.gather(outputs, indices)) + # Compute gradients + grads = tape.gradient(loss, self.trainable_weights) + # Apply gradients (update weights) + optimizer.apply_gradients(zip(grads, self.trainable_weights)) + # Update metric(s) + self.compiled_metrics.update_state(labels, tf.gather(outputs, indices)) + + return {m.name: m.result() for m in self.metrics} + + def predict_step(self, data): + indices = data + # Forward pass + outputs = self([self.node_states, self.edges]) + # Compute probabilities + return tf.nn.softmax(tf.gather(outputs, indices)) + + def test_step(self, data): + indices, labels = data + # Forward pass + outputs = self([self.node_states, self.edges]) + # Compute loss + loss = self.compiled_loss(labels, tf.gather(outputs, indices)) + # Update metric(s) + self.compiled_metrics.update_state(labels, tf.gather(outputs, indices)) + + return {m.name: m.result() for m in self.metrics} + + +""" +### Train and evaluate +""" + +# Define hyper-parameters +HIDDEN_UNITS = 100 +NUM_HEADS = 8 +NUM_LAYERS = 3 +OUTPUT_DIM = len(class_values) + +NUM_EPOCHS = 100 +BATCH_SIZE = 256 +VALIDATION_SPLIT = 0.1 +LEARNING_RATE = 3e-1 +MOMENTUM = 0.9 + +loss_fn = 
keras.losses.SparseCategoricalCrossentropy(from_logits=True) +optimizer = keras.optimizers.SGD(LEARNING_RATE, momentum=MOMENTUM) +accuracy_fn = keras.metrics.SparseCategoricalAccuracy(name="acc") +early_stopping = keras.callbacks.EarlyStopping( + monitor="val_acc", min_delta=1e-5, patience=5, restore_best_weights=True +) + +# Build model +gat_model = GraphAttentionNetwork( + node_states, edges, HIDDEN_UNITS, NUM_HEADS, NUM_LAYERS, OUTPUT_DIM +) + +# Compile model +gat_model.compile(loss=loss_fn, optimizer=optimizer, metrics=[accuracy_fn]) + +gat_model.fit( + x=train_indices, + y=train_labels, + validation_split=VALIDATION_SPLIT, + batch_size=BATCH_SIZE, + epochs=NUM_EPOCHS, + callbacks=[early_stopping], + verbose=2, +) + +_, test_accuracy = gat_model.evaluate(x=test_indices, y=test_labels, verbose=0) + +print("--" * 38 + f"\nTest Accuracy {test_accuracy*100:.1f}%") + +""" +### Predict (probabilities) +""" +test_probs = gat_model.predict(x=test_indices) + +mapping = {v: k for (k, v) in class_idx.items()} + +for i, (probs, label) in enumerate(zip(test_probs[:10], test_labels[:10])): + print(f"Example {i+1}: {mapping[label]}") + for j, c in zip(probs, class_idx.keys()): + print(f"\tProbability of {c: <24} = {j*100:7.3f}%") + print("---" * 20) + +""" +## Conclusions + +The results look OK! The GAT model seems to correctly predict the subjects of the papers, +based on what they cite, about 80% of the time. Further improvements could be +made by fine-tuning the hyper-parameters of the GAT. For instance, try changing the number of layers, +the number of hidden units, or the optimizer/learning rate; add regularization (e.g., dropout); +or modify the preprocessing step. We could also try to implement *self-loops* +(i.e., paper X cites paper X) and/or make the graph *undirected*. +""" diff --git a/knowledge_base/graph/gnn_citations.py b/knowledge_base/graph/gnn_citations.py new file mode 100644 index 0000000000000000000000000000000000000000..8165cb5a05bd15e2762bd03b121a2416ec2fd9e2 --- /dev/null +++ b/knowledge_base/graph/gnn_citations.py @@ -0,0 +1,706 @@ +""" +Title: Node Classification with Graph Neural Networks +Author: [Khalid Salama](https://www.linkedin.com/in/khalid-salama-24403144/) +Date created: 2021/05/30 +Last modified: 2021/05/30 +Description: Implementing a graph neural network model for predicting the topic of a paper given its citations. +Accelerator: GPU +""" + +""" +## Introduction + +Many datasets in various machine learning (ML) applications have structural relationships +between their entities, which can be represented as graphs. Such application includes +social and communication networks analysis, traffic prediction, and fraud detection. +[Graph representation Learning](https://www.cs.mcgill.ca/~wlh/grl_book/) +aims to build and train models for graph datasets to be used for a variety of ML tasks. + +This example demonstrate a simple implementation of a [Graph Neural Network](https://arxiv.org/pdf/1901.00596.pdf) +(GNN) model. The model is used for a node prediction task on the [Cora dataset](https://relational.fit.cvut.cz/dataset/CORA) +to predict the subject of a paper given its words and citations network. + +Note that, **we implement a Graph Convolution Layer from scratch** to provide better +understanding of how they work. 
However, there is a number of specialized TensorFlow-based +libraries that provide rich GNN APIs, such as [Spectral](https://graphneural.network/), +[StellarGraph](https://stellargraph.readthedocs.io/en/stable/README.html), and +[GraphNets](https://github.com/deepmind/graph_nets). +""" + +""" +## Setup +""" + +import os +import pandas as pd +import numpy as np +import networkx as nx +import matplotlib.pyplot as plt +import tensorflow as tf +from tensorflow import keras +from tensorflow.keras import layers + +""" +## Prepare the Dataset + +The Cora dataset consists of 2,708 scientific papers classified into one of seven classes. +The citation network consists of 5,429 links. Each paper has a binary word vector of size +1,433, indicating the presence of a corresponding word. + +### Download the dataset + +The dataset has two tap-separated files: `cora.cites` and `cora.content`. + +1. The `cora.cites` includes the citation records with two columns: +`cited_paper_id` (target) and `citing_paper_id` (source). +2. The `cora.content` includes the paper content records with 1,435 columns: +`paper_id`, `subject`, and 1,433 binary features. + +Let's download the dataset. +""" + +zip_file = keras.utils.get_file( + fname="cora.tgz", + origin="https://linqs-data.soe.ucsc.edu/public/lbc/cora.tgz", + extract=True, +) +data_dir = os.path.join(os.path.dirname(zip_file), "cora") +""" +### Process and visualize the dataset + +Then we load the citations data into a Pandas DataFrame. +""" + +citations = pd.read_csv( + os.path.join(data_dir, "cora.cites"), + sep="\t", + header=None, + names=["target", "source"], +) +print("Citations shape:", citations.shape) + +""" +Now we display a sample of the `citations` DataFrame. +The `target` column includes the paper ids cited by the paper ids in the `source` column. +""" + +citations.sample(frac=1).head() + +""" +Now let's load the papers data into a Pandas DataFrame. +""" + +column_names = ["paper_id"] + [f"term_{idx}" for idx in range(1433)] + ["subject"] +papers = pd.read_csv( + os.path.join(data_dir, "cora.content"), + sep="\t", + header=None, + names=column_names, +) +print("Papers shape:", papers.shape) + +""" +Now we display a sample of the `papers` DataFrame. The DataFrame includes the `paper_id` +and the `subject` columns, as well as 1,433 binary column representing whether a term exists +in the paper or not. +""" + +print(papers.sample(5).T) + +""" +Let's display the count of the papers in each subject. +""" + +print(papers.subject.value_counts()) + +""" +We convert the paper ids and the subjects into zero-based indices. +""" + +class_values = sorted(papers["subject"].unique()) +class_idx = {name: id for id, name in enumerate(class_values)} +paper_idx = {name: idx for idx, name in enumerate(sorted(papers["paper_id"].unique()))} + +papers["paper_id"] = papers["paper_id"].apply(lambda name: paper_idx[name]) +citations["source"] = citations["source"].apply(lambda name: paper_idx[name]) +citations["target"] = citations["target"].apply(lambda name: paper_idx[name]) +papers["subject"] = papers["subject"].apply(lambda value: class_idx[value]) + +""" +Now let's visualize the citation graph. Each node in the graph represents a paper, +and the color of the node corresponds to its subject. Note that we only show a sample of +the papers in the dataset. 
+""" + +plt.figure(figsize=(10, 10)) +colors = papers["subject"].tolist() +cora_graph = nx.from_pandas_edgelist(citations.sample(n=1500)) +subjects = list(papers[papers["paper_id"].isin(list(cora_graph.nodes))]["subject"]) +nx.draw_spring(cora_graph, node_size=15, node_color=subjects) + + +""" +### Split the dataset into stratified train and test sets +""" + +train_data, test_data = [], [] + +for _, group_data in papers.groupby("subject"): + # Select around 50% of the dataset for training. + random_selection = np.random.rand(len(group_data.index)) <= 0.5 + train_data.append(group_data[random_selection]) + test_data.append(group_data[~random_selection]) + +train_data = pd.concat(train_data).sample(frac=1) +test_data = pd.concat(test_data).sample(frac=1) + +print("Train data shape:", train_data.shape) +print("Test data shape:", test_data.shape) + +""" +## Implement Train and Evaluate Experiment +""" + +hidden_units = [32, 32] +learning_rate = 0.01 +dropout_rate = 0.5 +num_epochs = 300 +batch_size = 256 + +""" +This function compiles and trains an input model using the given training data. +""" + + +def run_experiment(model, x_train, y_train): + # Compile the model. + model.compile( + optimizer=keras.optimizers.Adam(learning_rate), + loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), + metrics=[keras.metrics.SparseCategoricalAccuracy(name="acc")], + ) + # Create an early stopping callback. + early_stopping = keras.callbacks.EarlyStopping( + monitor="val_acc", patience=50, restore_best_weights=True + ) + # Fit the model. + history = model.fit( + x=x_train, + y=y_train, + epochs=num_epochs, + batch_size=batch_size, + validation_split=0.15, + callbacks=[early_stopping], + ) + + return history + + +""" +This function displays the loss and accuracy curves of the model during training. +""" + + +def display_learning_curves(history): + fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5)) + + ax1.plot(history.history["loss"]) + ax1.plot(history.history["val_loss"]) + ax1.legend(["train", "test"], loc="upper right") + ax1.set_xlabel("Epochs") + ax1.set_ylabel("Loss") + + ax2.plot(history.history["acc"]) + ax2.plot(history.history["val_acc"]) + ax2.legend(["train", "test"], loc="upper right") + ax2.set_xlabel("Epochs") + ax2.set_ylabel("Accuracy") + plt.show() + + +""" +## Implement Feedforward Network (FFN) Module + +We will use this module in the baseline and the GNN models. +""" + + +def create_ffn(hidden_units, dropout_rate, name=None): + fnn_layers = [] + + for units in hidden_units: + fnn_layers.append(layers.BatchNormalization()) + fnn_layers.append(layers.Dropout(dropout_rate)) + fnn_layers.append(layers.Dense(units, activation=tf.nn.gelu)) + + return keras.Sequential(fnn_layers, name=name) + + +""" +## Build a Baseline Neural Network Model + +### Prepare the data for the baseline model +""" + +feature_names = list(set(papers.columns) - {"paper_id", "subject"}) +num_features = len(feature_names) +num_classes = len(class_idx) + +# Create train and test features as a numpy array. +x_train = train_data[feature_names].to_numpy() +x_test = test_data[feature_names].to_numpy() +# Create train and test targets as a numpy array. +y_train = train_data["subject"] +y_test = test_data["subject"] + +""" +### Implement a baseline classifier + +We add five FFN blocks with skip connections, so that we generate a baseline model with +roughly the same number of parameters as the GNN models to be built later. 
+""" + + +def create_baseline_model(hidden_units, num_classes, dropout_rate=0.2): + inputs = layers.Input(shape=(num_features,), name="input_features") + x = create_ffn(hidden_units, dropout_rate, name=f"ffn_block1")(inputs) + for block_idx in range(4): + # Create an FFN block. + x1 = create_ffn(hidden_units, dropout_rate, name=f"ffn_block{block_idx + 2}")(x) + # Add skip connection. + x = layers.Add(name=f"skip_connection{block_idx + 2}")([x, x1]) + # Compute logits. + logits = layers.Dense(num_classes, name="logits")(x) + # Create the model. + return keras.Model(inputs=inputs, outputs=logits, name="baseline") + + +baseline_model = create_baseline_model(hidden_units, num_classes, dropout_rate) +baseline_model.summary() + +""" +### Train the baseline classifier +""" + +history = run_experiment(baseline_model, x_train, y_train) + +""" +Let's plot the learning curves. +""" + +display_learning_curves(history) + +""" +Now we evaluate the baseline model on the test data split. +""" + +_, test_accuracy = baseline_model.evaluate(x=x_test, y=y_test, verbose=0) +print(f"Test accuracy: {round(test_accuracy * 100, 2)}%") + +""" +### Examine the baseline model predictions + +Let's create new data instances by randomly generating binary word vectors with respect to +the word presence probabilities. +""" + + +def generate_random_instances(num_instances): + token_probability = x_train.mean(axis=0) + instances = [] + for _ in range(num_instances): + probabilities = np.random.uniform(size=len(token_probability)) + instance = (probabilities <= token_probability).astype(int) + instances.append(instance) + + return np.array(instances) + + +def display_class_probabilities(probabilities): + for instance_idx, probs in enumerate(probabilities): + print(f"Instance {instance_idx + 1}:") + for class_idx, prob in enumerate(probs): + print(f"- {class_values[class_idx]}: {round(prob * 100, 2)}%") + + +""" +Now we show the baseline model predictions given these randomly generated instances. +""" + +new_instances = generate_random_instances(num_classes) +logits = baseline_model.predict(new_instances) +probabilities = keras.activations.softmax(tf.convert_to_tensor(logits)).numpy() +display_class_probabilities(probabilities) + +""" +## Build a Graph Neural Network Model + +### Prepare the data for the graph model + +Preparing and loading the graphs data into the model for training is the most challenging +part in GNN models, which is addressed in different ways by the specialised libraries. +In this example, we show a simple approach for preparing and using graph data that is suitable +if your dataset consists of a single graph that fits entirely in memory. + +The graph data is represented by the `graph_info` tuple, which consists of the following +three elements: + +1. `node_features`: This is a `[num_nodes, num_features]` NumPy array that includes the +node features. In this dataset, the nodes are the papers, and the `node_features` are the +word-presence binary vectors of each paper. +2. `edges`: This is `[num_edges, num_edges]` NumPy array representing a sparse +[adjacency matrix](https://en.wikipedia.org/wiki/Adjacency_matrix#:~:text=In%20graph%20theory%20and%20computer,with%20zeros%20on%20its%20diagonal.) +of the links between the nodes. In this example, the links are the citations between the papers. +3. `edge_weights` (optional): This is a `[num_edges]` NumPy array that includes the edge weights, which *quantify* +the relationships between nodes in the graph. 
In this example, there are no weights for the paper citations. +""" + +# Create an edges array (sparse adjacency matrix) of shape [2, num_edges]. +edges = citations[["source", "target"]].to_numpy().T +# Create an edge weights array of ones. +edge_weights = tf.ones(shape=edges.shape[1]) +# Create a node features array of shape [num_nodes, num_features]. +node_features = tf.cast( + papers.sort_values("paper_id")[feature_names].to_numpy(), dtype=tf.dtypes.float32 +) +# Create graph info tuple with node_features, edges, and edge_weights. +graph_info = (node_features, edges, edge_weights) + +print("Edges shape:", edges.shape) +print("Nodes shape:", node_features.shape) + +""" +### Implement a graph convolution layer + +We implement a graph convolution module as a [Keras Layer](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Layer?version=nightly). +Our `GraphConvLayer` performs the following steps: + +1. **Prepare**: The input node representations are processed using a FFN to produce a *message*. You can simplify +the processing by only applying linear transformation to the representations. +2. **Aggregate**: The messages of the neighbours of each node are aggregated with +respect to the `edge_weights` using a *permutation invariant* pooling operation, such as *sum*, *mean*, and *max*, +to prepare a single aggregated message for each node. See, for example, [tf.math.unsorted_segment_sum](https://www.tensorflow.org/api_docs/python/tf/math/unsorted_segment_sum) +APIs used to aggregate neighbour messages. +3. **Update**: The `node_repesentations` and `aggregated_messages`โ€”both of shape `[num_nodes, representation_dim]`โ€” +are combined and processed to produce the new state of the node representations (node embeddings). +If `combination_type` is `gru`, the `node_repesentations` and `aggregated_messages` are stacked to create a sequence, +then processed by a GRU layer. Otherwise, the `node_repesentations` and `aggregated_messages` are added +or concatenated, then processed using a FFN. + + +The technique implemented use ideas from [Graph Convolutional Networks](https://arxiv.org/abs/1609.02907), +[GraphSage](https://arxiv.org/abs/1706.02216), [Graph Isomorphism Network](https://arxiv.org/abs/1810.00826), +[Simple Graph Networks](https://arxiv.org/abs/1902.07153), and +[Gated Graph Sequence Neural Networks](https://arxiv.org/abs/1511.05493). +Two other key techniques that are not covered are [Graph Attention Networks](https://arxiv.org/abs/1710.10903) +and [Message Passing Neural Networks](https://arxiv.org/abs/1704.01212). 
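+
+As a quick, self-contained illustration of the segment-based aggregation used in the
+**Aggregate** step above, here is a toy example (the values are made up and are not part
+of the model): each edge carries one message, and `tf.math.unsorted_segment_sum`
+adds up the messages arriving at the same target node.
+
+```python
+import tensorflow as tf
+
+neighbour_messages = tf.constant([[1.0], [2.0], [4.0]])  # one message per edge
+node_indices = tf.constant([0, 0, 1])  # target node of each edge
+aggregated = tf.math.unsorted_segment_sum(
+    neighbour_messages, node_indices, num_segments=2
+)
+print(aggregated)  # [[3.], [4.]]: node 0 sums two messages, node 1 receives one
+```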
+""" + + +def create_gru(hidden_units, dropout_rate): + inputs = keras.layers.Input(shape=(2, hidden_units[0])) + x = inputs + for units in hidden_units: + x = layers.GRU( + units=units, + activation="tanh", + recurrent_activation="sigmoid", + return_sequences=True, + dropout=dropout_rate, + return_state=False, + recurrent_dropout=dropout_rate, + )(x) + return keras.Model(inputs=inputs, outputs=x) + + +class GraphConvLayer(layers.Layer): + def __init__( + self, + hidden_units, + dropout_rate=0.2, + aggregation_type="mean", + combination_type="concat", + normalize=False, + *args, + **kwargs, + ): + super().__init__(*args, **kwargs) + + self.aggregation_type = aggregation_type + self.combination_type = combination_type + self.normalize = normalize + + self.ffn_prepare = create_ffn(hidden_units, dropout_rate) + if self.combination_type == "gru": + self.update_fn = create_gru(hidden_units, dropout_rate) + else: + self.update_fn = create_ffn(hidden_units, dropout_rate) + + def prepare(self, node_repesentations, weights=None): + # node_repesentations shape is [num_edges, embedding_dim]. + messages = self.ffn_prepare(node_repesentations) + if weights is not None: + messages = messages * tf.expand_dims(weights, -1) + return messages + + def aggregate(self, node_indices, neighbour_messages, node_repesentations): + # node_indices shape is [num_edges]. + # neighbour_messages shape: [num_edges, representation_dim]. + # node_repesentations shape is [num_nodes, representation_dim] + num_nodes = node_repesentations.shape[0] + if self.aggregation_type == "sum": + aggregated_message = tf.math.unsorted_segment_sum( + neighbour_messages, node_indices, num_segments=num_nodes + ) + elif self.aggregation_type == "mean": + aggregated_message = tf.math.unsorted_segment_mean( + neighbour_messages, node_indices, num_segments=num_nodes + ) + elif self.aggregation_type == "max": + aggregated_message = tf.math.unsorted_segment_max( + neighbour_messages, node_indices, num_segments=num_nodes + ) + else: + raise ValueError(f"Invalid aggregation type: {self.aggregation_type}.") + + return aggregated_message + + def update(self, node_repesentations, aggregated_messages): + # node_repesentations shape is [num_nodes, representation_dim]. + # aggregated_messages shape is [num_nodes, representation_dim]. + if self.combination_type == "gru": + # Create a sequence of two elements for the GRU layer. + h = tf.stack([node_repesentations, aggregated_messages], axis=1) + elif self.combination_type == "concat": + # Concatenate the node_repesentations and aggregated_messages. + h = tf.concat([node_repesentations, aggregated_messages], axis=1) + elif self.combination_type == "add": + # Add node_repesentations and aggregated_messages. + h = node_repesentations + aggregated_messages + else: + raise ValueError(f"Invalid combination type: {self.combination_type}.") + + # Apply the processing function. + node_embeddings = self.update_fn(h) + if self.combination_type == "gru": + node_embeddings = tf.unstack(node_embeddings, axis=1)[-1] + + if self.normalize: + node_embeddings = tf.nn.l2_normalize(node_embeddings, axis=-1) + return node_embeddings + + def call(self, inputs): + """Process the inputs to produce the node_embeddings. + + inputs: a tuple of three elements: node_repesentations, edges, edge_weights. + Returns: node_embeddings of shape [num_nodes, representation_dim]. + """ + + node_repesentations, edges, edge_weights = inputs + # Get node_indices (source) and neighbour_indices (target) from edges. 
+ node_indices, neighbour_indices = edges[0], edges[1] + # neighbour_repesentations shape is [num_edges, representation_dim]. + neighbour_repesentations = tf.gather(node_repesentations, neighbour_indices) + + # Prepare the messages of the neighbours. + neighbour_messages = self.prepare(neighbour_repesentations, edge_weights) + # Aggregate the neighbour messages. + aggregated_messages = self.aggregate( + node_indices, neighbour_messages, node_repesentations + ) + # Update the node embedding with the neighbour messages. + return self.update(node_repesentations, aggregated_messages) + + +""" +### Implement a graph neural network node classifier + +The GNN classification model follows the [Design Space for Graph Neural Networks](https://arxiv.org/abs/2011.08843) approach, +as follows: + +1. Apply preprocessing using FFN to the node features to generate initial node representations. +2. Apply one or more graph convolutional layer, with skip connections, to the node representation +to produce node embeddings. +3. Apply post-processing using FFN to the node embeddings to generate the final node embeddings. +4. Feed the node embeddings in a Softmax layer to predict the node class. + +Each graph convolutional layer added captures information from a further level of neighbours. +However, adding many graph convolutional layer can cause oversmoothing, where the model +produces similar embeddings for all the nodes. + +Note that the `graph_info` passed to the constructor of the Keras model, and used as a *property* +of the Keras model object, rather than input data for training or prediction. +The model will accept a **batch** of `node_indices`, which are used to lookup the +node features and neighbours from the `graph_info`. +""" + + +class GNNNodeClassifier(tf.keras.Model): + def __init__( + self, + graph_info, + num_classes, + hidden_units, + aggregation_type="sum", + combination_type="concat", + dropout_rate=0.2, + normalize=True, + *args, + **kwargs, + ): + super().__init__(*args, **kwargs) + + # Unpack graph_info to three elements: node_features, edges, and edge_weight. + node_features, edges, edge_weights = graph_info + self.node_features = node_features + self.edges = edges + self.edge_weights = edge_weights + # Set edge_weights to ones if not provided. + if self.edge_weights is None: + self.edge_weights = tf.ones(shape=edges.shape[1]) + # Scale edge_weights to sum to 1. + self.edge_weights = self.edge_weights / tf.math.reduce_sum(self.edge_weights) + + # Create a process layer. + self.preprocess = create_ffn(hidden_units, dropout_rate, name="preprocess") + # Create the first GraphConv layer. + self.conv1 = GraphConvLayer( + hidden_units, + dropout_rate, + aggregation_type, + combination_type, + normalize, + name="graph_conv1", + ) + # Create the second GraphConv layer. + self.conv2 = GraphConvLayer( + hidden_units, + dropout_rate, + aggregation_type, + combination_type, + normalize, + name="graph_conv2", + ) + # Create a postprocess layer. + self.postprocess = create_ffn(hidden_units, dropout_rate, name="postprocess") + # Create a compute logits layer. + self.compute_logits = layers.Dense(units=num_classes, name="logits") + + def call(self, input_node_indices): + # Preprocess the node_features to produce node representations. + x = self.preprocess(self.node_features) + # Apply the first graph conv layer. + x1 = self.conv1((x, self.edges, self.edge_weights)) + # Skip connection. + x = x1 + x + # Apply the second graph conv layer. 
+ x2 = self.conv2((x, self.edges, self.edge_weights)) + # Skip connection. + x = x2 + x + # Postprocess node embedding. + x = self.postprocess(x) + # Fetch node embeddings for the input node_indices. + node_embeddings = tf.gather(x, input_node_indices) + # Compute logits + return self.compute_logits(node_embeddings) + + +""" +Let's test instantiating and calling the GNN model. +Notice that if you provide `N` node indices, the output will be a tensor of shape `[N, num_classes]`, +regardless of the size of the graph. +""" + +gnn_model = GNNNodeClassifier( + graph_info=graph_info, + num_classes=num_classes, + hidden_units=hidden_units, + dropout_rate=dropout_rate, + name="gnn_model", +) + +print("GNN output shape:", gnn_model([1, 10, 100])) + +gnn_model.summary() + +""" +### Train the GNN model + +Note that we use the standard *supervised* cross-entropy loss to train the model. +However, we can add another *self-supervised* loss term for the generated node embeddings +that makes sure that neighbouring nodes in graph have similar representations, while faraway +nodes have dissimilar representations. +""" + +x_train = train_data.paper_id.to_numpy() +history = run_experiment(gnn_model, x_train, y_train) + +""" +Let's plot the learning curves +""" + +display_learning_curves(history) + +""" +Now we evaluate the GNN model on the test data split. +The results may vary depending on the training sample, however the GNN model always outperforms +the baseline model in terms of the test accuracy. +""" + +x_test = test_data.paper_id.to_numpy() +_, test_accuracy = gnn_model.evaluate(x=x_test, y=y_test, verbose=0) +print(f"Test accuracy: {round(test_accuracy * 100, 2)}%") + +""" +### Examine the GNN model predictions + +Let's add the new instances as nodes to the `node_features`, and generate links +(citations) to existing nodes. +""" + +# First we add the N new_instances as nodes to the graph +# by appending the new_instance to node_features. +num_nodes = node_features.shape[0] +new_node_features = np.concatenate([node_features, new_instances]) +# Second we add the M edges (citations) from each new node to a set +# of existing nodes in a particular subject +new_node_indices = [i + num_nodes for i in range(num_classes)] +new_citations = [] +for subject_idx, group in papers.groupby("subject"): + subject_papers = list(group.paper_id) + # Select random x papers specific subject. + selected_paper_indices1 = np.random.choice(subject_papers, 5) + # Select random y papers from any subject (where y < x). + selected_paper_indices2 = np.random.choice(list(papers.paper_id), 2) + # Merge the selected paper indices. + selected_paper_indices = np.concatenate( + [selected_paper_indices1, selected_paper_indices2], axis=0 + ) + # Create edges between a citing paper idx and the selected cited papers. + citing_paper_indx = new_node_indices[subject_idx] + for cited_paper_idx in selected_paper_indices: + new_citations.append([citing_paper_indx, cited_paper_idx]) + +new_citations = np.array(new_citations).T +new_edges = np.concatenate([edges, new_citations], axis=1) + +""" +Now let's update the `node_features` and the `edges` in the GNN model. 
+""" + +print("Original node_features shape:", gnn_model.node_features.shape) +print("Original edges shape:", gnn_model.edges.shape) +gnn_model.node_features = new_node_features +gnn_model.edges = new_edges +gnn_model.edge_weights = tf.ones(shape=new_edges.shape[1]) +print("New node_features shape:", gnn_model.node_features.shape) +print("New edges shape:", gnn_model.edges.shape) + +logits = gnn_model.predict(tf.convert_to_tensor(new_node_indices)) +probabilities = keras.activations.softmax(tf.convert_to_tensor(logits)).numpy() +display_class_probabilities(probabilities) + +""" +Notice that the probabilities of the expected subjects +(to which several citations are added) are higher compared to the baseline model. +""" diff --git a/knowledge_base/graph/mpnn-molecular-graphs.py b/knowledge_base/graph/mpnn-molecular-graphs.py new file mode 100644 index 0000000000000000000000000000000000000000..d41e69f4ad238aed3cc77c9bbf25bffe03e90e0c --- /dev/null +++ b/knowledge_base/graph/mpnn-molecular-graphs.py @@ -0,0 +1,675 @@ +""" +Title: Message-passing neural network (MPNN) for molecular property prediction +Author: [akensert](http://github.com/akensert) +Date created: 2021/08/16 +Last modified: 2021/12/27 +Description: Implementation of an MPNN to predict blood-brain barrier permeability. +Accelerator: GPU +""" + +""" +## Introduction + +In this tutorial, we will implement a type of graph neural network (GNN) known as +_ message passing neural network_ (MPNN) to predict graph properties. Specifically, we will +implement an MPNN to predict a molecular property known as +_blood-brain barrier permeability_ (BBBP). + +Motivation: as molecules are naturally represented as an undirected graph `G = (V, E)`, +where `V` is a set or vertices (nodes; atoms) and `E` a set of edges (bonds), GNNs (such +as MPNN) are proving to be a useful method for predicting molecular properties. + +Until now, more traditional methods, such as random forests, support vector machines, etc., +have been commonly used to predict molecular properties. In contrast to GNNs, these +traditional approaches often operate on precomputed molecular features such as +molecular weight, polarity, charge, number of carbon atoms, etc. Although these +molecular features prove to be good predictors for various molecular properties, it is +hypothesized that operating on these more "raw", "low-level", features could prove even +better. + +### References + +In recent years, a lot of effort has been put into developing neural networks for +graph data, including molecular graphs. For a summary of graph neural networks, see e.g., +[A Comprehensive Survey on Graph Neural Networks](https://arxiv.org/abs/1901.00596) and +[Graph Neural Networks: A Review of Methods and Applications](https://arxiv.org/abs/1812.08434); +and for further reading on the specific +graph neural network implemented in this tutorial see +[Neural Message Passing for Quantum Chemistry](https://arxiv.org/abs/1704.01212) and +[DeepChem's MPNNModel](https://deepchem.readthedocs.io/en/latest/api_reference/models.html#mpnnmodel). +""" + +""" +## Setup + +### Install RDKit and other dependencies + +(Text below taken from +[this tutorial](https://keras.io/examples/generative/wgan-graphs/)). + +[RDKit](https://www.rdkit.org/) is a collection of cheminformatics and machine-learning +software written in C++ and Python. 
In this tutorial, RDKit is used to conveniently and +efficiently transform +[SMILES](https://en.wikipedia.org/wiki/Simplified_molecular-input_line-entry_system) to +molecule objects, and then from those obtain sets of atoms and bonds. + +SMILES expresses the structure of a given molecule in the form of an ASCII string. +The SMILES string is a compact encoding which, for smaller molecules, is relatively +human-readable. Encoding molecules as a string both alleviates and facilitates database +and/or web searching of a given molecule. RDKit uses algorithms to +accurately transform a given SMILES to a molecule object, which can then +be used to compute a great number of molecular properties/features. + +Notice, RDKit is commonly installed via [Conda](https://www.rdkit.org/docs/Install.html). +However, thanks to +[rdkit_platform_wheels](https://github.com/kuelumbus/rdkit_platform_wheels), rdkit +can now (for the sake of this tutorial) be installed easily via pip, as follows: + +``` +pip -q install rdkit-pypi +``` + +And for easy and efficient reading of csv files and visualization, the below needs to be +installed: + +``` +pip -q install pandas +pip -q install Pillow +pip -q install matplotlib +pip -q install pydot +sudo apt-get -qq install graphviz +``` +""" + +""" +### Import packages +""" + +import os + +# Temporary suppress tf logs +os.environ["TF_CPP_MIN_LOG_LEVEL"] = "3" + +import tensorflow as tf +from tensorflow import keras +from tensorflow.keras import layers +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt +import warnings +from rdkit import Chem +from rdkit import RDLogger +from rdkit.Chem.Draw import IPythonConsole +from rdkit.Chem.Draw import MolsToGridImage + +# Temporary suppress warnings and RDKit logs +warnings.filterwarnings("ignore") +RDLogger.DisableLog("rdApp.*") + +np.random.seed(42) +tf.random.set_seed(42) + +""" +## Dataset + +Information about the dataset can be found in +[A Bayesian Approach to in Silico Blood-Brain Barrier Penetration Modeling](https://pubs.acs.org/doi/10.1021/ci300124c) +and [MoleculeNet: A Benchmark for Molecular Machine Learning](https://arxiv.org/abs/1703.00564). +The dataset will be downloaded from [MoleculeNet.org](https://moleculenet.org/datasets-1). + +### About + +The dataset contains **2,050** molecules. Each molecule come with a **name**, **label** +and **SMILES** string. + +The blood-brain barrier (BBB) is a membrane separating the blood from the brain +extracellular fluid, hence blocking out most drugs (molecules) from reaching +the brain. Because of this, the BBBP has been important to study for the development of +new drugs that aim to target the central nervous system. The labels for this +data set are binary (1 or 0) and indicate the permeability of the molecules. +""" + +csv_path = keras.utils.get_file( + "BBBP.csv", "https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/BBBP.csv" +) + +df = pd.read_csv(csv_path, usecols=[1, 2, 3]) +df.iloc[96:104] + +""" +### Define features + +To encode features for atoms and bonds (which we will need later), +we'll define two classes: `AtomFeaturizer` and `BondFeaturizer` respectively. 
+ +To reduce the lines of code, i.e., to keep this tutorial short and concise, +only about a handful of (atom and bond) features will be considered: \[atom features\] +[symbol (element)](https://en.wikipedia.org/wiki/Chemical_element), +[number of valence electrons](https://en.wikipedia.org/wiki/Valence_electron), +[number of hydrogen bonds](https://en.wikipedia.org/wiki/Hydrogen), +[orbital hybridization](https://en.wikipedia.org/wiki/Orbital_hybridisation), +\[bond features\] +[(covalent) bond type](https://en.wikipedia.org/wiki/Covalent_bond), and +[conjugation](https://en.wikipedia.org/wiki/Conjugated_system). +""" + + +class Featurizer: + def __init__(self, allowable_sets): + self.dim = 0 + self.features_mapping = {} + for k, s in allowable_sets.items(): + s = sorted(list(s)) + self.features_mapping[k] = dict(zip(s, range(self.dim, len(s) + self.dim))) + self.dim += len(s) + + def encode(self, inputs): + output = np.zeros((self.dim,)) + for name_feature, feature_mapping in self.features_mapping.items(): + feature = getattr(self, name_feature)(inputs) + if feature not in feature_mapping: + continue + output[feature_mapping[feature]] = 1.0 + return output + + +class AtomFeaturizer(Featurizer): + def __init__(self, allowable_sets): + super().__init__(allowable_sets) + + def symbol(self, atom): + return atom.GetSymbol() + + def n_valence(self, atom): + return atom.GetTotalValence() + + def n_hydrogens(self, atom): + return atom.GetTotalNumHs() + + def hybridization(self, atom): + return atom.GetHybridization().name.lower() + + +class BondFeaturizer(Featurizer): + def __init__(self, allowable_sets): + super().__init__(allowable_sets) + self.dim += 1 + + def encode(self, bond): + output = np.zeros((self.dim,)) + if bond is None: + output[-1] = 1.0 + return output + output = super().encode(bond) + return output + + def bond_type(self, bond): + return bond.GetBondType().name.lower() + + def conjugated(self, bond): + return bond.GetIsConjugated() + + +atom_featurizer = AtomFeaturizer( + allowable_sets={ + "symbol": {"B", "Br", "C", "Ca", "Cl", "F", "H", "I", "N", "Na", "O", "P", "S"}, + "n_valence": {0, 1, 2, 3, 4, 5, 6}, + "n_hydrogens": {0, 1, 2, 3, 4}, + "hybridization": {"s", "sp", "sp2", "sp3"}, + } +) + +bond_featurizer = BondFeaturizer( + allowable_sets={ + "bond_type": {"single", "double", "triple", "aromatic"}, + "conjugated": {True, False}, + } +) + + +""" +### Generate graphs + +Before we can generate complete graphs from SMILES, we need to implement the following functions: + +1. `molecule_from_smiles`, which takes as input a SMILES and returns a molecule object. +This is all handled by RDKit. + +2. `graph_from_molecule`, which takes as input a molecule object and returns a graph, +represented as a three-tuple (atom_features, bond_features, pair_indices). For this we +will make use of the classes defined previously. + +Finally, we can now implement the function `graphs_from_smiles`, which applies function (1) +and subsequently (2) on all SMILES of the training, validation and test datasets. + +Notice: although scaffold splitting is recommended for this data set (see +[here](https://arxiv.org/abs/1703.00564)), for simplicity, simple random splittings were +performed. +""" + + +def molecule_from_smiles(smiles): + # MolFromSmiles(m, sanitize=True) should be equivalent to + # MolFromSmiles(m, sanitize=False) -> SanitizeMol(m) -> AssignStereochemistry(m, ...) 
+ molecule = Chem.MolFromSmiles(smiles, sanitize=False) + + # If sanitization is unsuccessful, catch the error, and try again without + # the sanitization step that caused the error + flag = Chem.SanitizeMol(molecule, catchErrors=True) + if flag != Chem.SanitizeFlags.SANITIZE_NONE: + Chem.SanitizeMol(molecule, sanitizeOps=Chem.SanitizeFlags.SANITIZE_ALL ^ flag) + + Chem.AssignStereochemistry(molecule, cleanIt=True, force=True) + return molecule + + +def graph_from_molecule(molecule): + # Initialize graph + atom_features = [] + bond_features = [] + pair_indices = [] + + for atom in molecule.GetAtoms(): + atom_features.append(atom_featurizer.encode(atom)) + + # Add self-loops + pair_indices.append([atom.GetIdx(), atom.GetIdx()]) + bond_features.append(bond_featurizer.encode(None)) + + for neighbor in atom.GetNeighbors(): + bond = molecule.GetBondBetweenAtoms(atom.GetIdx(), neighbor.GetIdx()) + pair_indices.append([atom.GetIdx(), neighbor.GetIdx()]) + bond_features.append(bond_featurizer.encode(bond)) + + return np.array(atom_features), np.array(bond_features), np.array(pair_indices) + + +def graphs_from_smiles(smiles_list): + # Initialize graphs + atom_features_list = [] + bond_features_list = [] + pair_indices_list = [] + + for smiles in smiles_list: + molecule = molecule_from_smiles(smiles) + atom_features, bond_features, pair_indices = graph_from_molecule(molecule) + + atom_features_list.append(atom_features) + bond_features_list.append(bond_features) + pair_indices_list.append(pair_indices) + + # Convert lists to ragged tensors for tf.data.Dataset later on + return ( + tf.ragged.constant(atom_features_list, dtype=tf.float32), + tf.ragged.constant(bond_features_list, dtype=tf.float32), + tf.ragged.constant(pair_indices_list, dtype=tf.int64), + ) + + +# Shuffle array of indices ranging from 0 to 2049 +permuted_indices = np.random.permutation(np.arange(df.shape[0])) + +# Train set: 80 % of data +train_index = permuted_indices[: int(df.shape[0] * 0.8)] +x_train = graphs_from_smiles(df.iloc[train_index].smiles) +y_train = df.iloc[train_index].p_np + +# Valid set: 19 % of data +valid_index = permuted_indices[int(df.shape[0] * 0.8) : int(df.shape[0] * 0.99)] +x_valid = graphs_from_smiles(df.iloc[valid_index].smiles) +y_valid = df.iloc[valid_index].p_np + +# Test set: 1 % of data +test_index = permuted_indices[int(df.shape[0] * 0.99) :] +x_test = graphs_from_smiles(df.iloc[test_index].smiles) +y_test = df.iloc[test_index].p_np + +""" +### Test the functions +""" + +print(f"Name:\t{df.name[100]}\nSMILES:\t{df.smiles[100]}\nBBBP:\t{df.p_np[100]}") +molecule = molecule_from_smiles(df.iloc[100].smiles) +print("Molecule:") +molecule + +""" +""" + +graph = graph_from_molecule(molecule) +print("Graph (including self-loops):") +print("\tatom features\t", graph[0].shape) +print("\tbond features\t", graph[1].shape) +print("\tpair indices\t", graph[2].shape) + + +""" +### Create a `tf.data.Dataset` + +In this tutorial, the MPNN implementation will take as input (per iteration) a single graph. +Therefore, given a batch of (sub)graphs (molecules), we need to merge them into a +single graph (we'll refer to this graph as *global graph*). +This global graph is a disconnected graph where each subgraph is +completely separated from the other subgraphs. 
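+
+As a toy illustration of the merge (made-up numbers, not part of the pipeline):
+a batch of two molecules with 3 and 2 atoms becomes one disconnected graph with
+5 atoms, the pair indices of the second molecule are shifted by 3 so that they
+keep pointing at its own atoms, and a `molecule_indicator` records which
+subgraph each atom belongs to.
+
+```python
+import numpy as np
+
+num_atoms = np.array([3, 2])  # atoms per molecule in the batch
+pair_indices_mol2 = np.array([[0, 1], [1, 0]])  # one bond (both directions) in molecule 2
+increment = np.cumsum(num_atoms)[:-1][0]  # 3 atoms come before molecule 2
+print(pair_indices_mol2 + increment)  # [[3 4] [4 3]]
+print(np.repeat([0, 1], num_atoms))  # molecule_indicator: [0 0 0 1 1]
+```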
+""" + + +def prepare_batch(x_batch, y_batch): + """Merges (sub)graphs of batch into a single global (disconnected) graph""" + + atom_features, bond_features, pair_indices = x_batch + + # Obtain number of atoms and bonds for each graph (molecule) + num_atoms = atom_features.row_lengths() + num_bonds = bond_features.row_lengths() + + # Obtain partition indices (molecule_indicator), which will be used to + # gather (sub)graphs from global graph in model later on + molecule_indices = tf.range(len(num_atoms)) + molecule_indicator = tf.repeat(molecule_indices, num_atoms) + + # Merge (sub)graphs into a global (disconnected) graph. Adding 'increment' to + # 'pair_indices' (and merging ragged tensors) actualizes the global graph + gather_indices = tf.repeat(molecule_indices[:-1], num_bonds[1:]) + increment = tf.cumsum(num_atoms[:-1]) + increment = tf.pad(tf.gather(increment, gather_indices), [(num_bonds[0], 0)]) + pair_indices = pair_indices.merge_dims(outer_axis=0, inner_axis=1).to_tensor() + pair_indices = pair_indices + increment[:, tf.newaxis] + atom_features = atom_features.merge_dims(outer_axis=0, inner_axis=1).to_tensor() + bond_features = bond_features.merge_dims(outer_axis=0, inner_axis=1).to_tensor() + + return (atom_features, bond_features, pair_indices, molecule_indicator), y_batch + + +def MPNNDataset(X, y, batch_size=32, shuffle=False): + dataset = tf.data.Dataset.from_tensor_slices((X, (y))) + if shuffle: + dataset = dataset.shuffle(1024) + return dataset.batch(batch_size).map(prepare_batch, -1).prefetch(-1) + + +""" +## Model + +The MPNN model can take on various shapes and forms. In this tutorial, we will implement an +MPNN based on the original paper +[Neural Message Passing for Quantum Chemistry](https://arxiv.org/abs/1704.01212) and +[DeepChem's MPNNModel](https://deepchem.readthedocs.io/en/latest/api_reference/models.html#mpnnmodel). +The MPNN of this tutorial consists of three stages: message passing, readout and +classification. + + +### Message passing + +The message passing step itself consists of two parts: + +1. The *edge network*, which passes messages from 1-hop neighbors `w_{i}` of `v` +to `v`, based on the edge features between them (`e_{vw_{i}}`), +resulting in an updated node (state) `v'`. `w_{i}` denotes the `i:th` neighbor of +`v`. + +2. The *gated recurrent unit* (GRU), which takes as input the most recent node state +and updates it based on previous node states. In +other words, the most recent node state serves as the input to the GRU, while the previous +node states are incorporated within the memory state of the GRU. This allows information +to travel from one node state (e.g., `v`) to another (e.g., `v''`). + +Importantly, step (1) and (2) are repeated for `k steps`, and where at each step `1...k`, +the radius (or number of hops) of aggregated information from `v` increases by 1. 
+""" + + +class EdgeNetwork(layers.Layer): + def build(self, input_shape): + self.atom_dim = input_shape[0][-1] + self.bond_dim = input_shape[1][-1] + self.kernel = self.add_weight( + shape=(self.bond_dim, self.atom_dim * self.atom_dim), + initializer="glorot_uniform", + name="kernel", + ) + self.bias = self.add_weight( + shape=(self.atom_dim * self.atom_dim), + initializer="zeros", + name="bias", + ) + self.built = True + + def call(self, inputs): + atom_features, bond_features, pair_indices = inputs + + # Apply linear transformation to bond features + bond_features = tf.matmul(bond_features, self.kernel) + self.bias + + # Reshape for neighborhood aggregation later + bond_features = tf.reshape(bond_features, (-1, self.atom_dim, self.atom_dim)) + + # Obtain atom features of neighbors + atom_features_neighbors = tf.gather(atom_features, pair_indices[:, 1]) + atom_features_neighbors = tf.expand_dims(atom_features_neighbors, axis=-1) + + # Apply neighborhood aggregation + transformed_features = tf.matmul(bond_features, atom_features_neighbors) + transformed_features = tf.squeeze(transformed_features, axis=-1) + aggregated_features = tf.math.unsorted_segment_sum( + transformed_features, + pair_indices[:, 0], + num_segments=tf.shape(atom_features)[0], + ) + return aggregated_features + + +class MessagePassing(layers.Layer): + def __init__(self, units, steps=4, **kwargs): + super().__init__(**kwargs) + self.units = units + self.steps = steps + + def build(self, input_shape): + self.atom_dim = input_shape[0][-1] + self.message_step = EdgeNetwork() + self.pad_length = max(0, self.units - self.atom_dim) + self.update_step = layers.GRUCell(self.atom_dim + self.pad_length) + self.built = True + + def call(self, inputs): + atom_features, bond_features, pair_indices = inputs + + # Pad atom features if number of desired units exceeds atom_features dim. + # Alternatively, a dense layer could be used here. + atom_features_updated = tf.pad(atom_features, [(0, 0), (0, self.pad_length)]) + + # Perform a number of steps of message passing + for i in range(self.steps): + # Aggregate information from neighbors + atom_features_aggregated = self.message_step( + [atom_features_updated, bond_features, pair_indices] + ) + + # Update node state via a step of GRU + atom_features_updated, _ = self.update_step( + atom_features_aggregated, atom_features_updated + ) + return atom_features_updated + + +""" +### Readout + +When the message passing procedure ends, the k-step-aggregated node states are to be partitioned +into subgraphs (corresponding to each molecule in the batch) and subsequently +reduced to graph-level embeddings. In the +[original paper](https://arxiv.org/abs/1704.01212), a +[set-to-set layer](https://arxiv.org/abs/1511.06391) was used for this purpose. +In this tutorial however, a transformer encoder + average pooling will be used. Specifically: + +* the k-step-aggregated node states will be partitioned into the subgraphs +(corresponding to each molecule in the batch); +* each subgraph will then be padded to match the subgraph with the greatest number of nodes, followed +by a `tf.stack(...)`; +* the (stacked padded) tensor, encoding subgraphs (each subgraph containing a set of node states), are +masked to make sure the paddings don't interfere with training; +* finally, the tensor is passed to the transformer followed by average pooling. 
+""" + + +class PartitionPadding(layers.Layer): + def __init__(self, batch_size, **kwargs): + super().__init__(**kwargs) + self.batch_size = batch_size + + def call(self, inputs): + atom_features, molecule_indicator = inputs + + # Obtain subgraphs + atom_features_partitioned = tf.dynamic_partition( + atom_features, molecule_indicator, self.batch_size + ) + + # Pad and stack subgraphs + num_atoms = [tf.shape(f)[0] for f in atom_features_partitioned] + max_num_atoms = tf.reduce_max(num_atoms) + atom_features_stacked = tf.stack( + [ + tf.pad(f, [(0, max_num_atoms - n), (0, 0)]) + for f, n in zip(atom_features_partitioned, num_atoms) + ], + axis=0, + ) + + # Remove empty subgraphs (usually for last batch in dataset) + gather_indices = tf.where(tf.reduce_sum(atom_features_stacked, (1, 2)) != 0) + gather_indices = tf.squeeze(gather_indices, axis=-1) + return tf.gather(atom_features_stacked, gather_indices, axis=0) + + +class TransformerEncoderReadout(layers.Layer): + def __init__( + self, num_heads=8, embed_dim=64, dense_dim=512, batch_size=32, **kwargs + ): + super().__init__(**kwargs) + + self.partition_padding = PartitionPadding(batch_size) + self.attention = layers.MultiHeadAttention(num_heads, embed_dim) + self.dense_proj = keras.Sequential( + [ + layers.Dense(dense_dim, activation="relu"), + layers.Dense(embed_dim), + ] + ) + self.layernorm_1 = layers.LayerNormalization() + self.layernorm_2 = layers.LayerNormalization() + self.average_pooling = layers.GlobalAveragePooling1D() + + def call(self, inputs): + x = self.partition_padding(inputs) + padding_mask = tf.reduce_any(tf.not_equal(x, 0.0), axis=-1) + padding_mask = padding_mask[:, tf.newaxis, tf.newaxis, :] + attention_output = self.attention(x, x, attention_mask=padding_mask) + proj_input = self.layernorm_1(x + attention_output) + proj_output = self.layernorm_2(proj_input + self.dense_proj(proj_input)) + return self.average_pooling(proj_output) + + +""" +### Message Passing Neural Network (MPNN) + +It is now time to complete the MPNN model. In addition to the message passing +and readout, a two-layer classification network will be implemented to make +predictions of BBBP. 
+""" + + +def MPNNModel( + atom_dim, + bond_dim, + batch_size=32, + message_units=64, + message_steps=4, + num_attention_heads=8, + dense_units=512, +): + atom_features = layers.Input((atom_dim), dtype="float32", name="atom_features") + bond_features = layers.Input((bond_dim), dtype="float32", name="bond_features") + pair_indices = layers.Input((2), dtype="int32", name="pair_indices") + molecule_indicator = layers.Input((), dtype="int32", name="molecule_indicator") + + x = MessagePassing(message_units, message_steps)( + [atom_features, bond_features, pair_indices] + ) + + x = TransformerEncoderReadout( + num_attention_heads, message_units, dense_units, batch_size + )([x, molecule_indicator]) + + x = layers.Dense(dense_units, activation="relu")(x) + x = layers.Dense(1, activation="sigmoid")(x) + + model = keras.Model( + inputs=[atom_features, bond_features, pair_indices, molecule_indicator], + outputs=[x], + ) + return model + + +mpnn = MPNNModel( + atom_dim=x_train[0][0][0].shape[0], + bond_dim=x_train[1][0][0].shape[0], +) + +mpnn.compile( + loss=keras.losses.BinaryCrossentropy(), + optimizer=keras.optimizers.Adam(learning_rate=5e-4), + metrics=[keras.metrics.AUC(name="AUC")], +) + +keras.utils.plot_model(mpnn, show_dtype=True, show_shapes=True) + +""" +### Training +""" + +train_dataset = MPNNDataset(x_train, y_train) +valid_dataset = MPNNDataset(x_valid, y_valid) +test_dataset = MPNNDataset(x_test, y_test) + +history = mpnn.fit( + train_dataset, + validation_data=valid_dataset, + epochs=40, + verbose=2, + class_weight={0: 2.0, 1: 0.5}, +) + +plt.figure(figsize=(10, 6)) +plt.plot(history.history["AUC"], label="train AUC") +plt.plot(history.history["val_AUC"], label="valid AUC") +plt.xlabel("Epochs", fontsize=16) +plt.ylabel("AUC", fontsize=16) +plt.legend(fontsize=16) + +""" +### Predicting +""" + +molecules = [molecule_from_smiles(df.smiles.values[index]) for index in test_index] +y_true = [df.p_np.values[index] for index in test_index] +y_pred = tf.squeeze(mpnn.predict(test_dataset), axis=1) + +legends = [f"y_true/y_pred = {y_true[i]}/{y_pred[i]:.2f}" for i in range(len(y_true))] +MolsToGridImage(molecules, molsPerRow=4, legends=legends) + +""" +## Conclusions + +In this tutorial, we demonstrated a message passing neural network (MPNN) to +predict blood-brain barrier permeability (BBBP) for a number of different molecules. We +first had to construct graphs from SMILES, then build a Keras model that could +operate on these graphs, and finally train the model to make the predictions. 
+ +Example available on HuggingFace + +| Trained Model | Demo | +| :--: | :--: | +| [![Generic badge](https://img.shields.io/badge/%F0%9F%A4%97%20Model-mpnn%20molecular%20graphs-black.svg)](https://huggingface.co/keras-io/MPNN-for-molecular-property-prediction) | [![Generic badge](https://img.shields.io/badge/%F0%9F%A4%97%20Spaces-mpnn%20molecular%20graphs-black.svg)](https://huggingface.co/spaces/keras-io/molecular-property-prediction) | +""" diff --git a/knowledge_base/graph/node2vec_movielens.py b/knowledge_base/graph/node2vec_movielens.py new file mode 100644 index 0000000000000000000000000000000000000000..0e9972f92ae9769f2e0f9aec6d3f9b8cfd2384e5 --- /dev/null +++ b/knowledge_base/graph/node2vec_movielens.py @@ -0,0 +1,599 @@ +""" +Title: Graph representation learning with node2vec +Author: [Khalid Salama](https://www.linkedin.com/in/khalid-salama-24403144/) +Date created: 2021/05/15 +Last modified: 2021/05/15 +Description: Implementing the node2vec model to generate embeddings for movies from the MovieLens dataset. +Accelerator: GPU +""" + +""" +## Introduction + +Learning useful representations from objects structured as graphs is useful for +a variety of machine learning (ML) applicationsโ€”such as social and communication networks analysis, +biomedicine studies, and recommendation systems. +[Graph representation Learning](https://www.cs.mcgill.ca/~wlh/grl_book/) aims to +learn embeddings for the graph nodes, which can be used for a variety of ML tasks +such as node label prediction (e.g. categorizing an article based on its citations) +and link prediction (e.g. recommending an interest group to a user in a social network). + +[node2vec](https://arxiv.org/abs/1607.00653) is a simple, yet scalable and effective +technique for learning low-dimensional embeddings for nodes in a graph by optimizing +a neighborhood-preserving objective. The aim is to learn similar embeddings for +neighboring nodes, with respect to the graph structure. + +Given your data items structured as a graph (where the items are represented as +nodes and the relationship between items are represented as edges), +node2vec works as follows: + +1. Generate item sequences using (biased) random walk. +2. Create positive and negative training examples from these sequences. +3. Train a [word2vec](https://www.tensorflow.org/tutorials/text/word2vec) model +(skip-gram) to learn embeddings for the items. + +In this example, we demonstrate the node2vec technique on the +[small version of the Movielens dataset](https://files.grouplens.org/datasets/movielens/ml-latest-small-README.html) +to learn movie embeddings. Such a dataset can be represented as a graph by treating +the movies as nodes, and creating edges between movies that have similar ratings +by the users. The learnt movie embeddings can be used for tasks such as movie recommendation, +or movie genres prediction. 
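+
+To make these three steps concrete, here is a tiny, self-contained illustration
+(the movie IDs below are hypothetical and not taken from the dataset):
+
+```python
+# Step 1 produces walks such as this one; step 2 turns each walk into
+# (target, context) pairs using a sliding window.
+toy_walk = ["movie_1", "movie_7", "movie_3", "movie_9"]
+window_size = 2
+positive_pairs = [
+    (toy_walk[i], toy_walk[j])
+    for i in range(len(toy_walk))
+    for j in range(max(0, i - window_size), min(len(toy_walk), i + window_size + 1))
+    if i != j
+]
+print(positive_pairs[:4])
+# Negative pairs combine each target with randomly sampled movies; step 3 then
+# trains a skip-gram model to tell the two kinds of pairs apart.
+```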
+ +This example requires `networkx` package, which can be installed using the following command: + +```shell +pip install networkx +``` +""" + +""" +## Setup +""" + +import os +from collections import defaultdict +import math +import networkx as nx +import random +from tqdm import tqdm +from zipfile import ZipFile +from urllib.request import urlretrieve +import numpy as np +import pandas as pd +import tensorflow as tf +from tensorflow import keras +from tensorflow.keras import layers +import matplotlib.pyplot as plt + +""" +## Download the MovieLens dataset and prepare the data + +The small version of the MovieLens dataset includes around 100k ratings +from 610 users on 9,742 movies. + +First, let's download the dataset. The downloaded folder will contain +three data files: `users.csv`, `movies.csv`, and `ratings.csv`. In this example, +we will only need the `movies.dat`, and `ratings.dat` data files. +""" + +urlretrieve( + "http://files.grouplens.org/datasets/movielens/ml-latest-small.zip", "movielens.zip" +) +ZipFile("movielens.zip", "r").extractall() + +""" +Then, we load the data into a Pandas DataFrame and perform some basic preprocessing. +""" + +# Load movies to a DataFrame. +movies = pd.read_csv("ml-latest-small/movies.csv") +# Create a `movieId` string. +movies["movieId"] = movies["movieId"].apply(lambda x: f"movie_{x}") + +# Load ratings to a DataFrame. +ratings = pd.read_csv("ml-latest-small/ratings.csv") +# Convert the `ratings` to floating point +ratings["rating"] = ratings["rating"].apply(lambda x: float(x)) +# Create the `movie_id` string. +ratings["movieId"] = ratings["movieId"].apply(lambda x: f"movie_{x}") + +print("Movies data shape:", movies.shape) +print("Ratings data shape:", ratings.shape) + +""" +Let's inspect a sample instance of the `ratings` DataFrame. +""" + +ratings.head() + +""" +Next, let's check a sample instance of the `movies` DataFrame. +""" + +movies.head() + +""" +Implement two utility functions for the `movies` DataFrame. +""" + + +def get_movie_title_by_id(movieId): + return list(movies[movies.movieId == movieId].title)[0] + + +def get_movie_id_by_title(title): + return list(movies[movies.title == title].movieId)[0] + + +""" +## Construct the Movies graph + +We create an edge between two movie nodes in the graph if both movies are rated +by the same user >= `min_rating`. The weight of the edge will be based on the +[pointwise mutual information](https://en.wikipedia.org/wiki/Pointwise_mutual_information) +between the two movies, which is computed as: `log(xy) - log(x) - log(y) + log(D)`, where: + +* `xy` is how many users rated both movie `x` and movie `y` with >= `min_rating`. +* `x` is how many users rated movie `x` >= `min_rating`. +* `y` is how many users rated movie `y` >= `min_rating`. +* `D` total number of movie ratings >= `min_rating`. +""" + +""" +### Step 1: create the weighted edges between movies. +""" + +min_rating = 5 +pair_frequency = defaultdict(int) +item_frequency = defaultdict(int) + +# Filter instances where rating is greater than or equal to min_rating. +rated_movies = ratings[ratings.rating >= min_rating] +# Group instances by user. +movies_grouped_by_users = list(rated_movies.groupby("userId")) +for group in tqdm( + movies_grouped_by_users, + position=0, + leave=True, + desc="Compute movie rating frequencies", +): + # Get a list of movies rated by the user. 
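+    # (`group` is a `(userId, DataFrame)` tuple produced by `groupby`;
+    # index 1 holds that user's rating rows.)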
+ current_movies = list(group[1]["movieId"]) + + for i in range(len(current_movies)): + item_frequency[current_movies[i]] += 1 + for j in range(i + 1, len(current_movies)): + x = min(current_movies[i], current_movies[j]) + y = max(current_movies[i], current_movies[j]) + pair_frequency[(x, y)] += 1 + +""" +### Step 2: create the graph with the nodes and the edges + +To reduce the number of edges between nodes, we only add an edge between movies +if the weight of the edge is greater than `min_weight`. +""" + +min_weight = 10 +D = math.log(sum(item_frequency.values())) + +# Create the movies undirected graph. +movies_graph = nx.Graph() +# Add weighted edges between movies. +# This automatically adds the movie nodes to the graph. +for pair in tqdm( + pair_frequency, position=0, leave=True, desc="Creating the movie graph" +): + x, y = pair + xy_frequency = pair_frequency[pair] + x_frequency = item_frequency[x] + y_frequency = item_frequency[y] + pmi = math.log(xy_frequency) - math.log(x_frequency) - math.log(y_frequency) + D + weight = pmi * xy_frequency + # Only include edges with weight >= min_weight. + if weight >= min_weight: + movies_graph.add_edge(x, y, weight=weight) + +""" +Let's display the total number of nodes and edges in the graph. +Note that the number of nodes is less than the total number of movies, +since only the movies that have edges to other movies are added. +""" + +print("Total number of graph nodes:", movies_graph.number_of_nodes()) +print("Total number of graph edges:", movies_graph.number_of_edges()) + +""" +Let's display the average node degree (number of neighbours) in the graph. +""" + +degrees = [] +for node in movies_graph.nodes: + degrees.append(movies_graph.degree[node]) + +print("Average node degree:", round(sum(degrees) / len(degrees), 2)) + +""" +### Step 3: Create vocabulary and a mapping from tokens to integer indices + +The vocabulary is the nodes (movie IDs) in the graph. +""" + +vocabulary = ["NA"] + list(movies_graph.nodes) +vocabulary_lookup = {token: idx for idx, token in enumerate(vocabulary)} + +""" +## Implement the biased random walk + +A random walk starts from a given node, and randomly picks a neighbour node to move to. +If the edges are weighted, the neighbour is selected *probabilistically* with +respect to weights of the edges between the current node and its neighbours. +This procedure is repeated for `num_steps` to generate a sequence of *related* nodes. + +The [*biased* random walk](https://en.wikipedia.org/wiki/Biased_random_walk_on_a_graph) balances between **breadth-first sampling** +(where only local neighbours are visited) and **depth-first sampling** +(where distant neighbours are visited) by introducing the following two parameters: + +1. **Return parameter** (`p`): Controls the likelihood of immediately revisiting +a node in the walk. Setting it to a high value encourages moderate exploration, +while setting it to a low value would keep the walk local. +2. **In-out parameter** (`q`): Allows the search to differentiate +between *inward* and *outward* nodes. Setting it to a high value biases the +random walk towards local nodes, while setting it to a low value biases the walk +to visit nodes which are further away. + +""" + + +def next_step(graph, previous, current, p, q): + neighbors = list(graph.neighbors(current)) + + weights = [] + # Adjust the weights of the edges to the neighbors with respect to p and q. + for neighbor in neighbors: + if neighbor == previous: + # Control the probability to return to the previous node. 
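+            # (Dividing by `p` means a larger `p` makes an immediate return
+            # less likely, pushing the walk to explore further away.)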
+ weights.append(graph[current][neighbor]["weight"] / p) + elif graph.has_edge(neighbor, previous): + # The probability of visiting a local node. + weights.append(graph[current][neighbor]["weight"]) + else: + # Control the probability to move forward. + weights.append(graph[current][neighbor]["weight"] / q) + + # Compute the probabilities of visiting each neighbor. + weight_sum = sum(weights) + probabilities = [weight / weight_sum for weight in weights] + # Probabilistically select a neighbor to visit. + next = np.random.choice(neighbors, size=1, p=probabilities)[0] + return next + + +def random_walk(graph, num_walks, num_steps, p, q): + walks = [] + nodes = list(graph.nodes()) + # Perform multiple iterations of the random walk. + for walk_iteration in range(num_walks): + random.shuffle(nodes) + + for node in tqdm( + nodes, + position=0, + leave=True, + desc=f"Random walks iteration {walk_iteration + 1} of {num_walks}", + ): + # Start the walk with a random node from the graph. + walk = [node] + # Randomly walk for num_steps. + while len(walk) < num_steps: + current = walk[-1] + previous = walk[-2] if len(walk) > 1 else None + # Compute the next node to visit. + next = next_step(graph, previous, current, p, q) + walk.append(next) + # Replace node ids (movie ids) in the walk with token ids. + walk = [vocabulary_lookup[token] for token in walk] + # Add the walk to the generated sequence. + walks.append(walk) + + return walks + + +""" +## Generate training data using the biased random walk + +You can explore different configurations of `p` and `q` to different results of +related movies. +""" +# Random walk return parameter. +p = 1 +# Random walk in-out parameter. +q = 1 +# Number of iterations of random walks. +num_walks = 5 +# Number of steps of each random walk. +num_steps = 10 +walks = random_walk(movies_graph, num_walks, num_steps, p, q) + +print("Number of walks generated:", len(walks)) + +""" +## Generate positive and negative examples + +To train a skip-gram model, we use the generated walks to create positive and +negative training examples. Each example includes the following features: + +1. `target`: A movie in a walk sequence. +2. `context`: Another movie in a walk sequence. +3. `weight`: How many times these two movies occurred in walk sequences. +4. `label`: The label is 1 if these two movies are samples from the walk sequences, +otherwise (i.e., if randomly sampled) the label is 0. +""" + +""" +### Generate examples +""" + + +def generate_examples(sequences, window_size, num_negative_samples, vocabulary_size): + example_weights = defaultdict(int) + # Iterate over all sequences (walks). + for sequence in tqdm( + sequences, + position=0, + leave=True, + desc=f"Generating positive and negative examples", + ): + # Generate positive and negative skip-gram pairs for a sequence (walk). 
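+        # (`skipgrams` returns candidate (target, context) pairs drawn from a
+        # sliding window over the walk, plus roughly `negative_samples` random
+        # pairs per positive pair, labeled 1 and 0 respectively.)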
+ pairs, labels = keras.preprocessing.sequence.skipgrams( + sequence, + vocabulary_size=vocabulary_size, + window_size=window_size, + negative_samples=num_negative_samples, + ) + for idx in range(len(pairs)): + pair = pairs[idx] + label = labels[idx] + target, context = min(pair[0], pair[1]), max(pair[0], pair[1]) + if target == context: + continue + entry = (target, context, label) + example_weights[entry] += 1 + + targets, contexts, labels, weights = [], [], [], [] + for entry in example_weights: + weight = example_weights[entry] + target, context, label = entry + targets.append(target) + contexts.append(context) + labels.append(label) + weights.append(weight) + + return np.array(targets), np.array(contexts), np.array(labels), np.array(weights) + + +num_negative_samples = 4 +targets, contexts, labels, weights = generate_examples( + sequences=walks, + window_size=num_steps, + num_negative_samples=num_negative_samples, + vocabulary_size=len(vocabulary), +) + +""" +Let's display the shapes of the outputs +""" + +print(f"Targets shape: {targets.shape}") +print(f"Contexts shape: {contexts.shape}") +print(f"Labels shape: {labels.shape}") +print(f"Weights shape: {weights.shape}") + +""" +### Convert the data into `tf.data.Dataset` objects +""" + +batch_size = 1024 + + +def create_dataset(targets, contexts, labels, weights, batch_size): + inputs = { + "target": targets, + "context": contexts, + } + dataset = tf.data.Dataset.from_tensor_slices((inputs, labels, weights)) + dataset = dataset.shuffle(buffer_size=batch_size * 2) + dataset = dataset.batch(batch_size, drop_remainder=True) + dataset = dataset.prefetch(tf.data.AUTOTUNE) + return dataset + + +dataset = create_dataset( + targets=targets, + contexts=contexts, + labels=labels, + weights=weights, + batch_size=batch_size, +) + +""" +## Train the skip-gram model + +Our skip-gram is a simple binary classification model that works as follows: + +1. An embedding is looked up for the `target` movie. +2. An embedding is looked up for the `context` movie. +3. The dot product is computed between these two embeddings. +4. The result (after a sigmoid activation) is compared to the label. +5. A binary crossentropy loss is used. +""" + +learning_rate = 0.001 +embedding_dim = 50 +num_epochs = 10 + +""" +### Implement the model +""" + + +def create_model(vocabulary_size, embedding_dim): + inputs = { + "target": layers.Input(name="target", shape=(), dtype="int32"), + "context": layers.Input(name="context", shape=(), dtype="int32"), + } + # Initialize item embeddings. + embed_item = layers.Embedding( + input_dim=vocabulary_size, + output_dim=embedding_dim, + embeddings_initializer="he_normal", + embeddings_regularizer=keras.regularizers.l2(1e-6), + name="item_embeddings", + ) + # Lookup embeddings for target. + target_embeddings = embed_item(inputs["target"]) + # Lookup embeddings for context. + context_embeddings = embed_item(inputs["context"]) + # Compute dot similarity between target and context embeddings. + logits = layers.Dot(axes=1, normalize=False, name="dot_similarity")( + [target_embeddings, context_embeddings] + ) + # Create the model. + model = keras.Model(inputs=inputs, outputs=logits) + return model + + +""" +### Train the model +""" + +""" +We instantiate the model and compile it. +""" + +model = create_model(len(vocabulary), embedding_dim) +model.compile( + optimizer=keras.optimizers.Adam(learning_rate), + loss=keras.losses.BinaryCrossentropy(from_logits=True), +) + +""" +Let's plot the model. 
+""" + +keras.utils.plot_model( + model, + show_shapes=True, + show_dtype=True, + show_layer_names=True, +) + +""" +Now we train the model on the `dataset`. +""" + +history = model.fit(dataset, epochs=num_epochs) + +""" +Finally we plot the learning history. +""" + +plt.plot(history.history["loss"]) +plt.ylabel("loss") +plt.xlabel("epoch") +plt.show() + +""" +## Analyze the learnt embeddings. +""" + +movie_embeddings = model.get_layer("item_embeddings").get_weights()[0] +print("Embeddings shape:", movie_embeddings.shape) + +""" +### Find related movies + +Define a list with some movies called `query_movies`. +""" + +query_movies = [ + "Matrix, The (1999)", + "Star Wars: Episode IV - A New Hope (1977)", + "Lion King, The (1994)", + "Terminator 2: Judgment Day (1991)", + "Godfather, The (1972)", +] + +""" +Get the embeddings of the movies in `query_movies`. +""" + +query_embeddings = [] + +for movie_title in query_movies: + movieId = get_movie_id_by_title(movie_title) + token_id = vocabulary_lookup[movieId] + movie_embedding = movie_embeddings[token_id] + query_embeddings.append(movie_embedding) + +query_embeddings = np.array(query_embeddings) + +""" +Compute the [consine similarity](https://en.wikipedia.org/wiki/Cosine_similarity) between the embeddings of `query_movies` +and all the other movies, then pick the top k for each. +""" + +similarities = tf.linalg.matmul( + tf.math.l2_normalize(query_embeddings), + tf.math.l2_normalize(movie_embeddings), + transpose_b=True, +) + +_, indices = tf.math.top_k(similarities, k=5) +indices = indices.numpy().tolist() + +""" +Display the top related movies in `query_movies`. +""" + +for idx, title in enumerate(query_movies): + print(title) + print("".rjust(len(title), "-")) + similar_tokens = indices[idx] + for token in similar_tokens: + similar_movieId = vocabulary[token] + similar_title = get_movie_title_by_id(similar_movieId) + print(f"- {similar_title}") + print() + +""" +### Visualize the embeddings using the Embedding Projector +""" + +import io + +out_v = io.open("embeddings.tsv", "w", encoding="utf-8") +out_m = io.open("metadata.tsv", "w", encoding="utf-8") + +for idx, movie_id in enumerate(vocabulary[1:]): + movie_title = list(movies[movies.movieId == movie_id].title)[0] + vector = movie_embeddings[idx] + out_v.write("\t".join([str(x) for x in vector]) + "\n") + out_m.write(movie_title + "\n") + +out_v.close() +out_m.close() + +""" +Download the `embeddings.tsv` and `metadata.tsv` to analyze the obtained embeddings +in the [Embedding Projector](https://projector.tensorflow.org/). 
+""" + +""" + +**Example available on HuggingFace** + +| Trained Model | Demo | +| :--: | :--: | +| [![Generic badge](https://img.shields.io/badge/%F0%9F%A4%97%20Model%3A%20-Node2Vec%20Movielens-black.svg)](https://huggingface.co/keras-io/Node2Vec_MovieLens) | [![Generic badge](https://img.shields.io/badge/%F0%9F%A4%97%20Spaces%3A-Node2Vec%20Movielens-black.svg)](https://huggingface.co/spaces/keras-io/Node2Vec_MovieLens) | +""" diff --git a/knowledge_base/nlp/abstractive_summarization_with_bart.py b/knowledge_base/nlp/abstractive_summarization_with_bart.py new file mode 100644 index 0000000000000000000000000000000000000000..b1c2d495a62acaa7b6cd0ee0510ff007bd89a64f --- /dev/null +++ b/knowledge_base/nlp/abstractive_summarization_with_bart.py @@ -0,0 +1,240 @@ +""" +Title: Abstractive Text Summarization with BART +Author: [Abheesht Sharma](https://github.com/abheesht17/) +Date created: 2023/07/08 +Last modified: 2024/03/20 +Description: Use KerasHub to fine-tune BART on the abstractive summarization task. +Accelerator: GPU +Converted to Keras 3 by: [Sitam Meur](https://github.com/sitamgithub-MSIT) +""" + +""" +## Introduction + +In the era of information overload, it has become crucial to extract the crux +of a long document or a conversation and express it in a few sentences. Owing +to the fact that summarization has widespread applications in different domains, +it has become a key, well-studied NLP task in recent years. + +[Bidirectional Autoregressive Transformer (BART)](https://arxiv.org/abs/1910.13461) +is a Transformer-based encoder-decoder model, often used for +sequence-to-sequence tasks like summarization and neural machine translation. +BART is pre-trained in a self-supervised fashion on a large text corpus. During +pre-training, the text is corrupted and BART is trained to reconstruct the +original text (hence called a "denoising autoencoder"). Some pre-training tasks +include token masking, token deletion, sentence permutation (shuffle sentences +and train BART to fix the order), etc. + +In this example, we will demonstrate how to fine-tune BART on the abstractive +summarization task (on conversations!) using KerasHub, and generate summaries +using the fine-tuned model. +""" + +""" +## Setup + +Before we start implementing the pipeline, let's install and import all the +libraries we need. We'll be using the KerasHub library. We will also need a +couple of utility libraries. +""" + +"""shell +pip install git+https://github.com/keras-team/keras-hub.git py7zr -q +""" + +""" +This examples uses [Keras 3](https://keras.io/keras_3/) to work in any of +`"tensorflow"`, `"jax"` or `"torch"`. Support for Keras 3 is baked into +KerasHub, simply change the `"KERAS_BACKEND"` environment variable to select +the backend of your choice. We select the JAX backend below. +""" + +import os + +os.environ["KERAS_BACKEND"] = "jax" + +""" +Import all necessary libraries. +""" + +import py7zr +import time + +import keras_hub +import keras +import tensorflow as tf +import tensorflow_datasets as tfds + +""" +Let's also define our hyperparameters. +""" + +BATCH_SIZE = 8 +NUM_BATCHES = 600 +EPOCHS = 1 # Can be set to a higher value for better results +MAX_ENCODER_SEQUENCE_LENGTH = 512 +MAX_DECODER_SEQUENCE_LENGTH = 128 +MAX_GENERATION_LENGTH = 40 + +""" +## Dataset + +Let's load the [SAMSum dataset](https://arxiv.org/abs/1911.12237). This dataset +contains around 15,000 pairs of conversations/dialogues and summaries. +""" + +# Download the dataset. 
+filename = keras.utils.get_file( + "corpus.7z", + origin="https://huggingface.co/datasets/samsum/resolve/main/data/corpus.7z", +) + +# Extract the `.7z` file. +with py7zr.SevenZipFile(filename, mode="r") as z: + z.extractall(path="/root/tensorflow_datasets/downloads/manual") + +# Load data using TFDS. +samsum_ds = tfds.load("samsum", split="train", as_supervised=True) + +""" +The dataset has two fields: `dialogue` and `summary`. Let's see a sample. +""" +for dialogue, summary in samsum_ds: + print(dialogue.numpy()) + print(summary.numpy()) + break + +""" +We'll now batch the dataset and retain only a subset of the dataset for the +purpose of this example. The dialogue is fed to the encoder, and the +corresponding summary serves as input to the decoder. We will, therefore, change +the format of the dataset to a dictionary having two keys: `"encoder_text"` and +`"decoder_text"`.This is how `keras_hub.models.BartSeq2SeqLMPreprocessor` +expects the input format to be. +""" + +train_ds = ( + samsum_ds.map( + lambda dialogue, summary: {"encoder_text": dialogue, "decoder_text": summary} + ) + .batch(BATCH_SIZE) + .cache() +) +train_ds = train_ds.take(NUM_BATCHES) + +""" +## Fine-tune BART + +Let's load the model and preprocessor first. We use sequence lengths of 512 +and 128 for the encoder and decoder, respectively, instead of 1024 (which is the +default sequence length). This will allow us to run this example quickly +on Colab. + +If you observe carefully, the preprocessor is attached to the model. What this +means is that we don't have to worry about preprocessing the text inputs; +everything will be done internally. The preprocessor tokenizes the encoder text +and the decoder text, adds special tokens and pads them. To generate labels +for auto-regressive training, the preprocessor shifts the decoder text one +position to the right. This is done because at every timestep, the model is +trained to predict the next token. +""" + +preprocessor = keras_hub.models.BartSeq2SeqLMPreprocessor.from_preset( + "bart_base_en", + encoder_sequence_length=MAX_ENCODER_SEQUENCE_LENGTH, + decoder_sequence_length=MAX_DECODER_SEQUENCE_LENGTH, +) +bart_lm = keras_hub.models.BartSeq2SeqLM.from_preset( + "bart_base_en", preprocessor=preprocessor +) + +bart_lm.summary() + +""" +Define the optimizer and loss. We use the Adam optimizer with a linearly +decaying learning rate. Compile the model. +""" + +optimizer = keras.optimizers.AdamW( + learning_rate=5e-5, + weight_decay=0.01, + epsilon=1e-6, + global_clipnorm=1.0, # Gradient clipping. +) +# Exclude layernorm and bias terms from weight decay. +optimizer.exclude_from_weight_decay(var_names=["bias"]) +optimizer.exclude_from_weight_decay(var_names=["gamma"]) +optimizer.exclude_from_weight_decay(var_names=["beta"]) + +loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True) + +bart_lm.compile( + optimizer=optimizer, + loss=loss, + weighted_metrics=["accuracy"], +) + +""" +Let's train the model! +""" + +bart_lm.fit(train_ds, epochs=EPOCHS) + +""" +## Generate summaries and evaluate them! + +Now that the model has been trained, let's get to the fun part - actually +generating summaries! Let's pick the first 100 samples from the validation set +and generate summaries for them. We will use the default decoding strategy, i.e., +greedy search. + +Generation in KerasHub is highly optimized. It is backed by the power of XLA. 
+Secondly, key/value tensors in the self-attention layer and cross-attention layer +in the decoder are cached to avoid recomputation at every timestep. +""" + + +def generate_text(model, input_text, max_length=200, print_time_taken=False): + start = time.time() + output = model.generate(input_text, max_length=max_length) + end = time.time() + print(f"Total Time Elapsed: {end - start:.2f}s") + return output + + +# Load the dataset. +val_ds = tfds.load("samsum", split="validation", as_supervised=True) +val_ds = val_ds.take(100) + +dialogues = [] +ground_truth_summaries = [] +for dialogue, summary in val_ds: + dialogues.append(dialogue.numpy()) + ground_truth_summaries.append(summary.numpy()) + +# Let's make a dummy call - the first call to XLA generally takes a bit longer. +_ = generate_text(bart_lm, "sample text", max_length=MAX_GENERATION_LENGTH) + +# Generate summaries. +generated_summaries = generate_text( + bart_lm, + val_ds.map(lambda dialogue, _: dialogue).batch(8), + max_length=MAX_GENERATION_LENGTH, + print_time_taken=True, +) + +""" +Let's see some of the summaries. +""" +for dialogue, generated_summary, ground_truth_summary in zip( + dialogues[:5], generated_summaries[:5], ground_truth_summaries[:5] +): + print("Dialogue:", dialogue) + print("Generated Summary:", generated_summary) + print("Ground Truth Summary:", ground_truth_summary) + print("=============================") + +""" +The generated summaries look awesome! Not bad for a model trained only for 1 +epoch and on 5000 examples :) +""" diff --git a/knowledge_base/nlp/active_learning_review_classification.py b/knowledge_base/nlp/active_learning_review_classification.py new file mode 100644 index 0000000000000000000000000000000000000000..d6ec81780ec02c24e31cd92ffca4437f984aaf4a --- /dev/null +++ b/knowledge_base/nlp/active_learning_review_classification.py @@ -0,0 +1,524 @@ +""" +Title: Review Classification using Active Learning +Author: [Darshan Deshpande](https://twitter.com/getdarshan) +Date created: 2021/10/29 +Last modified: 2024/05/08 +Description: Demonstrating the advantages of active learning through review classification. +Accelerator: GPU +Converted to Keras 3 by: [Sachin Prasad](https://github.com/sachinprasadhs) +""" + +""" +## Introduction + +With the growth of data-centric Machine Learning, Active Learning has grown in popularity +amongst businesses and researchers. Active Learning seeks to progressively +train ML models so that the resultant model requires lesser amount of training data to +achieve competitive scores. + +The structure of an Active Learning pipeline involves a classifier and an oracle. The +oracle is an annotator that cleans, selects, labels the data, and feeds it to the model +when required. The oracle is a trained individual or a group of individuals that +ensure consistency in labeling of new data. + +The process starts with annotating a small subset of the full dataset and training an +initial model. The best model checkpoint is saved and then tested onย a balanced test +set. The test set must be carefully sampled because the full training process will be +dependent on it. Once we have the initial evaluation scores, the oracle is tasked with +labeling more samples; the number of data points to be sampled is usually determined by +the business requirements. After that, the newly sampled data is added to the training +set, and the training procedure repeats. This cycle continues until either an +acceptable score is reached or some other business metric is met. 
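+
+In code, the cycle described above boils down to a loop of the following shape.
+This is only a schematic sketch — the helper names are placeholders, not the
+functions implemented later in this example:
+
+```python
+def active_learning_loop(
+    train, evaluate, select, annotate, labeled, pool, target_score
+):
+    """Schematic train -> evaluate -> sample -> annotate cycle."""
+    while True:
+        model = train(labeled)
+        score = evaluate(model)
+        if score >= target_score or not pool:
+            return model
+        queried, pool = select(model, pool)  # e.g. ratio / least-confidence sampling
+        labeled = labeled + annotate(queried)  # the oracle labels the queried points
+```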
+ +This tutorial provides a basic demonstration of how Active Learning works by +demonstrating a ratio-based (least confidence) sampling strategy that results in lower +overall false positive and negative rates when compared to a model trained on the entire +dataset. This sampling falls under the domain of *uncertainty sampling*, in which new +datasets are sampled based on the uncertainty that the model outputs for the +corresponding label. In our example, we compare our model's false positive and false +negative rates and annotate the new data based on their ratio. + +Some other sampling techniques include: + +1. [Committee sampling](https://www.researchgate.net/publication/51909346_Committee-Based_Sample_Selection_for_Probabilistic_Classifiers): +Using multiple models to vote for the best data points to be sampled +2. [Entropy reduction](https://www.researchgate.net/publication/51909346_Committee-Based_Sample_Selection_for_Probabilistic_Classifiers): +Sampling according to an entropy threshold, selecting more of the samples that produce the highest entropy score. +3. [Minimum margin based sampling](https://arxiv.org/abs/1906.00025v1): +Selects data points closest to the decision boundary +""" + +""" +## Importing required libraries +""" + +import os + +os.environ["KERAS_BACKEND"] = "tensorflow" # @param ["tensorflow", "jax", "torch"] +import keras +from keras import ops +from keras import layers +import tensorflow_datasets as tfds +import tensorflow as tf +import matplotlib.pyplot as plt +import re +import string + +tfds.disable_progress_bar() + +""" +## Loading and preprocessing the data + +We will be using the IMDB reviews dataset for our experiments. This dataset has 50,000 +reviews in total, including training and testing splits. We will merge these splits and +sample our own, balanced training, validation and testing sets. +""" + +dataset = tfds.load( + "imdb_reviews", + split="train + test", + as_supervised=True, + batch_size=-1, + shuffle_files=False, +) +reviews, labels = tfds.as_numpy(dataset) + +print("Total examples:", reviews.shape[0]) + +""" +Active learning starts with labeling a subset of data. +For the ratio sampling technique that we will be using, we will need well-balanced training, +validation and testing splits. +""" + +val_split = 2500 +test_split = 2500 +train_split = 7500 + +# Separating the negative and positive samples for manual stratification +x_positives, y_positives = reviews[labels == 1], labels[labels == 1] +x_negatives, y_negatives = reviews[labels == 0], labels[labels == 0] + +# Creating training, validation and testing splits +x_val, y_val = ( + tf.concat((x_positives[:val_split], x_negatives[:val_split]), 0), + tf.concat((y_positives[:val_split], y_negatives[:val_split]), 0), +) +x_test, y_test = ( + tf.concat( + ( + x_positives[val_split : val_split + test_split], + x_negatives[val_split : val_split + test_split], + ), + 0, + ), + tf.concat( + ( + y_positives[val_split : val_split + test_split], + y_negatives[val_split : val_split + test_split], + ), + 0, + ), +) +x_train, y_train = ( + tf.concat( + ( + x_positives[val_split + test_split : val_split + test_split + train_split], + x_negatives[val_split + test_split : val_split + test_split + train_split], + ), + 0, + ), + tf.concat( + ( + y_positives[val_split + test_split : val_split + test_split + train_split], + y_negatives[val_split + test_split : val_split + test_split + train_split], + ), + 0, + ), +) + +# Remaining pool of samples are stored separately. 
These are only labeled as and when required +x_pool_positives, y_pool_positives = ( + x_positives[val_split + test_split + train_split :], + y_positives[val_split + test_split + train_split :], +) +x_pool_negatives, y_pool_negatives = ( + x_negatives[val_split + test_split + train_split :], + y_negatives[val_split + test_split + train_split :], +) + +# Creating TF Datasets for faster prefetching and parallelization +train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train)) +val_dataset = tf.data.Dataset.from_tensor_slices((x_val, y_val)) +test_dataset = tf.data.Dataset.from_tensor_slices((x_test, y_test)) + +pool_negatives = tf.data.Dataset.from_tensor_slices( + (x_pool_negatives, y_pool_negatives) +) +pool_positives = tf.data.Dataset.from_tensor_slices( + (x_pool_positives, y_pool_positives) +) + +print(f"Initial training set size: {len(train_dataset)}") +print(f"Validation set size: {len(val_dataset)}") +print(f"Testing set size: {len(test_dataset)}") +print(f"Unlabeled negative pool: {len(pool_negatives)}") +print(f"Unlabeled positive pool: {len(pool_positives)}") + +""" +### Fitting the `TextVectorization` layer + +Since we are working with text data, we will need to encode the text strings as vectors which +would then be passed through an `Embedding` layer. To make this tokenization process +faster, we use the `map()` function with its parallelization functionality. +""" + + +vectorizer = layers.TextVectorization( + 3000, standardize="lower_and_strip_punctuation", output_sequence_length=150 +) +# Adapting the dataset +vectorizer.adapt( + train_dataset.map(lambda x, y: x, num_parallel_calls=tf.data.AUTOTUNE).batch(256) +) + + +def vectorize_text(text, label): + text = vectorizer(text) + return text, label + + +train_dataset = train_dataset.map( + vectorize_text, num_parallel_calls=tf.data.AUTOTUNE +).prefetch(tf.data.AUTOTUNE) +pool_negatives = pool_negatives.map(vectorize_text, num_parallel_calls=tf.data.AUTOTUNE) +pool_positives = pool_positives.map(vectorize_text, num_parallel_calls=tf.data.AUTOTUNE) + +val_dataset = val_dataset.batch(256).map( + vectorize_text, num_parallel_calls=tf.data.AUTOTUNE +) +test_dataset = test_dataset.batch(256).map( + vectorize_text, num_parallel_calls=tf.data.AUTOTUNE +) + +""" +## Creating Helper Functions +""" + + +# Helper function for merging new history objects with older ones +def append_history(losses, val_losses, accuracy, val_accuracy, history): + losses = losses + history.history["loss"] + val_losses = val_losses + history.history["val_loss"] + accuracy = accuracy + history.history["binary_accuracy"] + val_accuracy = val_accuracy + history.history["val_binary_accuracy"] + return losses, val_losses, accuracy, val_accuracy + + +# Plotter function +def plot_history(losses, val_losses, accuracies, val_accuracies): + plt.plot(losses) + plt.plot(val_losses) + plt.legend(["train_loss", "val_loss"]) + plt.xlabel("Epochs") + plt.ylabel("Loss") + plt.show() + + plt.plot(accuracies) + plt.plot(val_accuracies) + plt.legend(["train_accuracy", "val_accuracy"]) + plt.xlabel("Epochs") + plt.ylabel("Accuracy") + plt.show() + + +""" +## Creating the Model + +We create a small bidirectional LSTM model. When using Active Learning, you should make sure +that the model architecture is capable of overfitting to the initial data. +Overfitting gives a strong hint that the model will have enough capacity for +future, unseen data. 
+""" + + +def create_model(): + model = keras.models.Sequential( + [ + layers.Input(shape=(150,)), + layers.Embedding(input_dim=3000, output_dim=128), + layers.Bidirectional(layers.LSTM(32, return_sequences=True)), + layers.GlobalMaxPool1D(), + layers.Dense(20, activation="relu"), + layers.Dropout(0.5), + layers.Dense(1, activation="sigmoid"), + ] + ) + model.summary() + return model + + +""" +## Training on the entire dataset + +To show the effectiveness of Active Learning, we will first train the model on the entire +dataset containing 40,000 labeled samples. This model will be used for comparison later. +""" + + +def train_full_model(full_train_dataset, val_dataset, test_dataset): + model = create_model() + model.compile( + loss="binary_crossentropy", + optimizer="rmsprop", + metrics=[ + keras.metrics.BinaryAccuracy(), + keras.metrics.FalseNegatives(), + keras.metrics.FalsePositives(), + ], + ) + + # We will save the best model at every epoch and load the best one for evaluation on the test set + history = model.fit( + full_train_dataset.batch(256), + epochs=20, + validation_data=val_dataset, + callbacks=[ + keras.callbacks.EarlyStopping(patience=4, verbose=1), + keras.callbacks.ModelCheckpoint( + "FullModelCheckpoint.keras", verbose=1, save_best_only=True + ), + ], + ) + + # Plot history + plot_history( + history.history["loss"], + history.history["val_loss"], + history.history["binary_accuracy"], + history.history["val_binary_accuracy"], + ) + + # Loading the best checkpoint + model = keras.models.load_model("FullModelCheckpoint.keras") + + print("-" * 100) + print( + "Test set evaluation: ", + model.evaluate(test_dataset, verbose=0, return_dict=True), + ) + print("-" * 100) + return model + + +# Sampling the full train dataset to train on +full_train_dataset = ( + train_dataset.concatenate(pool_positives) + .concatenate(pool_negatives) + .cache() + .shuffle(20000) +) + +# Training the full model +full_dataset_model = train_full_model(full_train_dataset, val_dataset, test_dataset) + +""" +## Training via Active Learning + +The general process we follow when performing Active Learning is demonstrated below: + +![Active Learning](https://i.imgur.com/dmNKusp.png) + +The pipeline can be summarized in five parts: + +1. Sample and annotate a small, balanced training dataset +2. Train the model on this small subset +3. Evaluate the model on a balanced testing set +4. If the model satisfies the business criteria, deploy it in a real time setting +5. If it doesn't pass the criteria, sample a few more samples according to the ratio of +false positives and negatives, add them to the training set and repeat from step 2 till +the model passes the tests or till all available data is exhausted. + +For the code below, we will perform sampling using the following formula:
+ +![Ratio Sampling](https://i.imgur.com/LyZEiZL.png) + +Active Learning techniques use callbacks extensively for progress tracking. We will be +using model checkpointing and early stopping for this example. The `patience` parameter +for Early Stopping can help minimize overfitting and the time required. We have set it +`patience=4` for now but since the model is robust, we can increase the patience level if +desired. + +Note: We are not loading the checkpoint after the first training iteration. In my +experience working on Active Learning techniques, this helps the model probe the +newly formed loss landscape. Even if the model fails to improve in the second iteration, +we will still gain insight about the possible future false positive and negative rates. +This will help us sample a better set in the next iteration where the model will have a +greater chance to improve. +""" + + +def train_active_learning_models( + train_dataset, + pool_negatives, + pool_positives, + val_dataset, + test_dataset, + num_iterations=3, + sampling_size=5000, +): + + # Creating lists for storing metrics + losses, val_losses, accuracies, val_accuracies = [], [], [], [] + + model = create_model() + # We will monitor the false positives and false negatives predicted by our model + # These will decide the subsequent sampling ratio for every Active Learning loop + model.compile( + loss="binary_crossentropy", + optimizer="rmsprop", + metrics=[ + keras.metrics.BinaryAccuracy(), + keras.metrics.FalseNegatives(), + keras.metrics.FalsePositives(), + ], + ) + + # Defining checkpoints. + # The checkpoint callback is reused throughout the training since it only saves the best overall model. + checkpoint = keras.callbacks.ModelCheckpoint( + "AL_Model.keras", save_best_only=True, verbose=1 + ) + # Here, patience is set to 4. This can be set higher if desired. 
+ early_stopping = keras.callbacks.EarlyStopping(patience=4, verbose=1) + + print(f"Starting to train with {len(train_dataset)} samples") + # Initial fit with a small subset of the training set + history = model.fit( + train_dataset.cache().shuffle(20000).batch(256), + epochs=20, + validation_data=val_dataset, + callbacks=[checkpoint, early_stopping], + ) + + # Appending history + losses, val_losses, accuracies, val_accuracies = append_history( + losses, val_losses, accuracies, val_accuracies, history + ) + + for iteration in range(num_iterations): + # Getting predictions from previously trained model + predictions = model.predict(test_dataset) + + # Generating labels from the output probabilities + rounded = ops.where(ops.greater(predictions, 0.5), 1, 0) + + # Evaluating the number of zeros and ones incorrrectly classified + _, _, false_negatives, false_positives = model.evaluate(test_dataset, verbose=0) + + print("-" * 100) + print( + f"Number of zeros incorrectly classified: {false_negatives}, Number of ones incorrectly classified: {false_positives}" + ) + + # This technique of Active Learning demonstrates ratio based sampling where + # Number of ones/zeros to sample = Number of ones/zeros incorrectly classified / Total incorrectly classified + if false_negatives != 0 and false_positives != 0: + total = false_negatives + false_positives + sample_ratio_ones, sample_ratio_zeros = ( + false_positives / total, + false_negatives / total, + ) + # In the case where all samples are correctly predicted, we can sample both classes equally + else: + sample_ratio_ones, sample_ratio_zeros = 0.5, 0.5 + + print( + f"Sample ratio for positives: {sample_ratio_ones}, Sample ratio for negatives:{sample_ratio_zeros}" + ) + + # Sample the required number of ones and zeros + sampled_dataset = pool_negatives.take( + int(sample_ratio_zeros * sampling_size) + ).concatenate(pool_positives.take(int(sample_ratio_ones * sampling_size))) + + # Skip the sampled data points to avoid repetition of sample + pool_negatives = pool_negatives.skip(int(sample_ratio_zeros * sampling_size)) + pool_positives = pool_positives.skip(int(sample_ratio_ones * sampling_size)) + + # Concatenating the train_dataset with the sampled_dataset + train_dataset = train_dataset.concatenate(sampled_dataset).prefetch( + tf.data.AUTOTUNE + ) + + print(f"Starting training with {len(train_dataset)} samples") + print("-" * 100) + + # We recompile the model to reset the optimizer states and retrain the model + model.compile( + loss="binary_crossentropy", + optimizer="rmsprop", + metrics=[ + keras.metrics.BinaryAccuracy(), + keras.metrics.FalseNegatives(), + keras.metrics.FalsePositives(), + ], + ) + history = model.fit( + train_dataset.cache().shuffle(20000).batch(256), + validation_data=val_dataset, + epochs=20, + callbacks=[ + checkpoint, + keras.callbacks.EarlyStopping(patience=4, verbose=1), + ], + ) + + # Appending the history + losses, val_losses, accuracies, val_accuracies = append_history( + losses, val_losses, accuracies, val_accuracies, history + ) + + # Loading the best model from this training loop + model = keras.models.load_model("AL_Model.keras") + + # Plotting the overall history and evaluating the final model + plot_history(losses, val_losses, accuracies, val_accuracies) + print("-" * 100) + print( + "Test set evaluation: ", + model.evaluate(test_dataset, verbose=0, return_dict=True), + ) + print("-" * 100) + + return model + + +active_learning_model = train_active_learning_models( + train_dataset, pool_negatives, pool_positives, 
val_dataset, test_dataset +) + +""" +## Conclusion + +Active Learning is a growing area of research. This example demonstrates the cost-efficiency +benefits of using Active Learning, as it eliminates the need to annotate large amounts of +data, saving resources. + +The following are some noteworthy observations from this example: + +1. We only require 30,000 samples to reach the same (if not better) scores as the model +trained on the full dataset. This means that in a real life setting, we save the effort +required for annotating 10,000 images! +2. The number of false negatives and false positives are well balanced at the end of the +training as compared to the skewed ratio obtained from the full training. This makes the +model slightly more useful in real life scenarios where both the labels hold equal +importance. + +For further reading about the types of sampling ratios, training techniques or available +open source libraries/implementations, you can refer to the resources below: + +1. [Active Learning Literature Survey](http://burrsettles.com/pub/settles.activelearning.pdf) (Burr Settles, 2010). +2. [modAL](https://github.com/modAL-python/modAL): A Modular Active Learning framework. +3. Google's unofficial [Active Learning playground](https://github.com/google/active-learning). +""" diff --git a/knowledge_base/nlp/addition_rnn.py b/knowledge_base/nlp/addition_rnn.py new file mode 100644 index 0000000000000000000000000000000000000000..409d64568e139149824a080a835aab1af88cb0e8 --- /dev/null +++ b/knowledge_base/nlp/addition_rnn.py @@ -0,0 +1,251 @@ +""" +Title: Sequence to sequence learning for performing number addition +Author: [Smerity](https://twitter.com/Smerity) and others +Date created: 2015/08/17 +Last modified: 2024/02/13 +Description: A model that learns to add strings of numbers, e.g. "535+61" -> "596". +Accelerator: GPU +""" + +""" +## Introduction + +In this example, we train a model to learn to add two numbers, provided as strings. + +**Example:** + +- Input: "535+61" +- Output: "596" + +Input may optionally be reversed, which was shown to increase performance in many tasks + in: [Learning to Execute](http://arxiv.org/abs/1410.4615) and +[Sequence to Sequence Learning with Neural Networks](http://papers.nips.cc/paper/5346-sequence-to-sequence-learning-with-neural-networks.pdf). + +Theoretically, sequence order inversion introduces shorter term dependencies between + source and target for this problem. + +**Results:** + +For two digits (reversed): + ++ One layer LSTM (128 HN), 5k training examples = 99% train/test accuracy in 55 epochs + +Three digits (reversed): + ++ One layer LSTM (128 HN), 50k training examples = 99% train/test accuracy in 100 epochs + +Four digits (reversed): + ++ One layer LSTM (128 HN), 400k training examples = 99% train/test accuracy in 20 epochs + +Five digits (reversed): + ++ One layer LSTM (128 HN), 550k training examples = 99% train/test accuracy in 30 epochs +""" + +""" +## Setup +""" + +import keras +from keras import layers +import numpy as np + +# Parameters for the model and dataset. +TRAINING_SIZE = 50000 +DIGITS = 3 +REVERSE = True + +# Maximum length of input is 'int + int' (e.g., '345+678'). Maximum length of +# int is DIGITS. 
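+# With DIGITS = 3 this gives MAXLEN = 3 + 1 + 3 = 7, matching e.g. '345+678'.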
+MAXLEN = DIGITS + 1 + DIGITS + +""" +## Generate the data +""" + + +class CharacterTable: + """Given a set of characters: + + Encode them to a one-hot integer representation + + Decode the one-hot or integer representation to their character output + + Decode a vector of probabilities to their character output + """ + + def __init__(self, chars): + """Initialize character table. + # Arguments + chars: Characters that can appear in the input. + """ + self.chars = sorted(set(chars)) + self.char_indices = dict((c, i) for i, c in enumerate(self.chars)) + self.indices_char = dict((i, c) for i, c in enumerate(self.chars)) + + def encode(self, C, num_rows): + """One-hot encode given string C. + # Arguments + C: string, to be encoded. + num_rows: Number of rows in the returned one-hot encoding. This is + used to keep the # of rows for each data the same. + """ + x = np.zeros((num_rows, len(self.chars))) + for i, c in enumerate(C): + x[i, self.char_indices[c]] = 1 + return x + + def decode(self, x, calc_argmax=True): + """Decode the given vector or 2D array to their character output. + # Arguments + x: A vector or a 2D array of probabilities or one-hot representations; + or a vector of character indices (used with `calc_argmax=False`). + calc_argmax: Whether to find the character index with maximum + probability, defaults to `True`. + """ + if calc_argmax: + x = x.argmax(axis=-1) + return "".join(self.indices_char[x] for x in x) + + +# All the numbers, plus sign and space for padding. +chars = "0123456789+ " +ctable = CharacterTable(chars) + +questions = [] +expected = [] +seen = set() +print("Generating data...") +while len(questions) < TRAINING_SIZE: + f = lambda: int( + "".join( + np.random.choice(list("0123456789")) + for i in range(np.random.randint(1, DIGITS + 1)) + ) + ) + a, b = f(), f() + # Skip any addition questions we've already seen + # Also skip any such that x+Y == Y+x (hence the sorting). + key = tuple(sorted((a, b))) + if key in seen: + continue + seen.add(key) + # Pad the data with spaces such that it is always MAXLEN. + q = "{}+{}".format(a, b) + query = q + " " * (MAXLEN - len(q)) + ans = str(a + b) + # Answers can be of maximum size DIGITS + 1. + ans += " " * (DIGITS + 1 - len(ans)) + if REVERSE: + # Reverse the query, e.g., '12+345 ' becomes ' 543+21'. (Note the + # space used for padding.) + query = query[::-1] + questions.append(query) + expected.append(ans) +print("Total questions:", len(questions)) + +""" +## Vectorize the data +""" + +print("Vectorization...") +x = np.zeros((len(questions), MAXLEN, len(chars)), dtype=bool) +y = np.zeros((len(questions), DIGITS + 1, len(chars)), dtype=bool) +for i, sentence in enumerate(questions): + x[i] = ctable.encode(sentence, MAXLEN) +for i, sentence in enumerate(expected): + y[i] = ctable.encode(sentence, DIGITS + 1) + +# Shuffle (x, y) in unison as the later parts of x will almost all be larger +# digits. +indices = np.arange(len(y)) +np.random.shuffle(indices) +x = x[indices] +y = y[indices] + +# Explicitly set apart 10% for validation data that we never train over. +split_at = len(x) - len(x) // 10 +(x_train, x_val) = x[:split_at], x[split_at:] +(y_train, y_val) = y[:split_at], y[split_at:] + +print("Training Data:") +print(x_train.shape) +print(y_train.shape) + +print("Validation Data:") +print(x_val.shape) +print(y_val.shape) + +""" +## Build the model +""" + +print("Build model...") +num_layers = 1 # Try to add more LSTM layers! 
+ +model = keras.Sequential() +# "Encode" the input sequence using a LSTM, producing an output of size 128. +# Note: In a situation where your input sequences have a variable length, +# use input_shape=(None, num_feature). +model.add(layers.Input((MAXLEN, len(chars)))) +model.add(layers.LSTM(128)) +# As the decoder RNN's input, repeatedly provide with the last output of +# RNN for each time step. Repeat 'DIGITS + 1' times as that's the maximum +# length of output, e.g., when DIGITS=3, max output is 999+999=1998. +model.add(layers.RepeatVector(DIGITS + 1)) +# The decoder RNN could be multiple layers stacked or a single layer. +for _ in range(num_layers): + # By setting return_sequences to True, return not only the last output but + # all the outputs so far in the form of (num_samples, timesteps, + # output_dim). This is necessary as TimeDistributed in the below expects + # the first dimension to be the timesteps. + model.add(layers.LSTM(128, return_sequences=True)) + +# Apply a dense layer to the every temporal slice of an input. For each of step +# of the output sequence, decide which character should be chosen. +model.add(layers.Dense(len(chars), activation="softmax")) +model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"]) +model.summary() + +""" +## Train the model +""" + +# Training parameters. +epochs = 30 +batch_size = 32 + +# Formatting characters for results display. +green_color = "\033[92m" +red_color = "\033[91m" +end_char = "\033[0m" + +# Train the model each generation and show predictions against the validation +# dataset. +for epoch in range(1, epochs): + print() + print("Iteration", epoch) + model.fit( + x_train, + y_train, + batch_size=batch_size, + epochs=1, + validation_data=(x_val, y_val), + ) + # Select 10 samples from the validation set at random so we can visualize + # errors. + for i in range(10): + ind = np.random.randint(0, len(x_val)) + rowx, rowy = x_val[np.array([ind])], y_val[np.array([ind])] + preds = np.argmax(model.predict(rowx, verbose=0), axis=-1) + q = ctable.decode(rowx[0]) + correct = ctable.decode(rowy[0]) + guess = ctable.decode(preds[0], calc_argmax=False) + print("Q", q[::-1] if REVERSE else q, end=" ") + print("T", correct, end=" ") + if correct == guess: + print(f"{green_color}โ˜‘ {guess}{end_char}") + else: + print(f"{red_color}โ˜’ {guess}{end_char}") + +""" +You'll get to 99+% validation accuracy after ~30 epochs. +""" diff --git a/knowledge_base/nlp/bidirectional_lstm_imdb.py b/knowledge_base/nlp/bidirectional_lstm_imdb.py new file mode 100644 index 0000000000000000000000000000000000000000..c52814532e9207c51925e76e04b555c9224bbbda --- /dev/null +++ b/knowledge_base/nlp/bidirectional_lstm_imdb.py @@ -0,0 +1,59 @@ +""" +Title: Bidirectional LSTM on IMDB +Author: [fchollet](https://twitter.com/fchollet) +Date created: 2020/05/03 +Last modified: 2020/05/03 +Description: Train a 2-layer bidirectional LSTM on the IMDB movie review sentiment classification dataset. 
+Accelerator: GPU +""" + +""" +## Setup +""" + +import numpy as np +import keras +from keras import layers + +max_features = 20000 # Only consider the top 20k words +maxlen = 200 # Only consider the first 200 words of each movie review + +""" +## Build the model +""" + +# Input for variable-length sequences of integers +inputs = keras.Input(shape=(None,), dtype="int32") +# Embed each integer in a 128-dimensional vector +x = layers.Embedding(max_features, 128)(inputs) +# Add 2 bidirectional LSTMs +x = layers.Bidirectional(layers.LSTM(64, return_sequences=True))(x) +x = layers.Bidirectional(layers.LSTM(64))(x) +# Add a classifier +outputs = layers.Dense(1, activation="sigmoid")(x) +model = keras.Model(inputs, outputs) +model.summary() + +""" +## Load the IMDB movie review sentiment data +""" + +(x_train, y_train), (x_val, y_val) = keras.datasets.imdb.load_data( + num_words=max_features +) +print(len(x_train), "Training sequences") +print(len(x_val), "Validation sequences") +# Use pad_sequence to standardize sequence length: +# this will truncate sequences longer than 200 words and zero-pad sequences shorter than 200 words. +x_train = keras.utils.pad_sequences(x_train, maxlen=maxlen) +x_val = keras.utils.pad_sequences(x_val, maxlen=maxlen) + +""" +## Train and evaluate the model + +You can use the trained model hosted on [Hugging Face Hub](https://huggingface.co/keras-io/bidirectional-lstm-imdb) +and try the demo on [Hugging Face Spaces](https://huggingface.co/spaces/keras-io/bidirectional_lstm_imdb). +""" + +model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"]) +model.fit(x_train, y_train, batch_size=32, epochs=2, validation_data=(x_val, y_val)) diff --git a/knowledge_base/nlp/data_parallel_training_with_keras_hub.py b/knowledge_base/nlp/data_parallel_training_with_keras_hub.py new file mode 100644 index 0000000000000000000000000000000000000000..88912968e4175f6ab62027c8807b5e728b5ff7ef --- /dev/null +++ b/knowledge_base/nlp/data_parallel_training_with_keras_hub.py @@ -0,0 +1,242 @@ +""" +Title: Data Parallel Training with KerasHub and tf.distribute +Author: Anshuman Mishra +Date created: 2023/07/07 +Last modified: 2023/07/07 +Description: Data Parallel training with KerasHub and tf.distribute. +Accelerator: GPU +""" + +""" +## Introduction + +Distributed training is a technique used to train deep learning models on multiple devices +or machines simultaneously. It helps to reduce training time and allows for training larger +models with more data. KerasHub is a library that provides tools and utilities for natural +language processing tasks, including distributed training. + +In this tutorial, we will use KerasHub to train a BERT-based masked language model (MLM) +on the wikitext-2 dataset (a 2 million word dataset of wikipedia articles). The MLM task +involves predicting the masked words in a sentence, which helps the model learn contextual +representations of words. + +This guide focuses on data parallelism, in particular synchronous data parallelism, where +each accelerator (a GPU or TPU) holds a complete replica of the model, and sees a +different partial batch of the input data. Partial gradients are computed on each device, +aggregated, and used to compute a global gradient update. 
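+
+In Keras terms, the whole recipe fits in a few lines. The snippet below is only
+a preview (`build_and_compile_model()` and `train_dataset` are placeholders);
+the full, runnable version appears later in this guide:
+
+```python
+strategy = tf.distribute.MirroredStrategy()
+with strategy.scope():
+    # Model construction & compilation must happen inside the scope.
+    model = build_and_compile_model()
+# Each replica sees a different slice of every batch during `fit()`.
+model.fit(train_dataset, epochs=3)
+```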
+ +Specifically, this guide teaches you how to use the `tf.distribute` API to train Keras +models on multiple GPUs, with minimal changes to your code, in the following two setups: + +- On multiple GPUs (typically 2 to 8) installed on a single machine (single host, +multi-device training). This is the most common setup for researchers and small-scale +industry workflows. +- On a cluster of many machines, each hosting one or multiple GPUs (multi-worker +distributed training). This is a good setup for large-scale industry workflows, e.g. +training high-resolution text summarization models on billion word datasets on 20-100 GPUs. +""" + +"""shell +pip install -q --upgrade keras-hub +pip install -q --upgrade keras # Upgrade to Keras 3. +""" + +""" +## Imports +""" + +import os + +os.environ["KERAS_BACKEND"] = "tensorflow" + +import tensorflow as tf +import keras +import keras_hub + +""" +Before we start any training, let's configure our single GPU to show up as two logical +devices. + +When you are training with two or more physical GPUs, this is totally uncessary. This +is just a trick to show real distributed training on the default colab GPU runtime, +which has only one GPU available. +""" + +"""shell +nvidia-smi --query-gpu=memory.total --format=csv,noheader +""" + +physical_devices = tf.config.list_physical_devices("GPU") +tf.config.set_logical_device_configuration( + physical_devices[0], + [ + tf.config.LogicalDeviceConfiguration(memory_limit=15360 // 2), + tf.config.LogicalDeviceConfiguration(memory_limit=15360 // 2), + ], +) + +logical_devices = tf.config.list_logical_devices("GPU") +logical_devices + +EPOCHS = 3 + + +""" +To do single-host, multi-device synchronous training with a Keras model, you would use +the `tf.distribute.MirroredStrategy` API. Here's how it works: + +- Instantiate a `MirroredStrategy`, optionally configuring which specific devices you +want to use (by default the strategy will use all GPUs available). +- Use the strategy object to open a scope, and within this scope, create all the Keras +objects you need that contain variables. Typically, that means **creating & compiling the +model** inside the distribution scope. +- Train the model via `fit()` as usual. +""" +strategy = tf.distribute.MirroredStrategy() +print(f"Number of devices: {strategy.num_replicas_in_sync}") + +""" +Base batch size and learning rate +""" +base_batch_size = 32 +base_learning_rate = 1e-4 + +""" +Calculate scaled batch size and learning rate + +""" +scaled_batch_size = base_batch_size * strategy.num_replicas_in_sync +scaled_learning_rate = base_learning_rate * strategy.num_replicas_in_sync + +""" +Now, we need to download and preprocess the wikitext-2 dataset. This dataset will be +used for pretraining the BERT model. We will filter out short lines to ensure that the +data has enough context for training. +""" + +keras.utils.get_file( + origin="https://s3.amazonaws.com/research.metamind.io/wikitext/wikitext-2-v1.zip", + extract=True, +) +wiki_dir = os.path.expanduser("~/.keras/datasets/wikitext-2/") + +# Load wikitext-103 and filter out short lines. 
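+# Each split goes through the same tf.data pipeline: keep lines longer than 100
+# characters, shuffle, batch at the replica-scaled batch size, then cache and prefetch.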
+wiki_train_ds = ( + tf.data.TextLineDataset( + wiki_dir + "wiki.train.tokens", + ) + .filter(lambda x: tf.strings.length(x) > 100) + .shuffle(buffer_size=500) + .batch(scaled_batch_size) + .cache() + .prefetch(tf.data.AUTOTUNE) +) +wiki_val_ds = ( + tf.data.TextLineDataset(wiki_dir + "wiki.valid.tokens") + .filter(lambda x: tf.strings.length(x) > 100) + .shuffle(buffer_size=500) + .batch(scaled_batch_size) + .cache() + .prefetch(tf.data.AUTOTUNE) +) +wiki_test_ds = ( + tf.data.TextLineDataset(wiki_dir + "wiki.test.tokens") + .filter(lambda x: tf.strings.length(x) > 100) + .shuffle(buffer_size=500) + .batch(scaled_batch_size) + .cache() + .prefetch(tf.data.AUTOTUNE) +) + +""" +In the above code, we download the wikitext-2 dataset and extract it. Then, we define +three datasets: wiki_train_ds, wiki_val_ds, and wiki_test_ds. These datasets are +filtered to remove short lines and are batched for efficient training. +""" + +""" +It's a common practice to use a decayed learning rate in NLP training/tuning. We'll +use `PolynomialDecay` schedule here. + +""" + +total_training_steps = sum(1 for _ in wiki_train_ds.as_numpy_iterator()) * EPOCHS +lr_schedule = tf.keras.optimizers.schedules.PolynomialDecay( + initial_learning_rate=scaled_learning_rate, + decay_steps=total_training_steps, + end_learning_rate=0.0, +) + + +class PrintLR(tf.keras.callbacks.Callback): + def on_epoch_end(self, epoch, logs=None): + print( + f"\nLearning rate for epoch {epoch + 1} is {model_dist.optimizer.learning_rate.numpy()}" + ) + + +""" +Let's also make a callback to TensorBoard, this will enable visualization of different +metrics while we train the model in later part of this tutorial. We put all the callbacks +together as follows: +""" +callbacks = [ + tf.keras.callbacks.TensorBoard(log_dir="./logs"), + PrintLR(), +] + + +print(tf.config.list_physical_devices("GPU")) + + +""" +With the datasets prepared, we now initialize and compile our model and optimizer within +the `strategy.scope()`: +""" + +with strategy.scope(): + # Everything that creates variables should be under the strategy scope. + # In general this is only model construction & `compile()`. + model_dist = keras_hub.models.BertMaskedLM.from_preset("bert_tiny_en_uncased") + + # This line just sets pooled_dense layer as non-trainiable, we do this to avoid + # warnings of this layer being unused + model_dist.get_layer("bert_backbone").get_layer("pooled_dense").trainable = False + + model_dist.compile( + loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), + optimizer=tf.keras.optimizers.AdamW(learning_rate=scaled_learning_rate), + weighted_metrics=[keras.metrics.SparseCategoricalAccuracy()], + jit_compile=False, + ) + + model_dist.fit( + wiki_train_ds, validation_data=wiki_val_ds, epochs=EPOCHS, callbacks=callbacks + ) + +""" +After fitting our model under the scope, we evaluate it normally! +""" + +model_dist.evaluate(wiki_test_ds) + +""" +For distributed training across multiple machines (as opposed to training that only leverages +multiple devices on a single machine), there are two distribution strategies you +could use: `MultiWorkerMirroredStrategy` and `ParameterServerStrategy`: + +- `tf.distribute.MultiWorkerMirroredStrategy` implements a synchronous CPU/GPU +multi-worker solution to work with Keras-style model building and training loop, +using synchronous reduction of gradients across the replicas. 
+- `tf.distribute.experimental.ParameterServerStrategy` implements an asynchronous CPU/GPU +multi-worker solution, where the parameters are stored on parameter servers, and +workers update the gradients to parameter servers asynchronously. + +### Further reading + +1. [TensorFlow distributed training guide](https://www.tensorflow.org/guide/distributed_training) +2. [Tutorial on multi-worker training with Keras](https://www.tensorflow.org/tutorials/distribute/multi_worker_with_keras) +3. [MirroredStrategy docs](https://www.tensorflow.org/api_docs/python/tf/distribute/MirroredStrategy) +4. [MultiWorkerMirroredStrategy docs](https://www.tensorflow.org/api_docs/python/tf/distribute/experimental/MultiWorkerMirroredStrategy) +5. [Distributed training in tf.keras with Weights & Biases](https://towardsdatascience.com/distributed-training-in-tf-keras-with-w-b-ccf021f9322e) +""" diff --git a/knowledge_base/nlp/fnet_classification_with_keras_hub.py b/knowledge_base/nlp/fnet_classification_with_keras_hub.py new file mode 100644 index 0000000000000000000000000000000000000000..8ca026cdb1ad16748d932abc91d25f8021e44819 --- /dev/null +++ b/knowledge_base/nlp/fnet_classification_with_keras_hub.py @@ -0,0 +1,366 @@ +""" +Title: Text Classification using FNet +Author: [Abheesht Sharma](https://github.com/abheesht17/) +Date created: 2022/06/01 +Last modified: 2022/12/21 +Description: Text Classification on the IMDb Dataset using `keras_hub.layers.FNetEncoder` layer. +Accelerator: GPU +""" + +""" +## Introduction + +In this example, we will demonstrate the ability of FNet to achieve comparable +results with a vanilla Transformer model on the text classification task. +We will be using the IMDb dataset, which is a +collection of movie reviews labelled either positive or negative (sentiment +analysis). + +To build the tokenizer, model, etc., we will use components from +[KerasHub](https://github.com/keras-team/keras-hub). KerasHub makes life easier +for people who want to build NLP pipelines! :) + +### Model + +Transformer-based language models (LMs) such as BERT, RoBERTa, XLNet, etc. have +demonstrated the effectiveness of the self-attention mechanism for computing +rich embeddings for input text. However, the self-attention mechanism is an +expensive operation, with a time complexity of `O(n^2)`, where `n` is the number +of tokens in the input. Hence, there has been an effort to reduce the time +complexity of the self-attention mechanism and improve performance without +sacrificing the quality of results. + +In 2020, a paper titled +[FNet: Mixing Tokens with Fourier Transforms](https://arxiv.org/abs/2105.03824) +replaced the self-attention layer in BERT with a simple Fourier Transform layer +for "token mixing". This resulted in comparable accuracy and a speed-up during +training. In particular, a couple of points from the paper stand out: + +* The authors claim that FNet is 80% faster than BERT on GPUs and 70% faster on +TPUs. The reason for this speed-up is two-fold: a) the Fourier Transform layer +is unparametrized, it does not have any parameters, and b) the authors use Fast +Fourier Transform (FFT); this reduces the time complexity from `O(n^2)` +(in the case of self-attention) to `O(n log n)`. +* FNet manages to achieve 92-97% of the accuracy of BERT on the GLUE benchmark. +""" + +""" +## Setup + +Before we start with the implementation, let's import all the necessary packages. +""" + +"""shell +pip install -q --upgrade keras-hub +pip install -q --upgrade keras # Upgrade to Keras 3. 
+""" + +import keras_hub +import keras +import tensorflow as tf +import os + +keras.utils.set_random_seed(42) + +""" +Let's also define our hyperparameters. +""" +BATCH_SIZE = 64 +EPOCHS = 3 +MAX_SEQUENCE_LENGTH = 512 +VOCAB_SIZE = 15000 + +EMBED_DIM = 128 +INTERMEDIATE_DIM = 512 + +""" +## Loading the dataset + +First, let's download the IMDB dataset and extract it. +""" + +"""shell +wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz +tar -xzf aclImdb_v1.tar.gz +""" + +""" +Samples are present in the form of text files. Let's inspect the structure of +the directory. +""" + +print(os.listdir("./aclImdb")) +print(os.listdir("./aclImdb/train")) +print(os.listdir("./aclImdb/test")) + +""" +The directory contains two sub-directories: `train` and `test`. Each subdirectory +in turn contains two folders: `pos` and `neg` for positive and negative reviews, +respectively. Before we load the dataset, let's delete the `./aclImdb/train/unsup` +folder since it has unlabelled samples. +""" + +"""shell +rm -rf aclImdb/train/unsup +""" + +""" +We'll use the `keras.utils.text_dataset_from_directory` utility to generate +our labelled `tf.data.Dataset` dataset from text files. +""" + +train_ds = keras.utils.text_dataset_from_directory( + "aclImdb/train", + batch_size=BATCH_SIZE, + validation_split=0.2, + subset="training", + seed=42, +) +val_ds = keras.utils.text_dataset_from_directory( + "aclImdb/train", + batch_size=BATCH_SIZE, + validation_split=0.2, + subset="validation", + seed=42, +) +test_ds = keras.utils.text_dataset_from_directory("aclImdb/test", batch_size=BATCH_SIZE) + +""" +We will now convert the text to lowercase. +""" +train_ds = train_ds.map(lambda x, y: (tf.strings.lower(x), y)) +val_ds = val_ds.map(lambda x, y: (tf.strings.lower(x), y)) +test_ds = test_ds.map(lambda x, y: (tf.strings.lower(x), y)) + +""" +Let's print a few samples. +""" +for text_batch, label_batch in train_ds.take(1): + for i in range(3): + print(text_batch.numpy()[i]) + print(label_batch.numpy()[i]) + + +""" +### Tokenizing the data + +We'll be using the `keras_hub.tokenizers.WordPieceTokenizer` layer to tokenize +the text. `keras_hub.tokenizers.WordPieceTokenizer` takes a WordPiece vocabulary +and has functions for tokenizing the text, and detokenizing sequences of tokens. + +Before we define the tokenizer, we first need to train it on the dataset +we have. The WordPiece tokenization algorithm is a subword tokenization algorithm; +training it on a corpus gives us a vocabulary of subwords. A subword tokenizer +is a compromise between word tokenizers (word tokenizers need very large +vocabularies for good coverage of input words), and character tokenizers +(characters don't really encode meaning like words do). Luckily, KerasHub +makes it very simple to train WordPiece on a corpus with the +`keras_hub.tokenizers.compute_word_piece_vocabulary` utility. + +Note: The official implementation of FNet uses the SentencePiece Tokenizer. +""" + + +def train_word_piece(ds, vocab_size, reserved_tokens): + word_piece_ds = ds.unbatch().map(lambda x, y: x) + vocab = keras_hub.tokenizers.compute_word_piece_vocabulary( + word_piece_ds.batch(1000).prefetch(2), + vocabulary_size=vocab_size, + reserved_tokens=reserved_tokens, + ) + return vocab + + +""" +Every vocabulary has a few special, reserved tokens. We have two such tokens: + +- `"[PAD]"` - Padding token. Padding tokens are appended to the input sequence length +when the input sequence length is shorter than the maximum sequence length. +- `"[UNK]"` - Unknown token. 
+""" +reserved_tokens = ["[PAD]", "[UNK]"] +train_sentences = [element[0] for element in train_ds] +vocab = train_word_piece(train_ds, VOCAB_SIZE, reserved_tokens) + +""" +Let's see some tokens! +""" +print("Tokens: ", vocab[100:110]) + +""" +Now, let's define the tokenizer. We will configure the tokenizer with the +the vocabularies trained above. We will define a maximum sequence length so that +all sequences are padded to the same length, if the length of the sequence is +less than the specified sequence length. Otherwise, the sequence is truncated. +""" +tokenizer = keras_hub.tokenizers.WordPieceTokenizer( + vocabulary=vocab, + lowercase=False, + sequence_length=MAX_SEQUENCE_LENGTH, +) + +""" +Let's try and tokenize a sample from our dataset! To verify whether the text has +been tokenized correctly, we can also detokenize the list of tokens back to the +original text. +""" +input_sentence_ex = train_ds.take(1).get_single_element()[0][0] +input_tokens_ex = tokenizer(input_sentence_ex) + +print("Sentence: ", input_sentence_ex) +print("Tokens: ", input_tokens_ex) +print("Recovered text after detokenizing: ", tokenizer.detokenize(input_tokens_ex)) + + +""" +## Formatting the dataset + +Next, we'll format our datasets in the form that will be fed to the models. We +need to tokenize the text. +""" + + +def format_dataset(sentence, label): + sentence = tokenizer(sentence) + return ({"input_ids": sentence}, label) + + +def make_dataset(dataset): + dataset = dataset.map(format_dataset, num_parallel_calls=tf.data.AUTOTUNE) + return dataset.shuffle(512).prefetch(16).cache() + + +train_ds = make_dataset(train_ds) +val_ds = make_dataset(val_ds) +test_ds = make_dataset(test_ds) + +""" +## Building the model + +Now, let's move on to the exciting part - defining our model! +We first need an embedding layer, i.e., a layer that maps every token in the input +sequence to a vector. This embedding layer can be initialised randomly. We also +need a positional embedding layer which encodes the word order in the sequence. +The convention is to add, i.e., sum, these two embeddings. KerasHub has a +`keras_hub.layers.TokenAndPositionEmbedding ` layer which does all of the above +steps for us. + +Our FNet classification model consists of three `keras_hub.layers.FNetEncoder` +layers with a `keras.layers.Dense` layer on top. + +Note: For FNet, masking the padding tokens has a minimal effect on results. In the +official implementation, the padding tokens are not masked. +""" + +input_ids = keras.Input(shape=(None,), dtype="int64", name="input_ids") + +x = keras_hub.layers.TokenAndPositionEmbedding( + vocabulary_size=VOCAB_SIZE, + sequence_length=MAX_SEQUENCE_LENGTH, + embedding_dim=EMBED_DIM, + mask_zero=True, +)(input_ids) + +x = keras_hub.layers.FNetEncoder(intermediate_dim=INTERMEDIATE_DIM)(inputs=x) +x = keras_hub.layers.FNetEncoder(intermediate_dim=INTERMEDIATE_DIM)(inputs=x) +x = keras_hub.layers.FNetEncoder(intermediate_dim=INTERMEDIATE_DIM)(inputs=x) + + +x = keras.layers.GlobalAveragePooling1D()(x) +x = keras.layers.Dropout(0.1)(x) +outputs = keras.layers.Dense(1, activation="sigmoid")(x) + +fnet_classifier = keras.Model(input_ids, outputs, name="fnet_classifier") + +""" +## Training our model + +We'll use accuracy to monitor training progress on the validation data. Let's +train our model for 3 epochs. 
+""" +fnet_classifier.summary() +fnet_classifier.compile( + optimizer=keras.optimizers.Adam(learning_rate=0.001), + loss="binary_crossentropy", + metrics=["accuracy"], +) +fnet_classifier.fit(train_ds, epochs=EPOCHS, validation_data=val_ds) + +""" +We obtain a train accuracy of around 92% and a validation accuracy of around +85%. Moreover, for 3 epochs, it takes around 86 seconds to train the model +(on Colab with a 16 GB Tesla T4 GPU). + +Let's calculate the test accuracy. +""" +fnet_classifier.evaluate(test_ds, batch_size=BATCH_SIZE) + + +""" +## Comparison with Transformer model + +Let's compare our FNet Classifier model with a Transformer Classifier model. We +keep all the parameters/hyperparameters the same. For example, we use three +`TransformerEncoder` layers. + +We set the number of heads to 2. +""" +NUM_HEADS = 2 +input_ids = keras.Input(shape=(None,), dtype="int64", name="input_ids") + + +x = keras_hub.layers.TokenAndPositionEmbedding( + vocabulary_size=VOCAB_SIZE, + sequence_length=MAX_SEQUENCE_LENGTH, + embedding_dim=EMBED_DIM, + mask_zero=True, +)(input_ids) + +x = keras_hub.layers.TransformerEncoder( + intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS +)(inputs=x) +x = keras_hub.layers.TransformerEncoder( + intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS +)(inputs=x) +x = keras_hub.layers.TransformerEncoder( + intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS +)(inputs=x) + + +x = keras.layers.GlobalAveragePooling1D()(x) +x = keras.layers.Dropout(0.1)(x) +outputs = keras.layers.Dense(1, activation="sigmoid")(x) + +transformer_classifier = keras.Model(input_ids, outputs, name="transformer_classifier") + + +transformer_classifier.summary() +transformer_classifier.compile( + optimizer=keras.optimizers.Adam(learning_rate=0.001), + loss="binary_crossentropy", + metrics=["accuracy"], +) +transformer_classifier.fit(train_ds, epochs=EPOCHS, validation_data=val_ds) + +""" +We obtain a train accuracy of around 94% and a validation accuracy of around +86.5%. It takes around 146 seconds to train the model (on Colab with a 16 GB Tesla +T4 GPU). + +Let's calculate the test accuracy. +""" +transformer_classifier.evaluate(test_ds, batch_size=BATCH_SIZE) + +""" +Let's make a table and compare the two models. We can see that FNet +significantly speeds up our run time (1.7x), with only a small sacrifice in +overall accuracy (drop of 0.75%). + +| | **FNet Classifier** | **Transformer Classifier** | +|:-----------------------:|:-------------------:|:--------------------------:| +| **Training Time** | 86 seconds | 146 seconds | +| **Train Accuracy** | 92.34% | 93.85% | +| **Validation Accuracy** | 85.21% | 86.42% | +| **Test Accuracy** | 83.94% | 84.69% | +| **#Params** | 2,321,921 | 2,520,065 | +""" diff --git a/knowledge_base/nlp/lstm_seq2seq.py b/knowledge_base/nlp/lstm_seq2seq.py new file mode 100644 index 0000000000000000000000000000000000000000..01f7fadd4c95ab3e3682728717ee1a69b97b1873 --- /dev/null +++ b/knowledge_base/nlp/lstm_seq2seq.py @@ -0,0 +1,274 @@ +""" +Title: Character-level recurrent sequence-to-sequence model +Author: [fchollet](https://twitter.com/fchollet) +Date created: 2017/09/29 +Last modified: 2023/11/22 +Description: Character-level recurrent sequence-to-sequence model. +Accelerator: GPU +""" + +""" +## Introduction + +This example demonstrates how to implement a basic character-level +recurrent sequence-to-sequence model. We apply it to translating +short English sentences into short French sentences, +character-by-character. 
Note that it is fairly unusual to +do character-level machine translation, as word-level +models are more common in this domain. + +**Summary of the algorithm** + +- We start with input sequences from a domain (e.g. English sentences) + and corresponding target sequences from another domain + (e.g. French sentences). +- An encoder LSTM turns input sequences to 2 state vectors + (we keep the last LSTM state and discard the outputs). +- A decoder LSTM is trained to turn the target sequences into + the same sequence but offset by one timestep in the future, + a training process called "teacher forcing" in this context. + It uses as initial state the state vectors from the encoder. + Effectively, the decoder learns to generate `targets[t+1...]` + given `targets[...t]`, conditioned on the input sequence. +- In inference mode, when we want to decode unknown input sequences, we: + - Encode the input sequence into state vectors + - Start with a target sequence of size 1 + (just the start-of-sequence character) + - Feed the state vectors and 1-char target sequence + to the decoder to produce predictions for the next character + - Sample the next character using these predictions + (we simply use argmax). + - Append the sampled character to the target sequence + - Repeat until we generate the end-of-sequence character or we + hit the character limit. +""" + +""" +## Setup +""" + +import numpy as np +import keras +import os +from pathlib import Path + +""" +## Download the data +""" + +fpath = keras.utils.get_file(origin="http://www.manythings.org/anki/fra-eng.zip") +dirpath = Path(fpath).parent.absolute() +os.system(f"unzip -q {fpath} -d {dirpath}") + +""" +## Configuration +""" + +batch_size = 64 # Batch size for training. +epochs = 100 # Number of epochs to train for. +latent_dim = 256 # Latent dimensionality of the encoding space. +num_samples = 10000 # Number of samples to train on. +# Path to the data txt file on disk. +data_path = os.path.join(dirpath, "fra.txt") + +""" +## Prepare the data +""" + +# Vectorize the data. +input_texts = [] +target_texts = [] +input_characters = set() +target_characters = set() +with open(data_path, "r", encoding="utf-8") as f: + lines = f.read().split("\n") +for line in lines[: min(num_samples, len(lines) - 1)]: + input_text, target_text, _ = line.split("\t") + # We use "tab" as the "start sequence" character + # for the targets, and "\n" as "end sequence" character. 
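+    # For example, a French target like "Va !" is stored as "\tVa !\n".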
+ target_text = "\t" + target_text + "\n" + input_texts.append(input_text) + target_texts.append(target_text) + for char in input_text: + if char not in input_characters: + input_characters.add(char) + for char in target_text: + if char not in target_characters: + target_characters.add(char) + +input_characters = sorted(list(input_characters)) +target_characters = sorted(list(target_characters)) +num_encoder_tokens = len(input_characters) +num_decoder_tokens = len(target_characters) +max_encoder_seq_length = max([len(txt) for txt in input_texts]) +max_decoder_seq_length = max([len(txt) for txt in target_texts]) + +print("Number of samples:", len(input_texts)) +print("Number of unique input tokens:", num_encoder_tokens) +print("Number of unique output tokens:", num_decoder_tokens) +print("Max sequence length for inputs:", max_encoder_seq_length) +print("Max sequence length for outputs:", max_decoder_seq_length) + +input_token_index = dict([(char, i) for i, char in enumerate(input_characters)]) +target_token_index = dict([(char, i) for i, char in enumerate(target_characters)]) + +encoder_input_data = np.zeros( + (len(input_texts), max_encoder_seq_length, num_encoder_tokens), + dtype="float32", +) +decoder_input_data = np.zeros( + (len(input_texts), max_decoder_seq_length, num_decoder_tokens), + dtype="float32", +) +decoder_target_data = np.zeros( + (len(input_texts), max_decoder_seq_length, num_decoder_tokens), + dtype="float32", +) + +for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)): + for t, char in enumerate(input_text): + encoder_input_data[i, t, input_token_index[char]] = 1.0 + encoder_input_data[i, t + 1 :, input_token_index[" "]] = 1.0 + for t, char in enumerate(target_text): + # decoder_target_data is ahead of decoder_input_data by one timestep + decoder_input_data[i, t, target_token_index[char]] = 1.0 + if t > 0: + # decoder_target_data will be ahead by one timestep + # and will not include the start character. + decoder_target_data[i, t - 1, target_token_index[char]] = 1.0 + decoder_input_data[i, t + 1 :, target_token_index[" "]] = 1.0 + decoder_target_data[i, t:, target_token_index[" "]] = 1.0 + +""" +## Build the model +""" + +# Define an input sequence and process it. +encoder_inputs = keras.Input(shape=(None, num_encoder_tokens)) +encoder = keras.layers.LSTM(latent_dim, return_state=True) +encoder_outputs, state_h, state_c = encoder(encoder_inputs) + +# We discard `encoder_outputs` and only keep the states. +encoder_states = [state_h, state_c] + +# Set up the decoder, using `encoder_states` as initial state. +decoder_inputs = keras.Input(shape=(None, num_decoder_tokens)) + +# We set up our decoder to return full output sequences, +# and to return internal states as well. We don't use the +# return states in the training model, but we will use them in inference. 
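+# `return_sequences=True` gives one decoder output per target timestep (needed by the
+# softmax layer below), and `return_state=True` exposes the final (h, c) pair that the
+# inference decoder reuses later.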
+decoder_lstm = keras.layers.LSTM(latent_dim, return_sequences=True, return_state=True) +decoder_outputs, _, _ = decoder_lstm(decoder_inputs, initial_state=encoder_states) +decoder_dense = keras.layers.Dense(num_decoder_tokens, activation="softmax") +decoder_outputs = decoder_dense(decoder_outputs) + +# Define the model that will turn +# `encoder_input_data` & `decoder_input_data` into `decoder_target_data` +model = keras.Model([encoder_inputs, decoder_inputs], decoder_outputs) + +""" +## Train the model +""" + +model.compile( + optimizer="rmsprop", loss="categorical_crossentropy", metrics=["accuracy"] +) +model.fit( + [encoder_input_data, decoder_input_data], + decoder_target_data, + batch_size=batch_size, + epochs=epochs, + validation_split=0.2, +) +# Save model +model.save("s2s_model.keras") + +""" +## Run inference (sampling) + +1. encode input and retrieve initial decoder state +2. run one step of decoder with this initial state +and a "start of sequence" token as target. +Output will be the next target token. +3. Repeat with the current target token and current states +""" + +# Define sampling models +# Restore the model and construct the encoder and decoder. +model = keras.models.load_model("s2s_model.keras") + +encoder_inputs = model.input[0] # input_1 +encoder_outputs, state_h_enc, state_c_enc = model.layers[2].output # lstm_1 +encoder_states = [state_h_enc, state_c_enc] +encoder_model = keras.Model(encoder_inputs, encoder_states) + +decoder_inputs = model.input[1] # input_2 +decoder_state_input_h = keras.Input(shape=(latent_dim,)) +decoder_state_input_c = keras.Input(shape=(latent_dim,)) +decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c] +decoder_lstm = model.layers[3] +decoder_outputs, state_h_dec, state_c_dec = decoder_lstm( + decoder_inputs, initial_state=decoder_states_inputs +) +decoder_states = [state_h_dec, state_c_dec] +decoder_dense = model.layers[4] +decoder_outputs = decoder_dense(decoder_outputs) +decoder_model = keras.Model( + [decoder_inputs] + decoder_states_inputs, [decoder_outputs] + decoder_states +) + +# Reverse-lookup token index to decode sequences back to +# something readable. +reverse_input_char_index = dict((i, char) for char, i in input_token_index.items()) +reverse_target_char_index = dict((i, char) for char, i in target_token_index.items()) + + +def decode_sequence(input_seq): + # Encode the input as state vectors. + states_value = encoder_model.predict(input_seq, verbose=0) + + # Generate empty target sequence of length 1. + target_seq = np.zeros((1, 1, num_decoder_tokens)) + # Populate the first character of target sequence with the start character. + target_seq[0, 0, target_token_index["\t"]] = 1.0 + + # Sampling loop for a batch of sequences + # (to simplify, here we assume a batch of size 1). + stop_condition = False + decoded_sentence = "" + while not stop_condition: + output_tokens, h, c = decoder_model.predict( + [target_seq] + states_value, verbose=0 + ) + + # Sample a token + sampled_token_index = np.argmax(output_tokens[0, -1, :]) + sampled_char = reverse_target_char_index[sampled_token_index] + decoded_sentence += sampled_char + + # Exit condition: either hit max length + # or find stop character. + if sampled_char == "\n" or len(decoded_sentence) > max_decoder_seq_length: + stop_condition = True + + # Update the target sequence (of length 1). 
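+        # Only the newly sampled character is fed back; the LSTM states carry the history.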
+ target_seq = np.zeros((1, 1, num_decoder_tokens)) + target_seq[0, 0, sampled_token_index] = 1.0 + + # Update states + states_value = [h, c] + return decoded_sentence + + +""" +You can now generate decoded sentences as such: +""" + +for seq_index in range(20): + # Take one sequence (part of the training set) + # for trying out decoding. + input_seq = encoder_input_data[seq_index : seq_index + 1] + decoded_sentence = decode_sequence(input_seq) + print("-") + print("Input sentence:", input_texts[seq_index]) + print("Decoded sentence:", decoded_sentence) diff --git a/knowledge_base/nlp/masked_language_modeling.py b/knowledge_base/nlp/masked_language_modeling.py new file mode 100644 index 0000000000000000000000000000000000000000..01262642690a542b469082c7d732df70cf700473 --- /dev/null +++ b/knowledge_base/nlp/masked_language_modeling.py @@ -0,0 +1,505 @@ +""" +Title: End-to-end Masked Language Modeling with BERT +Author: [Ankur Singh](https://twitter.com/ankur310794) +Date created: 2020/09/18 +Last modified: 2024/03/15 +Description: Implement a Masked Language Model (MLM) with BERT and fine-tune it on the IMDB Reviews dataset. +Accelerator: GPU +Converted to Keras 3 by: [Sitam Meur](https://github.com/sitamgithub-MSIT) and made backend-agnostic by: [Humbulani Ndou](https://github.com/Humbulani1234) +""" + +""" +## Introduction + +Masked Language Modeling is a fill-in-the-blank task, +where a model uses the context words surrounding a mask token to try to predict what the +masked word should be. + +For an input that contains one or more mask tokens, +the model will generate the most likely substitution for each. + +Example: + +- Input: "I have watched this [MASK] and it was awesome." +- Output: "I have watched this movie and it was awesome." + +Masked language modeling is a great way to train a language +model in a self-supervised setting (without human-annotated labels). +Such a model can then be fine-tuned to accomplish various supervised +NLP tasks. + +This example teaches you how to build a BERT model from scratch, +train it with the masked language modeling task, +and then fine-tune this model on a sentiment classification task. + +We will use the Keras `TextVectorization` and `MultiHeadAttention` layers +to create a BERT Transformer-Encoder network architecture. + +Note: This example should be run with `tf-nightly`. +""" + +""" +## Setup + +Install `tf-nightly` via `pip install tf-nightly`. +""" + +import os + +os.environ["KERAS_BACKEND"] = "torch" # or jax, or tensorflow + +import keras_hub + +import keras +from keras import layers +from keras.layers import TextVectorization + +from dataclasses import dataclass +import pandas as pd +import numpy as np +import glob +import re +from pprint import pprint + +""" +## Set-up Configuration +""" + + +@dataclass +class Config: + MAX_LEN = 256 + BATCH_SIZE = 32 + LR = 0.001 + VOCAB_SIZE = 30000 + EMBED_DIM = 128 + NUM_HEAD = 8 # used in bert model + FF_DIM = 128 # used in bert model + NUM_LAYERS = 1 + + +config = Config() + +""" +## Load the data + +We will first download the IMDB data and load into a Pandas dataframe. 
+""" + +"""shell +curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz +tar -xf aclImdb_v1.tar.gz +""" + + +def get_text_list_from_files(files): + text_list = [] + for name in files: + with open(name) as f: + for line in f: + text_list.append(line) + return text_list + + +def get_data_from_text_files(folder_name): + pos_files = glob.glob("aclImdb/" + folder_name + "/pos/*.txt") + pos_texts = get_text_list_from_files(pos_files) + neg_files = glob.glob("aclImdb/" + folder_name + "/neg/*.txt") + neg_texts = get_text_list_from_files(neg_files) + df = pd.DataFrame( + { + "review": pos_texts + neg_texts, + "sentiment": [0] * len(pos_texts) + [1] * len(neg_texts), + } + ) + df = df.sample(len(df)).reset_index(drop=True) + return df + + +train_df = get_data_from_text_files("train") +test_df = get_data_from_text_files("test") + +all_data = pd.concat([train_df, test_df], ignore_index=True) + +""" +## Dataset preparation + +We will use the `TextVectorization` layer to vectorize the text into integer token ids. +It transforms a batch of strings into either +a sequence of token indices (one sample = 1D array of integer token indices, in order) +or a dense representation (one sample = 1D array of float values encoding an unordered set of tokens). + +Below, we define 3 preprocessing functions. + +1. The `get_vectorize_layer` function builds the `TextVectorization` layer. +2. The `encode` function encodes raw text into integer token ids. +3. The `get_masked_input_and_labels` function will mask input token ids. +It masks 15% of all input tokens in each sequence at random. +""" + +# For data pre-processing and tf.data.Dataset +import tensorflow as tf + + +def custom_standardization(input_data): + lowercase = tf.strings.lower(input_data) + stripped_html = tf.strings.regex_replace(lowercase, "
", " ") + return tf.strings.regex_replace( + stripped_html, "[%s]" % re.escape("!#$%&'()*+,-./:;<=>?@\^_`{|}~"), "" + ) + + +def get_vectorize_layer(texts, vocab_size, max_seq, special_tokens=["[MASK]"]): + """Build Text vectorization layer + + Args: + texts (list): List of string i.e input texts + vocab_size (int): vocab size + max_seq (int): Maximum sequence length. + special_tokens (list, optional): List of special tokens. Defaults to ['[MASK]']. + + Returns: + layers.Layer: Return TextVectorization Keras Layer + """ + vectorize_layer = TextVectorization( + max_tokens=vocab_size, + output_mode="int", + standardize=custom_standardization, + output_sequence_length=max_seq, + ) + vectorize_layer.adapt(texts) + + # Insert mask token in vocabulary + vocab = vectorize_layer.get_vocabulary() + vocab = vocab[2 : vocab_size - len(special_tokens)] + ["[mask]"] + vectorize_layer.set_vocabulary(vocab) + return vectorize_layer + + +vectorize_layer = get_vectorize_layer( + all_data.review.values.tolist(), + config.VOCAB_SIZE, + config.MAX_LEN, + special_tokens=["[mask]"], +) + +# Get mask token id for masked language model +mask_token_id = vectorize_layer(["[mask]"]).numpy()[0][0] + + +def encode(texts): + encoded_texts = vectorize_layer(texts) + return encoded_texts.numpy() + + +def get_masked_input_and_labels(encoded_texts): + # 15% BERT masking + inp_mask = np.random.rand(*encoded_texts.shape) < 0.15 + # Do not mask special tokens + inp_mask[encoded_texts <= 2] = False + # Set targets to -1 by default, it means ignore + labels = -1 * np.ones(encoded_texts.shape, dtype=int) + # Set labels for masked tokens + labels[inp_mask] = encoded_texts[inp_mask] + + # Prepare input + encoded_texts_masked = np.copy(encoded_texts) + # Set input to [MASK] which is the last token for the 90% of tokens + # This means leaving 10% unchanged + inp_mask_2mask = inp_mask & (np.random.rand(*encoded_texts.shape) < 0.90) + encoded_texts_masked[inp_mask_2mask] = ( + mask_token_id # mask token is the last in the dict + ) + + # Set 10% to a random token + inp_mask_2random = inp_mask_2mask & (np.random.rand(*encoded_texts.shape) < 1 / 9) + encoded_texts_masked[inp_mask_2random] = np.random.randint( + 3, mask_token_id, inp_mask_2random.sum() + ) + + # Prepare sample_weights to pass to .fit() method + sample_weights = np.ones(labels.shape) + sample_weights[labels == -1] = 0 + + # y_labels would be same as encoded_texts i.e input tokens + y_labels = np.copy(encoded_texts) + + return encoded_texts_masked, y_labels, sample_weights + + +# We have 25000 examples for training +x_train = encode(train_df.review.values) # encode reviews with vectorizer +y_train = train_df.sentiment.values +train_classifier_ds = ( + tf.data.Dataset.from_tensor_slices((x_train, y_train)) + .shuffle(1000) + .batch(config.BATCH_SIZE) +) + +# We have 25000 examples for testing +x_test = encode(test_df.review.values) +y_test = test_df.sentiment.values +test_classifier_ds = tf.data.Dataset.from_tensor_slices((x_test, y_test)).batch( + config.BATCH_SIZE +) + +# Dataset for end to end model input (will be used at the end) +test_raw_classifier_ds = test_df + +# Prepare data for masked language model +x_all_review = encode(all_data.review.values) +x_masked_train, y_masked_labels, sample_weights = get_masked_input_and_labels( + x_all_review +) + +mlm_ds = tf.data.Dataset.from_tensor_slices( + (x_masked_train, y_masked_labels, sample_weights) +) +mlm_ds = mlm_ds.shuffle(1000).batch(config.BATCH_SIZE) + +""" +## Create BERT model (Pretraining Model) for masked 
language modeling + +We will create a BERT-like pretraining model architecture +using the `MultiHeadAttention` layer. +It will take token ids as inputs (including masked tokens) +and it will predict the correct ids for the masked input tokens. +""" + + +def bert_module(query, key, value, i): + # Multi headed self-attention + attention_output = layers.MultiHeadAttention( + num_heads=config.NUM_HEAD, + key_dim=config.EMBED_DIM // config.NUM_HEAD, + name="encoder_{}_multiheadattention".format(i), + )(query, key, value) + attention_output = layers.Dropout(0.1, name="encoder_{}_att_dropout".format(i))( + attention_output + ) + attention_output = layers.LayerNormalization( + epsilon=1e-6, name="encoder_{}_att_layernormalization".format(i) + )(query + attention_output) + + # Feed-forward layer + ffn = keras.Sequential( + [ + layers.Dense(config.FF_DIM, activation="relu"), + layers.Dense(config.EMBED_DIM), + ], + name="encoder_{}_ffn".format(i), + ) + ffn_output = ffn(attention_output) + ffn_output = layers.Dropout(0.1, name="encoder_{}_ffn_dropout".format(i))( + ffn_output + ) + sequence_output = layers.LayerNormalization( + epsilon=1e-6, name="encoder_{}_ffn_layernormalization".format(i) + )(attention_output + ffn_output) + return sequence_output + + +loss_fn = keras.losses.SparseCategoricalCrossentropy(reduction=None) +loss_tracker = keras.metrics.Mean(name="loss") + + +class MaskedLanguageModel(keras.Model): + + def compute_loss(self, x=None, y=None, y_pred=None, sample_weight=None): + + loss = loss_fn(y, y_pred, sample_weight) + loss_tracker.update_state(loss, sample_weight=sample_weight) + return keras.ops.sum(loss) + + def compute_metrics(self, x, y, y_pred, sample_weight): + + # Return a dict mapping metric names to current value + return {"loss": loss_tracker.result()} + + @property + def metrics(self): + # We list our `Metric` objects here so that `reset_states()` can be + # called automatically at the start of each epoch + # or at the start of `evaluate()`. + # If you don't implement this property, you have to call + # `reset_states()` yourself at the time of your choosing. 
+ return [loss_tracker] + + +def create_masked_language_bert_model(): + inputs = layers.Input((config.MAX_LEN,), dtype="int64") + + word_embeddings = layers.Embedding( + config.VOCAB_SIZE, config.EMBED_DIM, name="word_embedding" + )(inputs) + position_embeddings = keras_hub.layers.PositionEmbedding( + sequence_length=config.MAX_LEN + )(word_embeddings) + embeddings = word_embeddings + position_embeddings + + encoder_output = embeddings + for i in range(config.NUM_LAYERS): + encoder_output = bert_module(encoder_output, encoder_output, encoder_output, i) + + mlm_output = layers.Dense(config.VOCAB_SIZE, name="mlm_cls", activation="softmax")( + encoder_output + ) + mlm_model = MaskedLanguageModel(inputs, mlm_output, name="masked_bert_model") + + optimizer = keras.optimizers.Adam(learning_rate=config.LR) + mlm_model.compile(optimizer=optimizer) + return mlm_model + + +id2token = dict(enumerate(vectorize_layer.get_vocabulary())) +token2id = {y: x for x, y in id2token.items()} + + +class MaskedTextGenerator(keras.callbacks.Callback): + def __init__(self, sample_tokens, top_k=5): + self.sample_tokens = sample_tokens + self.k = top_k + + def decode(self, tokens): + return " ".join([id2token[t] for t in tokens if t != 0]) + + def convert_ids_to_tokens(self, id): + return id2token[id] + + def on_epoch_end(self, epoch, logs=None): + prediction = self.model.predict(self.sample_tokens) + + masked_index = np.where(self.sample_tokens == mask_token_id) + masked_index = masked_index[1] + mask_prediction = prediction[0][masked_index] + + top_indices = mask_prediction[0].argsort()[-self.k :][::-1] + values = mask_prediction[0][top_indices] + + for i in range(len(top_indices)): + p = top_indices[i] + v = values[i] + tokens = np.copy(sample_tokens[0]) + tokens[masked_index[0]] = p + result = { + "input_text": self.decode(sample_tokens[0].numpy()), + "prediction": self.decode(tokens), + "probability": v, + "predicted mask token": self.convert_ids_to_tokens(p), + } + pprint(result) + + +sample_tokens = vectorize_layer(["I have watched this [mask] and it was awesome"]) +generator_callback = MaskedTextGenerator(sample_tokens.numpy()) + +bert_masked_model = create_masked_language_bert_model() +bert_masked_model.summary() + +""" +## Train and Save +""" + +bert_masked_model.fit(mlm_ds, epochs=5, callbacks=[generator_callback]) +bert_masked_model.save("bert_mlm_imdb.keras") + +""" +## Fine-tune a sentiment classification model + +We will fine-tune our self-supervised model on a downstream task of sentiment classification. +To do this, let's create a classifier by adding a pooling layer and a `Dense` layer on top of the +pretrained BERT features. 
+ +""" + +# Load pretrained bert model +mlm_model = keras.models.load_model( + "bert_mlm_imdb.keras", custom_objects={"MaskedLanguageModel": MaskedLanguageModel} +) +pretrained_bert_model = keras.Model( + mlm_model.input, mlm_model.get_layer("encoder_0_ffn_layernormalization").output +) + +# Freeze it +pretrained_bert_model.trainable = False + + +def create_classifier_bert_model(): + inputs = layers.Input((config.MAX_LEN,), dtype="int64") + sequence_output = pretrained_bert_model(inputs) + pooled_output = layers.GlobalMaxPooling1D()(sequence_output) + hidden_layer = layers.Dense(64, activation="relu")(pooled_output) + outputs = layers.Dense(1, activation="sigmoid")(hidden_layer) + classifer_model = keras.Model(inputs, outputs, name="classification") + optimizer = keras.optimizers.Adam() + classifer_model.compile( + optimizer=optimizer, loss="binary_crossentropy", metrics=["accuracy"] + ) + return classifer_model + + +classifer_model = create_classifier_bert_model() +classifer_model.summary() + +# Train the classifier with frozen BERT stage +classifer_model.fit( + train_classifier_ds, + epochs=5, + validation_data=test_classifier_ds, +) + +# Unfreeze the BERT model for fine-tuning +pretrained_bert_model.trainable = True +optimizer = keras.optimizers.Adam() +classifer_model.compile( + optimizer=optimizer, loss="binary_crossentropy", metrics=["accuracy"] +) +classifer_model.fit( + train_classifier_ds, + epochs=5, + validation_data=test_classifier_ds, +) + +""" +## Create an end-to-end model and evaluate it + +When you want to deploy a model, it's best if it already includes its preprocessing +pipeline, so that you don't have to reimplement the preprocessing logic in your +production environment. Let's create an end-to-end model that incorporates +the `TextVectorization` layer inside evaluate method, and let's evaluate. We will pass raw strings as input. +""" + + +# We create a custom Model to override the evaluate method so +# that it first pre-process text data +class ModelEndtoEnd(keras.Model): + + def evaluate(self, inputs): + features = encode(inputs.review.values) + labels = inputs.sentiment.values + test_classifier_ds = ( + tf.data.Dataset.from_tensor_slices((features, labels)) + .shuffle(1000) + .batch(config.BATCH_SIZE) + ) + return super().evaluate(test_classifier_ds) + + # Build the model + def build(self, input_shape): + self.built = True + + +def get_end_to_end(model): + inputs = classifer_model.inputs[0] + outputs = classifer_model.outputs + end_to_end_model = ModelEndtoEnd(inputs, outputs, name="end_to_end_model") + optimizer = keras.optimizers.Adam(learning_rate=config.LR) + end_to_end_model.compile( + optimizer=optimizer, loss="binary_crossentropy", metrics=["accuracy"] + ) + return end_to_end_model + + +end_to_end_classification_model = get_end_to_end(classifer_model) +# Pass raw text dataframe to the model +end_to_end_classification_model.evaluate(test_raw_classifier_ds) diff --git a/knowledge_base/nlp/multi_label_classification.py b/knowledge_base/nlp/multi_label_classification.py new file mode 100644 index 0000000000000000000000000000000000000000..60a9936f254aa0fb6a0e4dba63dd178daff6e272 --- /dev/null +++ b/knowledge_base/nlp/multi_label_classification.py @@ -0,0 +1,451 @@ +""" +Title: Large-scale multi-label text classification +Author: [Sayak Paul](https://twitter.com/RisingSayak), [Soumik Rakshit](https://github.com/soumik12345) +Date created: 2020/09/25 +Last modified: 2025/02/27 +Description: Implementing a large-scale multi-label text classification model. 
+Accelerator: GPU +Converted to keras 3 and made backend-agnostic by: [Humbulani Ndou](https://github.com/Humbulani1234) +""" + +""" +## Introduction + +In this example, we will build a multi-label text classifier to predict the subject areas +of arXiv papers from their abstract bodies. This type of classifier can be useful for +conference submission portals like [OpenReview](https://openreview.net/). Given a paper +abstract, the portal could provide suggestions for which areas the paper would +best belong to. + +The dataset was collected using the +[`arXiv` Python library](https://github.com/lukasschwab/arxiv.py) +that provides a wrapper around the +[original arXiv API](http://arxiv.org/help/api/index). +To learn more about the data collection process, please refer to +[this notebook](https://github.com/soumik12345/multi-label-text-classification/blob/master/arxiv_scrape.ipynb). +Additionally, you can also find the dataset on +[Kaggle](https://www.kaggle.com/spsayakpaul/arxiv-paper-abstracts). +""" + +""" +## Imports +""" + +import os + +os.environ["KERAS_BACKEND"] = "jax" # or tensorflow, or torch + +import keras +from keras import layers, ops + +from sklearn.model_selection import train_test_split + +from ast import literal_eval +import matplotlib.pyplot as plt +import pandas as pd +import numpy as np + +""" +## Perform exploratory data analysis + +In this section, we first load the dataset into a `pandas` dataframe and then perform +some basic exploratory data analysis (EDA). +""" + +arxiv_data = pd.read_csv( + "https://github.com/soumik12345/multi-label-text-classification/releases/download/v0.2/arxiv_data.csv" +) +arxiv_data.head() + +""" +Our text features are present in the `summaries` column and their corresponding labels +are in `terms`. As you can notice, there are multiple categories associated with a +particular entry. +""" + +print(f"There are {len(arxiv_data)} rows in the dataset.") + +""" +Real-world data is noisy. One of the most commonly observed source of noise is data +duplication. Here we notice that our initial dataset has got about 13k duplicate entries. +""" + +total_duplicate_titles = sum(arxiv_data["titles"].duplicated()) +print(f"There are {total_duplicate_titles} duplicate titles.") + +""" +Before proceeding further, we drop these entries. +""" + +arxiv_data = arxiv_data[~arxiv_data["titles"].duplicated()] +print(f"There are {len(arxiv_data)} rows in the deduplicated dataset.") + +# There are some terms with occurrence as low as 1. +print(sum(arxiv_data["terms"].value_counts() == 1)) + +# How many unique terms? +print(arxiv_data["terms"].nunique()) + +""" +As observed above, out of 3,157 unique combinations of `terms`, 2,321 entries have the +lowest occurrence. To prepare our train, validation, and test sets with +[stratification](https://en.wikipedia.org/wiki/Stratified_sampling), we need to drop +these terms. +""" + +# Filtering the rare terms. +arxiv_data_filtered = arxiv_data.groupby("terms").filter(lambda x: len(x) > 1) +arxiv_data_filtered.shape + +""" +## Convert the string labels to lists of strings + +The initial labels are represented as raw strings. Here we make them `List[str]` for a +more compact representation. +""" + +arxiv_data_filtered["terms"] = arxiv_data_filtered["terms"].apply( + lambda x: literal_eval(x) +) +arxiv_data_filtered["terms"].values[:5] + +""" +## Use stratified splits because of class imbalance + +The dataset has a +[class imbalance problem](https://developers.google.com/machine-learning/glossary/#class-imbalanced-dataset). 
+So, to have a fair evaluation result, we need to ensure the datasets are sampled with +stratification. To know more about different strategies to deal with the class imbalance +problem, you can follow +[this tutorial](https://www.tensorflow.org/tutorials/structured_data/imbalanced_data). +For an end-to-end demonstration of classification with imbablanced data, refer to +[Imbalanced classification: credit card fraud detection](https://keras.io/examples/structured_data/imbalanced_classification/). +""" + +test_split = 0.1 + +# Initial train and test split. +train_df, test_df = train_test_split( + arxiv_data_filtered, + test_size=test_split, + stratify=arxiv_data_filtered["terms"].values, +) + +# Splitting the test set further into validation +# and new test sets. +val_df = test_df.sample(frac=0.5) +test_df.drop(val_df.index, inplace=True) + +print(f"Number of rows in training set: {len(train_df)}") +print(f"Number of rows in validation set: {len(val_df)}") +print(f"Number of rows in test set: {len(test_df)}") + +""" +## Multi-label binarization + +Now we preprocess our labels using the +[`StringLookup`](https://keras.io/api/layers/preprocessing_layers/categorical/string_lookup) +layer. +""" + +# For RaggedTensor +import tensorflow as tf + +terms = tf.ragged.constant(train_df["terms"].values) +lookup = layers.StringLookup(output_mode="multi_hot") +lookup.adapt(terms) +vocab = lookup.get_vocabulary() + + +def invert_multi_hot(encoded_labels): + """Reverse a single multi-hot encoded label to a tuple of vocab terms.""" + hot_indices = np.argwhere(encoded_labels == 1.0)[..., 0] + return np.take(vocab, hot_indices) + + +print("Vocabulary:\n") +print(vocab) + + +""" +Here we are separating the individual unique classes available from the label +pool and then using this information to represent a given label set with 0's and 1's. +Below is an example. +""" + +sample_label = train_df["terms"].iloc[0] +print(f"Original label: {sample_label}") + +label_binarized = lookup([sample_label]) +print(f"Label-binarized representation: {label_binarized}") + +""" +## Data preprocessing and `tf.data.Dataset` objects + +We first get percentile estimates of the sequence lengths. The purpose will be clear in a +moment. +""" + +train_df["summaries"].apply(lambda x: len(x.split(" "))).describe() + +""" +Notice that 50% of the abstracts have a length of 154 (you may get a different number +based on the split). So, any number close to that value is a good enough approximate for the +maximum sequence length. + +Now, we implement utilities to prepare our datasets. +""" + +max_seqlen = 150 +batch_size = 128 +padding_token = "" +auto = tf.data.AUTOTUNE + + +def make_dataset(dataframe, is_train=True): + labels = tf.ragged.constant(dataframe["terms"].values) + label_binarized = lookup(labels).numpy() + dataset = tf.data.Dataset.from_tensor_slices( + (dataframe["summaries"].values, label_binarized) + ) + dataset = dataset.shuffle(batch_size * 10) if is_train else dataset + return dataset.batch(batch_size) + + +""" +Now we can prepare the `tf.data.Dataset` objects. +""" + +train_dataset = make_dataset(train_df, is_train=True) +validation_dataset = make_dataset(val_df, is_train=False) +test_dataset = make_dataset(test_df, is_train=False) + +""" +## Dataset preview +""" + +text_batch, label_batch = next(iter(train_dataset)) + +for i, text in enumerate(text_batch[:5]): + label = label_batch[i].numpy()[None, ...] 
+ print(f"Abstract: {text}") + print(f"Label(s): {invert_multi_hot(label[0])}") + print(" ") + +""" +## Vectorization + +Before we feed the data to our model, we need to vectorize it (represent it in a numerical form). +For that purpose, we will use the +[`TextVectorization` layer](https://keras.io/api/layers/preprocessing_layers/text/text_vectorization). +It can operate as a part of your main model so that the model is excluded from the core +preprocessing logic. This greatly reduces the chances of training / serving skew during inference. + +We first calculate the number of unique words present in the abstracts. +""" + +# Source: https://stackoverflow.com/a/18937309/7636462 +vocabulary = set() +train_df["summaries"].str.lower().str.split().apply(vocabulary.update) +vocabulary_size = len(vocabulary) +print(vocabulary_size) + + +""" +We now create our vectorization layer and `map()` to the `tf.data.Dataset`s created +earlier. +""" + +text_vectorizer = layers.TextVectorization( + max_tokens=vocabulary_size, ngrams=2, output_mode="tf_idf" +) + +# `TextVectorization` layer needs to be adapted as per the vocabulary from our +# training set. +with tf.device("/CPU:0"): + text_vectorizer.adapt(train_dataset.map(lambda text, label: text)) + +train_dataset = train_dataset.map( + lambda text, label: (text_vectorizer(text), label), num_parallel_calls=auto +).prefetch(auto) +validation_dataset = validation_dataset.map( + lambda text, label: (text_vectorizer(text), label), num_parallel_calls=auto +).prefetch(auto) +test_dataset = test_dataset.map( + lambda text, label: (text_vectorizer(text), label), num_parallel_calls=auto +).prefetch(auto) + + +""" +A batch of raw text will first go through the `TextVectorization` layer and it will +generate their integer representations. Internally, the `TextVectorization` layer will +first create bi-grams out of the sequences and then represent them using +[TF-IDF](https://wikipedia.org/wiki/Tf%E2%80%93idf). The output representations will then +be passed to the shallow model responsible for text classification. + +To learn more about other possible configurations with `TextVectorizer`, please consult +the +[official documentation](https://keras.io/api/layers/preprocessing_layers/text/text_vectorization). + +**Note**: Setting the `max_tokens` argument to a pre-calculated vocabulary size is +not a requirement. +""" + +""" +## Create a text classification model + +We will keep our model simple -- it will be a small stack of fully-connected layers with +ReLU as the non-linearity. + +""" + + +def make_model(): + shallow_mlp_model = keras.Sequential( + [ + layers.Dense(512, activation="relu"), + layers.Dense(256, activation="relu"), + layers.Dense(lookup.vocabulary_size(), activation="sigmoid"), + ] # More on why "sigmoid" has been used here in a moment. + ) + return shallow_mlp_model + + +""" +## Train the model + +We will train our model using the binary crossentropy loss. This is because the labels +are not disjoint. For a given abstract, we may have multiple categories. So, we will +divide the prediction task into a series of multiple binary classification problems. This +is also why we kept the activation function of the classification layer in our model to +sigmoid. Researchers have used other combinations of loss function and activation +function as well. For example, in [Exploring the Limits of Weakly Supervised Pretraining](https://arxiv.org/abs/1805.00932), +Mahajan et al. used the softmax activation function and cross-entropy loss to train +their models. 
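+
+As a tiny illustration (not part of this example's code) of what this means in practice,
+binary crossentropy scores each class of a multi-hot label as its own yes/no problem and
+averages the per-class losses:
+
+```python
+import numpy as np
+
+y_true = np.array([1.0, 0.0, 1.0, 0.0])  # an abstract tagged with classes 0 and 2
+y_pred = np.array([0.9, 0.2, 0.7, 0.1])  # per-class sigmoid outputs
+bce = -np.mean(y_true * np.log(y_pred) + (1.0 - y_true) * np.log(1.0 - y_pred))
+print(round(float(bce), 4))  # the average of four independent binary losses
+```
+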
+ +There are several options of metrics that can be used in multi-label classification. +To keep this code example narrow we decided to use the +[binary accuracy metric](https://keras.io/api/metrics/accuracy_metrics/#binaryaccuracy-class). +To see the explanation why this metric is used we refer to this +[pull-request](https://github.com/keras-team/keras-io/pull/1133#issuecomment-1322736860). +There are also other suitable metrics for multi-label classification, like +[F1 Score](https://www.tensorflow.org/addons/api_docs/python/tfa/metrics/F1Score) or +[Hamming loss](https://www.tensorflow.org/addons/api_docs/python/tfa/metrics/HammingLoss). +""" + +epochs = 20 + +shallow_mlp_model = make_model() +shallow_mlp_model.compile( + loss="binary_crossentropy", optimizer="adam", metrics=["binary_accuracy"] +) + +history = shallow_mlp_model.fit( + train_dataset, validation_data=validation_dataset, epochs=epochs +) + + +def plot_result(item): + plt.plot(history.history[item], label=item) + plt.plot(history.history["val_" + item], label="val_" + item) + plt.xlabel("Epochs") + plt.ylabel(item) + plt.title("Train and Validation {} Over Epochs".format(item), fontsize=14) + plt.legend() + plt.grid() + plt.show() + + +plot_result("loss") +plot_result("binary_accuracy") + +""" +While training, we notice an initial sharp fall in the loss followed by a gradual decay. +""" + +""" +### Evaluate the model +""" + +_, binary_acc = shallow_mlp_model.evaluate(test_dataset) +print(f"Categorical accuracy on the test set: {round(binary_acc * 100, 2)}%.") + +""" +The trained model gives us an evaluation accuracy of ~99%. +""" + +""" +## Inference + +An important feature of the +[preprocessing layers provided by Keras](https://keras.io/api/layers/preprocessing_layers/) +is that they can be included inside a `tf.keras.Model`. We will export an inference model +by including the `text_vectorization` layer on top of `shallow_mlp_model`. This will +allow our inference model to directly operate on raw strings. + +**Note** that during training it is always preferable to use these preprocessing +layers as a part of the data input pipeline rather than the model to avoid +surfacing bottlenecks for the hardware accelerators. This also allows for +asynchronous data processing. +""" + + +# We create a custom Model to override the predict method so +# that it first vectorizes text data +class ModelEndtoEnd(keras.Model): + + def predict(self, inputs): + indices = text_vectorizer(inputs) + return super().predict(indices) + + +def get_inference_model(model): + inputs = shallow_mlp_model.inputs + outputs = shallow_mlp_model.outputs + end_to_end_model = ModelEndtoEnd(inputs, outputs, name="end_to_end_model") + end_to_end_model.compile( + optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"] + ) + return end_to_end_model + + +model_for_inference = get_inference_model(shallow_mlp_model) + +# Create a small dataset just for demonstrating inference. +inference_dataset = make_dataset(test_df.sample(2), is_train=False) +text_batch, label_batch = next(iter(inference_dataset)) +predicted_probabilities = model_for_inference.predict(text_batch) + + +# Perform inference. +for i, text in enumerate(text_batch[:5]): + label = label_batch[i].numpy()[None, ...] 
+ print(f"Abstract: {text}") + print(f"Label(s): {invert_multi_hot(label[0])}") + predicted_proba = [proba for proba in predicted_probabilities[i]] + top_3_labels = [ + x + for _, x in sorted( + zip(predicted_probabilities[i], lookup.get_vocabulary()), + key=lambda pair: pair[0], + reverse=True, + ) + ][:3] + print(f"Predicted Label(s): ({', '.join([label for label in top_3_labels])})") + print(" ") + +""" +The prediction results are not that great but not below the par for a simple model like +ours. We can improve this performance with models that consider word order like LSTM or +even those that use Transformers ([Vaswani et al.](https://arxiv.org/abs/1706.03762)). +""" + +""" +## Acknowledgements + +We would like to thank [Matt Watson](https://github.com/mattdangerw) for helping us +tackle the multi-label binarization part and inverse-transforming the processed labels +to the original form. + +Thanks to [Cingis Kratochvil](https://github.com/cumbalik) for suggesting and extending this code example by introducing binary accuracy as the evaluation metric. +""" diff --git a/knowledge_base/nlp/multimodal_entailment.py b/knowledge_base/nlp/multimodal_entailment.py new file mode 100644 index 0000000000000000000000000000000000000000..dab6ca1063a01744ea9344ee6408f25970a4cd72 --- /dev/null +++ b/knowledge_base/nlp/multimodal_entailment.py @@ -0,0 +1,713 @@ +""" +Title: Multimodal entailment +Author: [Sayak Paul](https://twitter.com/RisingSayak) +Date created: 2021/08/08 +Last modified: 2025/01/03 +Description: Training a multimodal model for predicting entailment. +Accelerator: GPU +Converted to Keras 3 and made backend-agnostic by: [Humbulani Ndou](https://github.com/Humbulani1234) +""" + +""" +## Introduction + +In this example, we will build and train a model for predicting multimodal entailment. We will be +using the +[multimodal entailment dataset](https://github.com/google-research-datasets/recognizing-multimodal-entailment) +recently introduced by Google Research. + +### What is multimodal entailment? + +On social media platforms, to audit and moderate content +we may want to find answers to the +following questions in near real-time: + +* Does a given piece of information contradict the other? +* Does a given piece of information imply the other? + +In NLP, this task is called analyzing _textual entailment_. However, that's only +when the information comes from text content. +In practice, it's often the case the information available comes not just +from text content, but from a multimodal combination of text, images, audio, video, etc. +_Multimodal entailment_ is simply the extension of textual entailment to a variety +of new input modalities. + +### Requirements + +This example requires TensorFlow 2.5 or higher. In addition, TensorFlow Hub and +TensorFlow Text are required for the BERT model +([Devlin et al.](https://arxiv.org/abs/1810.04805)). 
These libraries can be installed +using the following command: +""" + +"""shell +pip install -q tensorflow_text +""" + +""" +## Imports +""" + +from sklearn.model_selection import train_test_split +import matplotlib.pyplot as plt +import pandas as pd +import numpy as np +import random +import math +from skimage.io import imread +from skimage.transform import resize +from PIL import Image +import os + +os.environ["KERAS_BACKEND"] = "jax" # or tensorflow, or torch + +import keras +import keras_hub +from keras.utils import PyDataset + +""" +## Define a label map +""" + +label_map = {"Contradictory": 0, "Implies": 1, "NoEntailment": 2} + +""" +## Collect the dataset + +The original dataset is available +[here](https://github.com/google-research-datasets/recognizing-multimodal-entailment). +It comes with URLs of images which are hosted on Twitter's photo storage system called +the +[Photo Blob Storage (PBS for short)](https://blog.twitter.com/engineering/en_us/a/2012/blobstore-twitter-s-in-house-photo-storage-system). +We will be working with the downloaded images along with additional data that comes with +the original dataset. Thanks to +[Nilabhra Roy Chowdhury](https://de.linkedin.com/in/nilabhraroychowdhury) who worked on +preparing the image data. +""" + +image_base_path = keras.utils.get_file( + "tweet_images", + "https://github.com/sayakpaul/Multimodal-Entailment-Baseline/releases/download/v1.0.0/tweet_images.tar.gz", + untar=True, +) + +""" +## Read the dataset and apply basic preprocessing +""" + +df = pd.read_csv( + "https://github.com/sayakpaul/Multimodal-Entailment-Baseline/raw/main/csvs/tweets.csv" +).iloc[ + 0:1000 +] # Resources conservation since these are examples and not SOTA +df.sample(10) + +""" +The columns we are interested in are the following: + +* `text_1` +* `image_1` +* `text_2` +* `image_2` +* `label` + +The entailment task is formulated as the following: + +***Given the pairs of (`text_1`, `image_1`) and (`text_2`, `image_2`) do they entail (or +not entail or contradict) each other?*** + +We have the images already downloaded. `image_1` is downloaded as `id1` as its filename +and `image2` is downloaded as `id2` as its filename. In the next step, we will add two +more columns to `df` - filepaths of `image_1`s and `image_2`s. +""" + +images_one_paths = [] +images_two_paths = [] + +for idx in range(len(df)): + current_row = df.iloc[idx] + id_1 = current_row["id_1"] + id_2 = current_row["id_2"] + extentsion_one = current_row["image_1"].split(".")[-1] + extentsion_two = current_row["image_2"].split(".")[-1] + + image_one_path = os.path.join(image_base_path, str(id_1) + f".{extentsion_one}") + image_two_path = os.path.join(image_base_path, str(id_2) + f".{extentsion_two}") + + images_one_paths.append(image_one_path) + images_two_paths.append(image_two_path) + +df["image_1_path"] = images_one_paths +df["image_2_path"] = images_two_paths + +# Create another column containing the integer ids of +# the string labels. 
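+# For example, "Implies" becomes 1, following the `label_map` defined earlier.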
+df["label_idx"] = df["label"].apply(lambda x: label_map[x]) + +""" +## Dataset visualization +""" + + +def visualize(idx): + current_row = df.iloc[idx] + image_1 = plt.imread(current_row["image_1_path"]) + image_2 = plt.imread(current_row["image_2_path"]) + text_1 = current_row["text_1"] + text_2 = current_row["text_2"] + label = current_row["label"] + + plt.subplot(1, 2, 1) + plt.imshow(image_1) + plt.axis("off") + plt.title("Image One") + plt.subplot(1, 2, 2) + plt.imshow(image_1) + plt.axis("off") + plt.title("Image Two") + plt.show() + + print(f"Text one: {text_1}") + print(f"Text two: {text_2}") + print(f"Label: {label}") + + +random_idx = random.choice(range(len(df))) +visualize(random_idx) + +random_idx = random.choice(range(len(df))) +visualize(random_idx) + +""" +## Train/test split + +The dataset suffers from +[class imbalance problem](https://developers.google.com/machine-learning/glossary#class-imbalanced-dataset). +We can confirm that in the following cell. +""" + +df["label"].value_counts() + +""" +To account for that we will go for a stratified split. +""" + +# 10% for test +train_df, test_df = train_test_split( + df, test_size=0.1, stratify=df["label"].values, random_state=42 +) +# 5% for validation +train_df, val_df = train_test_split( + train_df, test_size=0.05, stratify=train_df["label"].values, random_state=42 +) + +print(f"Total training examples: {len(train_df)}") +print(f"Total validation examples: {len(val_df)}") +print(f"Total test examples: {len(test_df)}") + +""" +## Data input pipeline + +Keras Hub provides +[variety of BERT family of models](https://keras.io/keras_hub/presets/). +Each of those models comes with a +corresponding preprocessing layer. You can learn more about these models and their +preprocessing layers from +[this resource](https://www.kaggle.com/models/keras/bert/keras/bert_base_en_uncased/2). + +To keep the runtime of this example relatively short, we will use a base_unacased variant of +the original BERT model. +""" + +""" +text preprocessing using KerasHub +""" + +text_preprocessor = keras_hub.models.BertTextClassifierPreprocessor.from_preset( + "bert_base_en_uncased", + sequence_length=128, +) + +""" +### Run the preprocessor on a sample input +""" + +idx = random.choice(range(len(train_df))) +row = train_df.iloc[idx] +sample_text_1, sample_text_2 = row["text_1"], row["text_2"] +print(f"Text 1: {sample_text_1}") +print(f"Text 2: {sample_text_2}") + +test_text = [sample_text_1, sample_text_2] +text_preprocessed = text_preprocessor(test_text) + +print("Keys : ", list(text_preprocessed.keys())) +print("Shape Token Ids : ", text_preprocessed["token_ids"].shape) +print("Token Ids : ", text_preprocessed["token_ids"][0, :16]) +print(" Shape Padding Mask : ", text_preprocessed["padding_mask"].shape) +print("Padding Mask : ", text_preprocessed["padding_mask"][0, :16]) +print("Shape Segment Ids : ", text_preprocessed["segment_ids"].shape) +print("Segment Ids : ", text_preprocessed["segment_ids"][0, :16]) + + +""" +We will now create `tf.data.Dataset` objects from the dataframes. + +Note that the text inputs will be preprocessed as a part of the data input pipeline. But +the preprocessing modules can also be a part of their corresponding BERT models. This +helps reduce the training/serving skew and lets our models operate with raw text inputs. +Follow [this tutorial](https://www.tensorflow.org/text/tutorials/classify_text_with_bert) +to learn more about how to incorporate the preprocessing modules directly inside the +models. 
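+
+As a hedged sketch of that alternative (not used in this example, and call details may
+vary slightly across KerasHub versions), a task model can bundle its own preprocessor
+so that raw strings go straight into `fit()` / `predict()`:
+
+```python
+classifier = keras_hub.models.BertTextClassifier.from_preset(
+    "bert_base_en_uncased",
+    num_classes=3,  # Contradictory / Implies / NoEntailment
+)
+# The bundled preprocessor tokenizes internally before the backbone runs,
+# e.g. classifier.predict(["placeholder tweet text"]).
+```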
+""" + + +def dataframe_to_dataset(dataframe): + columns = ["image_1_path", "image_2_path", "text_1", "text_2", "label_idx"] + ds = UnifiedPyDataset( + dataframe, + batch_size=32, + workers=4, + ) + return ds + + +""" +### Preprocessing utilities +""" + +bert_input_features = ["padding_mask", "segment_ids", "token_ids"] + + +def preprocess_text(text_1, text_2): + output = text_preprocessor([text_1, text_2]) + output = { + feature: keras.ops.reshape(output[feature], [-1]) + for feature in bert_input_features + } + return output + + +""" +### Create the final datasets, method adapted from PyDataset doc string. +""" + + +class UnifiedPyDataset(PyDataset): + """A Keras-compatible dataset that processes a DataFrame for TensorFlow, JAX, and PyTorch.""" + + def __init__( + self, + df, + batch_size=32, + workers=4, + use_multiprocessing=False, + max_queue_size=10, + **kwargs, + ): + """ + Args: + df: pandas DataFrame with data + batch_size: Batch size for dataset + workers: Number of workers to use for parallel loading (Keras) + use_multiprocessing: Whether to use multiprocessing + max_queue_size: Maximum size of the data queue for parallel loading + """ + super().__init__(**kwargs) + self.dataframe = df + columns = ["image_1_path", "image_2_path", "text_1", "text_2"] + # image files + self.image_x_1 = self.dataframe["image_1_path"] + self.image_x_2 = self.dataframe["image_1_path"] + self.image_y = self.dataframe["label_idx"] + # text files + self.text_x_1 = self.dataframe["text_1"] + self.text_x_2 = self.dataframe["text_2"] + self.text_y = self.dataframe["label_idx"] + # general + self.batch_size = batch_size + self.workers = workers + self.use_multiprocessing = use_multiprocessing + self.max_queue_size = max_queue_size + + def __getitem__(self, index): + """ + Fetches a batch of data from the dataset at the given index. + """ + + # Return x, y for batch idx. + low = index * self.batch_size + # Cap upper bound at array length; the last batch may be smaller + # if the total number of items is not a multiple of batch size. + # image files + high_image_1 = min(low + self.batch_size, len(self.image_x_1)) + high_image_2 = min(low + self.batch_size, len(self.image_x_2)) + # text + high_text_1 = min(low + self.batch_size, len(self.text_x_1)) + high_text_2 = min(low + self.batch_size, len(self.text_x_1)) + # images files + batch_image_x_1 = self.image_x_1[low:high_image_1] + batch_image_y_1 = self.image_y[low:high_image_1] + batch_image_x_2 = self.image_x_2[low:high_image_2] + batch_image_y_2 = self.image_y[low:high_image_2] + # text files + batch_text_x_1 = self.text_x_1[low:high_text_1] + batch_text_y_1 = self.text_y[low:high_text_1] + batch_text_x_2 = self.text_x_2[low:high_text_2] + batch_text_y_2 = self.text_y[low:high_text_2] + # image number 1 inputs + image_1 = [ + resize(imread(file_name), (128, 128)) for file_name in batch_image_x_1 + ] + image_1 = [ + ( # exeperienced some shapes which were different from others. 
+ np.array(Image.fromarray((img.astype(np.uint8))).convert("RGB")) + if img.shape[2] == 4 + else img + ) + for img in image_1 + ] + image_1 = np.array(image_1) + # Both text inputs to the model, return a dict for inputs to BertBackbone + text = { + key: np.array( + [ + d[key] + for d in [ + preprocess_text(file_path1, file_path2) + for file_path1, file_path2 in zip( + batch_text_x_1, batch_text_x_2 + ) + ] + ] + ) + for key in ["padding_mask", "token_ids", "segment_ids"] + } + # Image number 2 model inputs + image_2 = [ + resize(imread(file_name), (128, 128)) for file_name in batch_image_x_2 + ] + image_2 = [ + ( # exeperienced some shapes which were different from others + np.array(Image.fromarray((img.astype(np.uint8))).convert("RGB")) + if img.shape[2] == 4 + else img + ) + for img in image_2 + ] + # Stack the list comprehension to an nd.array + image_2 = np.array(image_2) + return ( + { + "image_1": image_1, + "image_2": image_2, + "padding_mask": text["padding_mask"], + "segment_ids": text["segment_ids"], + "token_ids": text["token_ids"], + }, + # Target lables + np.array(batch_image_y_1), + ) + + def __len__(self): + """ + Returns the number of batches in the dataset. + """ + return math.ceil(len(self.dataframe) / self.batch_size) + + +""" +Create train, validation and test datasets +""" + + +def prepare_dataset(dataframe): + ds = dataframe_to_dataset(dataframe) + return ds + + +train_ds = prepare_dataset(train_df) +validation_ds = prepare_dataset(val_df) +test_ds = prepare_dataset(test_df) + +""" +## Model building utilities + +Our final model will accept two images along with their text counterparts. While the +images will be directly fed to the model the text inputs will first be preprocessed and +then will make it into the model. Below is a visual illustration of this approach: + +![](https://github.com/sayakpaul/Multimodal-Entailment-Baseline/raw/main/figures/brief_architecture.png) + +The model consists of the following elements: + +* A standalone encoder for the images. We will use a +[ResNet50V2](https://arxiv.org/abs/1603.05027) pre-trained on the ImageNet-1k dataset for +this. +* A standalone encoder for the images. A pre-trained BERT will be used for this. + +After extracting the individual embeddings, they will be projected in an identical space. +Finally, their projections will be concatenated and be fed to the final classification +layer. + +This is a multi-class classification problem involving the following classes: + +* NoEntailment +* Implies +* Contradictory + +`project_embeddings()`, `create_vision_encoder()`, and `create_text_encoder()` utilities +are referred from [this example](https://keras.io/examples/nlp/nl_image_search/). +""" + +""" +Projection utilities +""" + + +def project_embeddings( + embeddings, num_projection_layers, projection_dims, dropout_rate +): + projected_embeddings = keras.layers.Dense(units=projection_dims)(embeddings) + for _ in range(num_projection_layers): + x = keras.ops.nn.gelu(projected_embeddings) + x = keras.layers.Dense(projection_dims)(x) + x = keras.layers.Dropout(dropout_rate)(x) + x = keras.layers.Add()([projected_embeddings, x]) + projected_embeddings = keras.layers.LayerNormalization()(x) + return projected_embeddings + + +""" +Vision encoder utilities +""" + + +def create_vision_encoder( + num_projection_layers, projection_dims, dropout_rate, trainable=False +): + # Load the pre-trained ResNet50V2 model to be used as the base encoder. 
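+    # `include_top=False` drops the ImageNet classification head, and `pooling="avg"`
+    # global-average-pools the last feature map into a single 2048-d vector per image.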
+ resnet_v2 = keras.applications.ResNet50V2( + include_top=False, weights="imagenet", pooling="avg" + ) + # Set the trainability of the base encoder. + for layer in resnet_v2.layers: + layer.trainable = trainable + + # Receive the images as inputs. + image_1 = keras.Input(shape=(128, 128, 3), name="image_1") + image_2 = keras.Input(shape=(128, 128, 3), name="image_2") + + # Preprocess the input image. + preprocessed_1 = keras.applications.resnet_v2.preprocess_input(image_1) + preprocessed_2 = keras.applications.resnet_v2.preprocess_input(image_2) + + # Generate the embeddings for the images using the resnet_v2 model + # concatenate them. + embeddings_1 = resnet_v2(preprocessed_1) + embeddings_2 = resnet_v2(preprocessed_2) + embeddings = keras.layers.Concatenate()([embeddings_1, embeddings_2]) + + # Project the embeddings produced by the model. + outputs = project_embeddings( + embeddings, num_projection_layers, projection_dims, dropout_rate + ) + # Create the vision encoder model. + return keras.Model([image_1, image_2], outputs, name="vision_encoder") + + +""" +Text encoder utilities +""" + + +def create_text_encoder( + num_projection_layers, projection_dims, dropout_rate, trainable=False +): + # Load the pre-trained BERT BackBone using KerasHub. + bert = keras_hub.models.BertBackbone.from_preset( + "bert_base_en_uncased", num_classes=3 + ) + + # Set the trainability of the base encoder. + bert.trainable = trainable + + # Receive the text as inputs. + bert_input_features = ["padding_mask", "segment_ids", "token_ids"] + inputs = { + feature: keras.Input(shape=(256,), dtype="int32", name=feature) + for feature in bert_input_features + } + + # Generate embeddings for the preprocessed text using the BERT model. + embeddings = bert(inputs)["pooled_output"] + + # Project the embeddings produced by the model. + outputs = project_embeddings( + embeddings, num_projection_layers, projection_dims, dropout_rate + ) + # Create the text encoder model. + return keras.Model(inputs, outputs, name="text_encoder") + + +""" +Multimodal model utilities +""" + + +def create_multimodal_model( + num_projection_layers=1, + projection_dims=256, + dropout_rate=0.1, + vision_trainable=False, + text_trainable=False, +): + # Receive the images as inputs. + image_1 = keras.Input(shape=(128, 128, 3), name="image_1") + image_2 = keras.Input(shape=(128, 128, 3), name="image_2") + + # Receive the text as inputs. + bert_input_features = ["padding_mask", "segment_ids", "token_ids"] + text_inputs = { + feature: keras.Input(shape=(256,), dtype="int32", name=feature) + for feature in bert_input_features + } + text_inputs = list(text_inputs.values()) + # Create the encoders. + vision_encoder = create_vision_encoder( + num_projection_layers, projection_dims, dropout_rate, vision_trainable + ) + text_encoder = create_text_encoder( + num_projection_layers, projection_dims, dropout_rate, text_trainable + ) + + # Fetch the embedding projections. + vision_projections = vision_encoder([image_1, image_2]) + text_projections = text_encoder(text_inputs) + + # Concatenate the projections and pass through the classification layer. 
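+    # Each encoder outputs a `projection_dims`-wide vector, so the concatenated
+    # tensor below has 2 * projection_dims features before the softmax head.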
+ concatenated = keras.layers.Concatenate()([vision_projections, text_projections]) + outputs = keras.layers.Dense(3, activation="softmax")(concatenated) + return keras.Model([image_1, image_2, *text_inputs], outputs) + + +multimodal_model = create_multimodal_model() +keras.utils.plot_model(multimodal_model, show_shapes=True) + +""" +You can inspect the structure of the individual encoders as well by setting the +`expand_nested` argument of `plot_model()` to `True`. You are encouraged +to play with the different hyperparameters involved in building this model and +observe how the final performance is affected. +""" + +""" +## Compile and train the model +""" + +multimodal_model.compile( + optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"] +) + +history = multimodal_model.fit(train_ds, validation_data=validation_ds, epochs=1) + +""" +## Evaluate the model +""" + +_, acc = multimodal_model.evaluate(test_ds) +print(f"Accuracy on the test set: {round(acc * 100, 2)}%.") + +""" +## Additional notes regarding training + +**Incorporating regularization**: + +The training logs suggest that the model is starting to overfit and may have benefitted +from regularization. Dropout ([Srivastava et al.](https://jmlr.org/papers/v15/srivastava14a.html)) +is a simple yet powerful regularization technique that we can use in our model. +But how should we apply it here? + +We could always introduce Dropout (`keras.layers.Dropout`) in between different layers of the model. +But here is another recipe. Our model expects inputs from two different data modalities. +What if either of the modalities is not present during inference? To account for this, +we can introduce Dropout to the individual projections just before they get concatenated: + +```python +vision_projections = keras.layers.Dropout(rate)(vision_projections) +text_projections = keras.layers.Dropout(rate)(text_projections) +concatenated = keras.layers.Concatenate()([vision_projections, text_projections]) +``` + +**Attending to what matters**: + +Do all parts of the images correspond equally to their textual counterparts? It's likely +not the case. To make our model only focus on the most important bits of the images that relate +well to their corresponding textual parts we can use "cross-attention": + +```python +# Embeddings. +vision_projections = vision_encoder([image_1, image_2]) +text_projections = text_encoder(text_inputs) + +# Cross-attention (Luong-style). +query_value_attention_seq = keras.layers.Attention(use_scale=True, dropout=0.2)( + [vision_projections, text_projections] +) +# Concatenate. +concatenated = keras.layers.Concatenate()([vision_projections, text_projections]) +contextual = keras.layers.Concatenate()([concatenated, query_value_attention_seq]) +``` + +To see this in action, refer to +[this notebook](https://github.com/sayakpaul/Multimodal-Entailment-Baseline/blob/main/multimodal_entailment_attn.ipynb). + +**Handling class imbalance**: + +The dataset suffers from class imbalance. Investigating the confusion matrix of the +above model reveals that it performs poorly on the minority classes. If we had used a +weighted loss then the training would have been more guided. You can check out +[this notebook](https://github.com/sayakpaul/Multimodal-Entailment-Baseline/blob/main/multimodal_entailment.ipynb) +that takes class-imbalance into account during model training. + +**Using only text inputs**: + +Also, what if we had only incorporated text inputs for the entailment task? 
Because of +the nature of the text inputs encountered on social media platforms, text inputs alone +would have hurt the final performance. Under a similar training setup, by only using +text inputs we get to 67.14% top-1 accuracy on the same test set. Refer to +[this notebook](https://github.com/sayakpaul/Multimodal-Entailment-Baseline/blob/main/text_entailment.ipynb) +for details. + +Finally, here is a table comparing different approaches taken for the entailment task: + +| Type | Standard
Cross-entropy | Loss-weighted
Cross-entropy | Focal Loss | +|:---: |:---: |:---: |:---: | +| Multimodal | 77.86% | 67.86% | 86.43% | +| Only text | 67.14% | 11.43% | 37.86% | + +You can check out [this repository](https://git.io/JR0HU) to learn more about how the +experiments were conducted to obtain these numbers. +""" + +""" +## Final remarks + +* The architecture we used in this example is too large for the number of data points +available for training. It's going to benefit from more data. +* We used a smaller variant of the original BERT model. Chances are high that with a +larger variant, this performance will be improved. TensorFlow Hub +[provides](https://www.tensorflow.org/text/tutorials/bert_glue#loading_models_from_tensorflow_hub) +a number of different BERT models that you can experiment with. +* We kept the pre-trained models frozen. Fine-tuning them on the multimodal entailment +task would could resulted in better performance. +* We built a simple baseline model for the multimodal entailment task. There are various +approaches that have been proposed to tackle the entailment problem. +[This presentation deck](https://docs.google.com/presentation/d/1mAB31BCmqzfedreNZYn4hsKPFmgHA9Kxz219DzyRY3c/edit?usp=sharing) +from the +[Recognizing Multimodal Entailment](https://multimodal-entailment.github.io/) +tutorial provides a comprehensive overview. + +You can use the trained model hosted on [Hugging Face Hub](https://huggingface.co/keras-io/multimodal-entailment) +and try the demo on [Hugging Face Spaces](https://huggingface.co/spaces/keras-io/multimodal_entailment) +""" diff --git a/knowledge_base/nlp/multiple_choice_task_with_transfer_learning.py b/knowledge_base/nlp/multiple_choice_task_with_transfer_learning.py new file mode 100644 index 0000000000000000000000000000000000000000..757b1abe6798ed929c7c99f91e71f4cd25139748 --- /dev/null +++ b/knowledge_base/nlp/multiple_choice_task_with_transfer_learning.py @@ -0,0 +1,560 @@ +""" +Title: MultipleChoice Task with Transfer Learning +Author: Md Awsafur Rahman +Date created: 2023/09/14 +Last modified: 2025/06/16 +Description: Use pre-trained nlp models for multiplechoice task. +Accelerator: GPU +""" + +""" +## Introduction + +In this example, we will demonstrate how to perform the **MultipleChoice** task by +finetuning pre-trained DebertaV3 model. In this task, several candidate answers are +provided along with a context and the model is trained to select the correct answer +unlike question answering. We will use SWAG dataset to demonstrate this example. +""" + +""" +## Setup +""" + +"""shell +""" + +import keras_hub +import keras +import tensorflow as tf # For tf.data only. + +import numpy as np +import pandas as pd + +import matplotlib.pyplot as plt + +""" +## Dataset +In this example we'll use **SWAG** dataset for multiplechoice task. +""" + +"""shell +wget "https://github.com/rowanz/swagaf/archive/refs/heads/master.zip" -O swag.zip +unzip -q swag.zip +""" + +"""shell +ls swagaf-master/data +""" + +""" +## Configuration +""" + + +class CFG: + preset = "deberta_v3_extra_small_en" # Name of pretrained models + sequence_length = 200 # Input sequence length + seed = 42 # Random seed + epochs = 5 # Training epochs + batch_size = 8 # Batch size + augment = True # Augmentation (Shuffle Options) + + +""" +## Reproducibility +Sets value for random seed to produce similar result in each run. +""" + +keras.utils.set_random_seed(CFG.seed) + + +""" +## Meta Data +* **train.csv** - will be used for training. 
+* `sent1` and `sent2`: these fields show how a sentence starts, and if you put the two +together, you get the `startphrase` field. +* `ending_`: suggests a possible ending for how a sentence can end, but only one of +them is correct. + * `label`: identifies the correct sentence ending. + +* **val.csv** - similar to `train.csv` but will be used for validation. +""" + +# Train data +train_df = pd.read_csv( + "swagaf-master/data/train.csv", index_col=0 +) # Read CSV file into a DataFrame +train_df = train_df.sample(frac=0.02) +print("# Train Data: {:,}".format(len(train_df))) + +# Valid data +valid_df = pd.read_csv( + "swagaf-master/data/val.csv", index_col=0 +) # Read CSV file into a DataFrame +valid_df = valid_df.sample(frac=0.02) +print("# Valid Data: {:,}".format(len(valid_df))) + +""" +## Contextualize Options + +Our approach entails furnishing the model with question and answer pairs, as opposed to +employing a single question for all five options. In practice, this signifies that for +the five options, we will supply the model with the same set of five questions combined +with each respective answer choice (e.g., `(Q + A)`, `(Q + B)`, and so on). This analogy +draws parallels to the practice of revisiting a question multiple times during an exam to +promote a deeper understanding of the problem at hand. + +> Notably, in the context of SWAG dataset, question is the start of a sentence and +options are possible ending of that sentence. +""" + + +# Define a function to create options based on the prompt and choices +def make_options(row): + row["options"] = [ + f"{row.startphrase}\n{row.ending0}", # Option 0 + f"{row.startphrase}\n{row.ending1}", # Option 1 + f"{row.startphrase}\n{row.ending2}", # Option 2 + f"{row.startphrase}\n{row.ending3}", + ] # Option 3 + return row + + +""" +Apply the `make_options` function to each row of the dataframe +""" + +train_df = train_df.apply(make_options, axis=1) +valid_df = valid_df.apply(make_options, axis=1) + +""" +## Preprocessing + +**What it does:** The preprocessor takes input strings and transforms them into a +dictionary (`token_ids`, `padding_mask`) containing preprocessed tensors. This process +starts with tokenization, where input strings are converted into sequences of token IDs. + +**Why it's important:** Initially, raw text data is complex and challenging for modeling +due to its high dimensionality. By converting text into a compact set of tokens, such as +transforming `"The quick brown fox"` into `["the", "qu", "##ick", "br", "##own", "fox"]`, +we simplify the data. Many models rely on special tokens and additional tensors to +understand input. These tokens help divide input and identify padding, among other tasks. +Making all sequences the same length through padding boosts computational efficiency, +making subsequent steps smoother. + +Explore the following pages to access the available preprocessing and tokenizer layers in +**KerasHub**: +- [Preprocessing](https://keras.io/api/keras_hub/preprocessing_layers/) +- [Tokenizers](https://keras.io/api/keras_hub/tokenizers/) +""" + +preprocessor = keras_hub.models.DebertaV3Preprocessor.from_preset( + preset=CFG.preset, # Name of the model + sequence_length=CFG.sequence_length, # Max sequence length, will be padded if shorter +) + +""" +Now, let's examine what the output shape of the preprocessing layer looks like. The +output shape of the layer can be represented as $(num\_choices, sequence\_length)$. 
+""" + +outs = preprocessor(train_df.options.iloc[0]) # Process options for the first row + +# Display the shape of each processed output +for k, v in outs.items(): + print(k, ":", v.shape) + +""" +We'll use the `preprocessing_fn` function to transform each text option using the +`dataset.map(preprocessing_fn)` method. +""" + + +def preprocess_fn(text, label=None): + text = preprocessor(text) # Preprocess text + return ( + (text, label) if label is not None else text + ) # Return processed text and label if available + + +""" +## Augmentation + +In this notebook, we'll experiment with an interesting augmentation technique, +`option_shuffle`. Since we're providing the model with one option at a time, we can +introduce a shuffle to the order of options. For instance, options `[A, C, E, D, B]` +would be rearranged as `[D, B, A, E, C]`. This practice will help the model focus on the +content of the options themselves, rather than being influenced by their positions. + +**Note:** Even though `option_shuffle` function is written in pure +tensorflow, it can be used with any backend (e.g. JAX, PyTorch) as it is only used +in `tf.data.Dataset` pipeline which is compatible with Keras 3 routines. +""" + + +def option_shuffle(options, labels, prob=0.50, seed=None): + if tf.random.uniform([]) > prob: # Shuffle probability check + return options, labels + # Shuffle indices of options and labels in the same order + indices = tf.random.shuffle(tf.range(tf.shape(options)[0]), seed=seed) + # Shuffle options and labels + options = tf.gather(options, indices) + labels = tf.gather(labels, indices) + return options, labels + + +""" +In the following function, we'll merge all augmentation functions to apply to the text. +These augmentations will be applied to the data using the `dataset.map(augment_fn)` +approach. +""" + + +def augment_fn(text, label=None): + text, label = option_shuffle(text, label, prob=0.5) # Shuffle the options + return (text, label) if label is not None else text + + +""" +## DataLoader + +The code below sets up a robust data flow pipeline using `tf.data.Dataset` for data +processing. Notable aspects of `tf.data` include its ability to simplify pipeline +construction and represent components in sequences. + +To learn more about `tf.data`, refer to this +[documentation](https://www.tensorflow.org/guide/data). +""" + + +def build_dataset( + texts, + labels=None, + batch_size=32, + cache=False, + augment=False, + repeat=False, + shuffle=1024, +): + AUTO = tf.data.AUTOTUNE # AUTOTUNE option + slices = ( + (texts,) + if labels is None + else (texts, keras.utils.to_categorical(labels, num_classes=4)) + ) # Create slices + ds = tf.data.Dataset.from_tensor_slices(slices) # Create dataset from slices + ds = ds.cache() if cache else ds # Cache dataset if enabled + if augment: # Apply augmentation if enabled + ds = ds.map(augment_fn, num_parallel_calls=AUTO) + ds = ds.map(preprocess_fn, num_parallel_calls=AUTO) # Map preprocessing function + ds = ds.repeat() if repeat else ds # Repeat dataset if enabled + opt = tf.data.Options() # Create dataset options + if shuffle: + ds = ds.shuffle(shuffle, seed=CFG.seed) # Shuffle dataset if enabled + opt.experimental_deterministic = False + ds = ds.with_options(opt) # Set dataset options + ds = ds.batch(batch_size, drop_remainder=True) # Batch dataset + ds = ds.prefetch(AUTO) # Prefetch next batch + return ds # Return the built dataset + + +""" +Now let's create train and valid dataloader using above function. 
+""" + +# Build train dataloader +train_texts = train_df.options.tolist() # Extract training texts +train_labels = train_df.label.tolist() # Extract training labels +train_ds = build_dataset( + train_texts, + train_labels, + batch_size=CFG.batch_size, + cache=True, + shuffle=True, + repeat=True, + augment=CFG.augment, +) + +# Build valid dataloader +valid_texts = valid_df.options.tolist() # Extract validation texts +valid_labels = valid_df.label.tolist() # Extract validation labels +valid_ds = build_dataset( + valid_texts, + valid_labels, + batch_size=CFG.batch_size, + cache=True, + shuffle=False, + repeat=False, + augment=False, +) + + +""" +## LR Schedule + +Implementing a learning rate scheduler is crucial for transfer learning. The learning +rate initiates at `lr_start` and gradually tapers down to `lr_min` using **cosine** +curve. + +**Importance:** A well-structured learning rate schedule is essential for efficient model +training, ensuring optimal convergence and avoiding issues such as overshooting or +stagnation. +""" + +import math + + +def get_lr_callback(batch_size=8, mode="cos", epochs=10, plot=False): + lr_start, lr_max, lr_min = 1.0e-6, 0.6e-6 * batch_size, 1e-6 + lr_ramp_ep, lr_sus_ep = 2, 0 + + def lrfn(epoch): # Learning rate update function + if epoch < lr_ramp_ep: + lr = (lr_max - lr_start) / lr_ramp_ep * epoch + lr_start + elif epoch < lr_ramp_ep + lr_sus_ep: + lr = lr_max + else: + decay_total_epochs, decay_epoch_index = ( + epochs - lr_ramp_ep - lr_sus_ep + 3, + epoch - lr_ramp_ep - lr_sus_ep, + ) + phase = math.pi * decay_epoch_index / decay_total_epochs + lr = (lr_max - lr_min) * 0.5 * (1 + math.cos(phase)) + lr_min + return lr + + if plot: # Plot lr curve if plot is True + plt.figure(figsize=(10, 5)) + plt.plot( + np.arange(epochs), + [lrfn(epoch) for epoch in np.arange(epochs)], + marker="o", + ) + plt.xlabel("epoch") + plt.ylabel("lr") + plt.title("LR Scheduler") + plt.show() + + return keras.callbacks.LearningRateScheduler( + lrfn, verbose=False + ) # Create lr callback + + +_ = get_lr_callback(CFG.batch_size, plot=True) + +""" +## Callbacks + +The function below will gather all the training callbacks, such as `lr_scheduler`, +`model_checkpoint`. +""" + + +def get_callbacks(): + callbacks = [] + lr_cb = get_lr_callback(CFG.batch_size) # Get lr callback + ckpt_cb = keras.callbacks.ModelCheckpoint( + f"best.keras", + monitor="val_accuracy", + save_best_only=True, + save_weights_only=False, + mode="max", + ) # Get Model checkpoint callback + callbacks.extend([lr_cb, ckpt_cb]) # Add lr and checkpoint callbacks + return callbacks # Return the list of callbacks + + +callbacks = get_callbacks() + +""" +## MultipleChoice Model + + + + + +""" + +""" + +### Pre-trained Models + +The `KerasHub` library provides comprehensive, ready-to-use implementations of popular +NLP model architectures. It features a variety of pre-trained models including `Bert`, +`Roberta`, `DebertaV3`, and more. In this notebook, we'll showcase the usage of +`DistillBert`. However, feel free to explore all available models in the [KerasHub +documentation](https://keras.io/api/keras_hub/models/). Also for a deeper understanding +of `KerasHub`, refer to the informative [getting started +guide](https://keras.io/guides/keras_hub/getting_started/). + +Our approach involves using `keras_hub.models.XXClassifier` to process each question and +option pari (e.g. (Q+A), (Q+B), etc.), generating logits. These logits are then combined +and passed through a softmax function to produce the final output. 
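+
+As a toy illustration (made-up numbers), combining the per-option scores is just a
+softmax over one scalar logit per `(Q + option)` pair:
+
+```python
+import numpy as np
+
+option_logits = np.array([1.2, -0.3, 0.4, 2.1])  # one logit per (Q + option) pair
+probs = np.exp(option_logits) / np.exp(option_logits).sum()  # softmax over options
+print(probs.round(3), probs.argmax())  # the largest probability marks the predicted ending
+```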
+""" + +""" + +### Classifier for Multiple-Choice Tasks + +When dealing with multiple-choice questions, instead of giving the model the question and +all options together `(Q + A + B + C ...)`, we provide the model with one option at a +time along with the question. For instance, `(Q + A)`, `(Q + B)`, and so on. Once we have +the prediction scores (logits) for all options, we combine them using the `Softmax` +function to get the ultimate result. If we had given all options at once to the model, +the text's length would increase, making it harder for the model to handle. The picture +below illustrates this idea: + +![Model Diagram](https://pbs.twimg.com/media/F3NUju_a8AAS8Fq?format=png&name=large) + +
Picture Credit: +@johnowhitaker

+ +From a coding perspective, remember that we use the same model for all five options, with +shared weights. Despite the figure suggesting five separate models, they are, in fact, +one model with shared weights. Another point to consider is the the input shapes of +Classifier and MultipleChoice. + +* Input shape for **Multiple Choice**: $(batch\_size, num\_choices, seq\_length)$ +* Input shape for **Classifier**: $(batch\_size, seq\_length)$ + +Certainly, it's clear that we can't directly give the data for the multiple-choice task +to the model because the input shapes don't match. To handle this, we'll use **slicing**. +This means we'll separate the features of each option, like $feature_{(Q + A)}$ and +$feature_{(Q + B)}$, and give them one by one to the NLP classifier. After we get the +prediction scores $logits_{(Q + A)}$ and $logits_{(Q + B)}$ for all the options, we'll +use the Softmax function, like $\operatorname{Softmax}([logits_{(Q + A)}, logits_{(Q + +B)}])$, to combine them. This final step helps us make the ultimate decision or choice. + +> Note that in the classifier, we set `num_classes=1` instead of `5`. This is because the +classifier produces a single output for each option. When dealing with five options, +these individual outputs are joined together and then processed through a softmax +function to generate the final result, which has a dimension of `5`. +""" + + +# Selects one option from five +class SelectOption(keras.layers.Layer): + def __init__(self, index, **kwargs): + super().__init__(**kwargs) + self.index = index + + def call(self, inputs): + # Selects a specific slice from the inputs tensor + return inputs[:, self.index, :] + + def get_config(self): + # For serialize the model + base_config = super().get_config() + config = { + "index": self.index, + } + return {**base_config, **config} + + +def build_model(): + # Define input layers + inputs = { + "token_ids": keras.Input(shape=(4, None), dtype="int32", name="token_ids"), + "padding_mask": keras.Input( + shape=(4, None), dtype="int32", name="padding_mask" + ), + } + # Create a DebertaV3Classifier model + classifier = keras_hub.models.DebertaV3Classifier.from_preset( + CFG.preset, + preprocessor=None, + num_classes=1, # one output per one option, for five options total 5 outputs + ) + logits = [] + # Loop through each option (Q+A), (Q+B) etc and compute associated logits + for option_idx in range(4): + option = { + k: SelectOption(option_idx, name=f"{k}_{option_idx}")(v) + for k, v in inputs.items() + } + logit = classifier(option) + logits.append(logit) + + # Compute final output + logits = keras.layers.Concatenate(axis=-1)(logits) + outputs = keras.layers.Softmax(axis=-1)(logits) + model = keras.Model(inputs, outputs) + + # Compile the model with optimizer, loss, and metrics + model.compile( + optimizer=keras.optimizers.AdamW(5e-6), + loss=keras.losses.CategoricalCrossentropy(label_smoothing=0.02), + metrics=[ + keras.metrics.CategoricalAccuracy(name="accuracy"), + ], + jit_compile=True, + ) + return model + + +# Build the Build +model = build_model() + +""" +Let's checkout the model summary to have a better insight on the model. +""" + +model.summary() + +""" +Finally, let's check the model structure visually if everything is in place. 
+""" + +keras.utils.plot_model(model, show_shapes=True) + +""" +## Training +""" + +# Start training the model +history = model.fit( + train_ds, + epochs=CFG.epochs, + validation_data=valid_ds, + callbacks=callbacks, + steps_per_epoch=int(len(train_df) / CFG.batch_size), + verbose=1, +) + +""" +## Inference +""" + +# Make predictions using the trained model on last validation data +predictions = model.predict( + valid_ds, + batch_size=CFG.batch_size, # max batch size = valid size + verbose=1, +) + +# Format predictions and true answers +pred_answers = np.arange(4)[np.argsort(-predictions)][:, 0] +true_answers = valid_df.label.values + +# Check 5 Predictions +print("# Predictions\n") +for i in range(0, 50, 10): + row = valid_df.iloc[i] + question = row.startphrase + pred_answer = f"ending{pred_answers[i]}" + true_answer = f"ending{true_answers[i]}" + print(f"โ“ Sentence {i+1}:\n{question}\n") + print(f"โœ… True Ending: {true_answer}\n >> {row[true_answer]}\n") + print(f"๐Ÿค– Predicted Ending: {pred_answer}\n >> {row[pred_answer]}\n") + print("-" * 90, "\n") + +""" +## Reference +* [Multiple Choice with +HF](https://twitter.com/johnowhitaker/status/1689790373454041089?s=20) +* [Keras NLP](https://keras.io/api/keras_hub/) +* [BirdCLEF23: Pretraining is All you Need +[Train]](https://www.kaggle.com/code/awsaf49/birdclef23-pretraining-is-all-you-need-train) +[Train]](https://www.kaggle.com/code/awsaf49/birdclef23-pretraining-is-all-you-need-train) +* [Triple Stratified KFold with +TFRecords](https://www.kaggle.com/code/cdeotte/triple-stratified-kfold-with-tfrecords) +""" diff --git a/knowledge_base/nlp/ner_transformers.py b/knowledge_base/nlp/ner_transformers.py new file mode 100644 index 0000000000000000000000000000000000000000..03d8c89afb47ca3f1bf13a7234e4e445452ae6bd --- /dev/null +++ b/knowledge_base/nlp/ner_transformers.py @@ -0,0 +1,355 @@ +""" +Title: Named Entity Recognition using Transformers +Author: [Varun Singh](https://www.linkedin.com/in/varunsingh2/) +Date created: 2021/06/23 +Last modified: 2024/04/05 +Description: NER using the Transformers and data from CoNLL 2003 shared task. +Accelerator: GPU +Converted to Keras 3 by: [Sitam Meur](https://github.com/sitamgithub-MSIT) +""" + +""" +## Introduction + +Named Entity Recognition (NER) is the process of identifying named entities in text. +Example of named entities are: "Person", "Location", "Organization", "Dates" etc. NER is +essentially a token classification task where every token is classified into one or more +predetermined categories. + +In this exercise, we will train a simple Transformer based model to perform NER. We will +be using the data from CoNLL 2003 shared task. For more information about the dataset, +please visit [the dataset website](https://www.clips.uantwerpen.be/conll2003/ner/). +However, since obtaining this data requires an additional step of getting a free license, we will be using +HuggingFace's datasets library which contains a processed version of this dataset. +""" + +""" +## Install the open source datasets library from HuggingFace + +We also download the script used to evaluate NER models. 
+""" + +"""shell +pip3 install datasets +wget https://raw.githubusercontent.com/sighsmile/conlleval/master/conlleval.py +""" + +import os + +os.environ["KERAS_BACKEND"] = "tensorflow" + +import keras +from keras import ops +import numpy as np +import tensorflow as tf +from keras import layers +from datasets import load_dataset +from collections import Counter +from conlleval import evaluate + +""" +We will be using the transformer implementation from this fantastic +[example](https://keras.io/examples/nlp/text_classification_with_transformer/). + +Let's start by defining a `TransformerBlock` layer: +""" + + +class TransformerBlock(layers.Layer): + def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1): + super().__init__() + self.att = keras.layers.MultiHeadAttention( + num_heads=num_heads, key_dim=embed_dim + ) + self.ffn = keras.Sequential( + [ + keras.layers.Dense(ff_dim, activation="relu"), + keras.layers.Dense(embed_dim), + ] + ) + self.layernorm1 = keras.layers.LayerNormalization(epsilon=1e-6) + self.layernorm2 = keras.layers.LayerNormalization(epsilon=1e-6) + self.dropout1 = keras.layers.Dropout(rate) + self.dropout2 = keras.layers.Dropout(rate) + + def call(self, inputs, training=False): + attn_output = self.att(inputs, inputs) + attn_output = self.dropout1(attn_output, training=training) + out1 = self.layernorm1(inputs + attn_output) + ffn_output = self.ffn(out1) + ffn_output = self.dropout2(ffn_output, training=training) + return self.layernorm2(out1 + ffn_output) + + +""" +Next, let's define a `TokenAndPositionEmbedding` layer: +""" + + +class TokenAndPositionEmbedding(layers.Layer): + def __init__(self, maxlen, vocab_size, embed_dim): + super().__init__() + self.token_emb = keras.layers.Embedding( + input_dim=vocab_size, output_dim=embed_dim + ) + self.pos_emb = keras.layers.Embedding(input_dim=maxlen, output_dim=embed_dim) + + def call(self, inputs): + maxlen = ops.shape(inputs)[-1] + positions = ops.arange(start=0, stop=maxlen, step=1) + position_embeddings = self.pos_emb(positions) + token_embeddings = self.token_emb(inputs) + return token_embeddings + position_embeddings + + +""" +## Build the NER model class as a `keras.Model` subclass +""" + + +class NERModel(keras.Model): + def __init__( + self, num_tags, vocab_size, maxlen=128, embed_dim=32, num_heads=2, ff_dim=32 + ): + super().__init__() + self.embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim) + self.transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim) + self.dropout1 = layers.Dropout(0.1) + self.ff = layers.Dense(ff_dim, activation="relu") + self.dropout2 = layers.Dropout(0.1) + self.ff_final = layers.Dense(num_tags, activation="softmax") + + def call(self, inputs, training=False): + x = self.embedding_layer(inputs) + x = self.transformer_block(x) + x = self.dropout1(x, training=training) + x = self.ff(x) + x = self.dropout2(x, training=training) + x = self.ff_final(x) + return x + + +""" +## Load the CoNLL 2003 dataset from the datasets library and process it +""" + +conll_data = load_dataset("conll2003") + +""" +We will export this data to a tab-separated file format which will be easy to read as a +`tf.data.Dataset` object. 
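+
+Each exported record packs the token count, the tokens, and their tag ids into one
+tab-separated line. A hedged, made-up example of the layout:
+
+```python
+sample_record = "3\tWest\tIndian\tall-rounder\t7\t8\t0"  # <n> \t tokens... \t tag ids...
+count, *rest = sample_record.split("\t")
+tokens, tags = rest[: int(count)], rest[int(count) :]
+print(tokens, tags)  # ['West', 'Indian', 'all-rounder'] ['7', '8', '0']
+```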
+""" + + +def export_to_file(export_file_path, data): + with open(export_file_path, "w") as f: + for record in data: + ner_tags = record["ner_tags"] + tokens = record["tokens"] + if len(tokens) > 0: + f.write( + str(len(tokens)) + + "\t" + + "\t".join(tokens) + + "\t" + + "\t".join(map(str, ner_tags)) + + "\n" + ) + + +os.mkdir("data") +export_to_file("./data/conll_train.txt", conll_data["train"]) +export_to_file("./data/conll_val.txt", conll_data["validation"]) + +""" +## Make the NER label lookup table + +NER labels are usually provided in IOB, IOB2 or IOBES formats. Checkout this link for +more information: +[Wikipedia](https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)) + +Note that we start our label numbering from 1 since 0 will be reserved for padding. We +have a total of 10 labels: 9 from the NER dataset and one for padding. +""" + + +def make_tag_lookup_table(): + iob_labels = ["B", "I"] + ner_labels = ["PER", "ORG", "LOC", "MISC"] + all_labels = [(label1, label2) for label2 in ner_labels for label1 in iob_labels] + all_labels = ["-".join([a, b]) for a, b in all_labels] + all_labels = ["[PAD]", "O"] + all_labels + return dict(zip(range(0, len(all_labels) + 1), all_labels)) + + +mapping = make_tag_lookup_table() +print(mapping) + +""" +Get a list of all tokens in the training dataset. This will be used to create the +vocabulary. +""" + +all_tokens = sum(conll_data["train"]["tokens"], []) +all_tokens_array = np.array(list(map(str.lower, all_tokens))) + +counter = Counter(all_tokens_array) +print(len(counter)) + +num_tags = len(mapping) +vocab_size = 20000 + +# We only take (vocab_size - 2) most commons words from the training data since +# the `StringLookup` class uses 2 additional tokens - one denoting an unknown +# token and another one denoting a masking token +vocabulary = [token for token, count in counter.most_common(vocab_size - 2)] + +# The StringLook class will convert tokens to token IDs +lookup_layer = keras.layers.StringLookup(vocabulary=vocabulary) + +""" +Create 2 new `Dataset` objects from the training and validation data +""" + +train_data = tf.data.TextLineDataset("./data/conll_train.txt") +val_data = tf.data.TextLineDataset("./data/conll_val.txt") + +""" +Print out one line to make sure it looks good. The first record in the line is the number of tokens. +After that we will have all the tokens followed by all the ner tags. +""" + +print(list(train_data.take(1).as_numpy_iterator())) + +""" +We will be using the following map function to transform the data in the dataset: +""" + + +def map_record_to_training_data(record): + record = tf.strings.split(record, sep="\t") + length = tf.strings.to_number(record[0], out_type=tf.int32) + tokens = record[1 : length + 1] + tags = record[length + 1 :] + tags = tf.strings.to_number(tags, out_type=tf.int64) + tags += 1 + return tokens, tags + + +def lowercase_and_convert_to_ids(tokens): + tokens = tf.strings.lower(tokens) + return lookup_layer(tokens) + + +# We use `padded_batch` here because each record in the dataset has a +# different length. 
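+# Within a batch, shorter sequences are padded with 0s up to the longest sequence in
+# that batch, which is why tag id 0 is reserved for padding and masked out by the
+# custom loss defined below.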
+batch_size = 32 +train_dataset = ( + train_data.map(map_record_to_training_data) + .map(lambda x, y: (lowercase_and_convert_to_ids(x), y)) + .padded_batch(batch_size) +) +val_dataset = ( + val_data.map(map_record_to_training_data) + .map(lambda x, y: (lowercase_and_convert_to_ids(x), y)) + .padded_batch(batch_size) +) + +ner_model = NERModel(num_tags, vocab_size, embed_dim=32, num_heads=4, ff_dim=64) + +""" +We will be using a custom loss function that will ignore the loss from padded tokens. +""" + + +class CustomNonPaddingTokenLoss(keras.losses.Loss): + def __init__(self, name="custom_ner_loss"): + super().__init__(name=name) + + def call(self, y_true, y_pred): + loss_fn = keras.losses.SparseCategoricalCrossentropy( + from_logits=False, reduction=None + ) + loss = loss_fn(y_true, y_pred) + mask = ops.cast((y_true > 0), dtype="float32") + loss = loss * mask + return ops.sum(loss) / ops.sum(mask) + + +loss = CustomNonPaddingTokenLoss() + +""" +## Compile and fit the model +""" + +tf.config.run_functions_eagerly(True) +ner_model.compile(optimizer="adam", loss=loss) +ner_model.fit(train_dataset, epochs=10) + + +def tokenize_and_convert_to_ids(text): + tokens = text.split() + return lowercase_and_convert_to_ids(tokens) + + +# Sample inference using the trained model +sample_input = tokenize_and_convert_to_ids( + "eu rejects german call to boycott british lamb" +) +sample_input = ops.reshape(sample_input, shape=[1, -1]) +print(sample_input) + +output = ner_model.predict(sample_input) +prediction = np.argmax(output, axis=-1)[0] +prediction = [mapping[i] for i in prediction] + +# eu -> B-ORG, german -> B-MISC, british -> B-MISC +print(prediction) + +""" +## Metrics calculation + +Here is a function to calculate the metrics. The function calculates F1 score for the +overall NER dataset as well as individual scores for each NER tag. +""" + + +def calculate_metrics(dataset): + all_true_tag_ids, all_predicted_tag_ids = [], [] + + for x, y in dataset: + output = ner_model.predict(x, verbose=0) + predictions = ops.argmax(output, axis=-1) + predictions = ops.reshape(predictions, [-1]) + + true_tag_ids = ops.reshape(y, [-1]) + + mask = (true_tag_ids > 0) & (predictions > 0) + true_tag_ids = true_tag_ids[mask] + predicted_tag_ids = predictions[mask] + + all_true_tag_ids.append(true_tag_ids) + all_predicted_tag_ids.append(predicted_tag_ids) + + all_true_tag_ids = np.concatenate(all_true_tag_ids) + all_predicted_tag_ids = np.concatenate(all_predicted_tag_ids) + + predicted_tags = [mapping[tag] for tag in all_predicted_tag_ids] + real_tags = [mapping[tag] for tag in all_true_tag_ids] + + evaluate(real_tags, predicted_tags) + + +calculate_metrics(val_dataset) + +""" +## Conclusions + +In this exercise, we created a simple transformer based named entity recognition model. +We trained it on the CoNLL 2003 shared task data and got an overall F1 score of around 70%. +State of the art NER models fine-tuned on pretrained models such as BERT or ELECTRA can easily +get much higher F1 score -between 90-95% on this dataset owing to the inherent knowledge +of words as part of the pretraining process and the usage of subword tokenization. 
+ +You can use the trained model hosted on [Hugging Face Hub](https://huggingface.co/keras-io/ner-with-transformers) +and try the demo on [Hugging Face Spaces](https://huggingface.co/spaces/keras-io/ner_with_transformers).""" diff --git a/knowledge_base/nlp/neural_machine_translation_with_keras_hub.py b/knowledge_base/nlp/neural_machine_translation_with_keras_hub.py new file mode 100644 index 0000000000000000000000000000000000000000..dcd260c5c42183041eaadb93cf57c9f2c797a995 --- /dev/null +++ b/knowledge_base/nlp/neural_machine_translation_with_keras_hub.py @@ -0,0 +1,510 @@ +""" +Title: English-to-Spanish translation with KerasHub +Author: [Abheesht Sharma](https://github.com/abheesht17/) +Date created: 2022/05/26 +Last modified: 2024/04/30 +Description: Use KerasHub to train a sequence-to-sequence Transformer model on the machine translation task. +Accelerator: GPU +""" + +""" +## Introduction + +KerasHub provides building blocks for NLP (model layers, tokenizers, metrics, etc.) and +makes it convenient to construct NLP pipelines. + +In this example, we'll use KerasHub layers to build an encoder-decoder Transformer +model, and train it on the English-to-Spanish machine translation task. + +This example is based on the +[English-to-Spanish NMT +example](https://keras.io/examples/nlp/neural_machine_translation_with_transformer/) +by [fchollet](https://twitter.com/fchollet). The original example is more low-level +and implements layers from scratch, whereas this example uses KerasHub to show +some more advanced approaches, such as subword tokenization and using metrics +to compute the quality of generated translations. + +You'll learn how to: + +- Tokenize text using `keras_hub.tokenizers.WordPieceTokenizer`. +- Implement a sequence-to-sequence Transformer model using KerasHub's +`keras_hub.layers.TransformerEncoder`, `keras_hub.layers.TransformerDecoder` and +`keras_hub.layers.TokenAndPositionEmbedding` layers, and train it. +- Use `keras_hub.samplers` to generate translations of unseen input sentences + using the top-p decoding strategy! + +Don't worry if you aren't familiar with KerasHub. This tutorial will start with +the basics. Let's dive right in! +""" + +""" +## Setup + +Before we start implementing the pipeline, let's import all the libraries we need. +""" + +"""shell +pip install -q --upgrade rouge-score +pip install -q --upgrade keras-hub +pip install -q --upgrade keras # Upgrade to Keras 3. +""" + +import keras_hub +import pathlib +import random + +import keras +from keras import ops + +import tensorflow.data as tf_data +from tensorflow_text.tools.wordpiece_vocab import ( + bert_vocab_from_dataset as bert_vocab, +) + +""" +Let's also define our parameters/hyperparameters. +""" + +BATCH_SIZE = 64 +EPOCHS = 1 # This should be at least 10 for convergence +MAX_SEQUENCE_LENGTH = 40 +ENG_VOCAB_SIZE = 15000 +SPA_VOCAB_SIZE = 15000 + +EMBED_DIM = 256 +INTERMEDIATE_DIM = 2048 +NUM_HEADS = 8 + +""" +## Downloading the data + +We'll be working with an English-to-Spanish translation dataset +provided by [Anki](https://www.manythings.org/anki/). Let's download it: +""" + +text_file = keras.utils.get_file( + fname="spa-eng.zip", + origin="http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip", + extract=True, +) +text_file = pathlib.Path(text_file).parent / "spa-eng" / "spa.txt" + +""" +## Parsing the data + +Each line contains an English sentence and its corresponding Spanish sentence. 
+The English sentence is the *source sequence* and Spanish one is the *target sequence*. +Before adding the text to a list, we convert it to lowercase. +""" + +with open(text_file) as f: + lines = f.read().split("\n")[:-1] +text_pairs = [] +for line in lines: + eng, spa = line.split("\t") + eng = eng.lower() + spa = spa.lower() + text_pairs.append((eng, spa)) + +""" +Here's what our sentence pairs look like: +""" + +for _ in range(5): + print(random.choice(text_pairs)) + +""" +Now, let's split the sentence pairs into a training set, a validation set, +and a test set. +""" + +random.shuffle(text_pairs) +num_val_samples = int(0.15 * len(text_pairs)) +num_train_samples = len(text_pairs) - 2 * num_val_samples +train_pairs = text_pairs[:num_train_samples] +val_pairs = text_pairs[num_train_samples : num_train_samples + num_val_samples] +test_pairs = text_pairs[num_train_samples + num_val_samples :] + +print(f"{len(text_pairs)} total pairs") +print(f"{len(train_pairs)} training pairs") +print(f"{len(val_pairs)} validation pairs") +print(f"{len(test_pairs)} test pairs") + + +""" +## Tokenizing the data + +We'll define two tokenizers - one for the source language (English), and the other +for the target language (Spanish). We'll be using +`keras_hub.tokenizers.WordPieceTokenizer` to tokenize the text. +`keras_hub.tokenizers.WordPieceTokenizer` takes a WordPiece vocabulary +and has functions for tokenizing the text, and detokenizing sequences of tokens. + +Before we define the two tokenizers, we first need to train them on the dataset +we have. The WordPiece tokenization algorithm is a subword tokenization algorithm; +training it on a corpus gives us a vocabulary of subwords. A subword tokenizer +is a compromise between word tokenizers (word tokenizers need very large +vocabularies for good coverage of input words), and character tokenizers +(characters don't really encode meaning like words do). Luckily, KerasHub +makes it very simple to train WordPiece on a corpus with the +`keras_hub.tokenizers.compute_word_piece_vocabulary` utility. +""" + + +def train_word_piece(text_samples, vocab_size, reserved_tokens): + word_piece_ds = tf_data.Dataset.from_tensor_slices(text_samples) + vocab = keras_hub.tokenizers.compute_word_piece_vocabulary( + word_piece_ds.batch(1000).prefetch(2), + vocabulary_size=vocab_size, + reserved_tokens=reserved_tokens, + ) + return vocab + + +""" +Every vocabulary has a few special, reserved tokens. We have four such tokens: + +- `"[PAD]"` - Padding token. Padding tokens are appended to the input sequence +length when the input sequence length is shorter than the maximum sequence length. +- `"[UNK]"` - Unknown token. +- `"[START]"` - Token that marks the start of the input sequence. +- `"[END]"` - Token that marks the end of the input sequence. +""" + +reserved_tokens = ["[PAD]", "[UNK]", "[START]", "[END]"] + +eng_samples = [text_pair[0] for text_pair in train_pairs] +eng_vocab = train_word_piece(eng_samples, ENG_VOCAB_SIZE, reserved_tokens) + +spa_samples = [text_pair[1] for text_pair in train_pairs] +spa_vocab = train_word_piece(spa_samples, SPA_VOCAB_SIZE, reserved_tokens) + +""" +Let's see some tokens! +""" + +print("English Tokens: ", eng_vocab[100:110]) +print("Spanish Tokens: ", spa_vocab[100:110]) + +""" +Now, let's define the tokenizers. We will configure the tokenizers with the +the vocabularies trained above. 
+""" + +eng_tokenizer = keras_hub.tokenizers.WordPieceTokenizer( + vocabulary=eng_vocab, lowercase=False +) +spa_tokenizer = keras_hub.tokenizers.WordPieceTokenizer( + vocabulary=spa_vocab, lowercase=False +) + +""" +Let's try and tokenize a sample from our dataset! To verify whether the text has +been tokenized correctly, we can also detokenize the list of tokens back to the +original text. +""" + +eng_input_ex = text_pairs[0][0] +eng_tokens_ex = eng_tokenizer.tokenize(eng_input_ex) +print("English sentence: ", eng_input_ex) +print("Tokens: ", eng_tokens_ex) +print( + "Recovered text after detokenizing: ", + eng_tokenizer.detokenize(eng_tokens_ex), +) + +print() + +spa_input_ex = text_pairs[0][1] +spa_tokens_ex = spa_tokenizer.tokenize(spa_input_ex) +print("Spanish sentence: ", spa_input_ex) +print("Tokens: ", spa_tokens_ex) +print( + "Recovered text after detokenizing: ", + spa_tokenizer.detokenize(spa_tokens_ex), +) + +""" +## Format datasets + +Next, we'll format our datasets. + +At each training step, the model will seek to predict target words N+1 (and beyond) +using the source sentence and the target words 0 to N. + +As such, the training dataset will yield a tuple `(inputs, targets)`, where: + +- `inputs` is a dictionary with the keys `encoder_inputs` and `decoder_inputs`. +`encoder_inputs` is the tokenized source sentence and `decoder_inputs` is the target +sentence "so far", +that is to say, the words 0 to N used to predict word N+1 (and beyond) in the target +sentence. +- `target` is the target sentence offset by one step: +it provides the next words in the target sentence -- what the model will try to predict. + +We will add special tokens, `"[START]"` and `"[END]"`, to the input Spanish +sentence after tokenizing the text. We will also pad the input to a fixed length. +This can be easily done using `keras_hub.layers.StartEndPacker`. +""" + + +def preprocess_batch(eng, spa): + batch_size = ops.shape(spa)[0] + + eng = eng_tokenizer(eng) + spa = spa_tokenizer(spa) + + # Pad `eng` to `MAX_SEQUENCE_LENGTH`. + eng_start_end_packer = keras_hub.layers.StartEndPacker( + sequence_length=MAX_SEQUENCE_LENGTH, + pad_value=eng_tokenizer.token_to_id("[PAD]"), + ) + eng = eng_start_end_packer(eng) + + # Add special tokens (`"[START]"` and `"[END]"`) to `spa` and pad it as well. 
+ spa_start_end_packer = keras_hub.layers.StartEndPacker( + sequence_length=MAX_SEQUENCE_LENGTH + 1, + start_value=spa_tokenizer.token_to_id("[START]"), + end_value=spa_tokenizer.token_to_id("[END]"), + pad_value=spa_tokenizer.token_to_id("[PAD]"), + ) + spa = spa_start_end_packer(spa) + + return ( + { + "encoder_inputs": eng, + "decoder_inputs": spa[:, :-1], + }, + spa[:, 1:], + ) + + +def make_dataset(pairs): + eng_texts, spa_texts = zip(*pairs) + eng_texts = list(eng_texts) + spa_texts = list(spa_texts) + dataset = tf_data.Dataset.from_tensor_slices((eng_texts, spa_texts)) + dataset = dataset.batch(BATCH_SIZE) + dataset = dataset.map(preprocess_batch, num_parallel_calls=tf_data.AUTOTUNE) + return dataset.shuffle(2048).prefetch(16).cache() + + +train_ds = make_dataset(train_pairs) +val_ds = make_dataset(val_pairs) + +""" +Let's take a quick look at the sequence shapes +(we have batches of 64 pairs, and all sequences are 40 steps long): +""" + +for inputs, targets in train_ds.take(1): + print(f'inputs["encoder_inputs"].shape: {inputs["encoder_inputs"].shape}') + print(f'inputs["decoder_inputs"].shape: {inputs["decoder_inputs"].shape}') + print(f"targets.shape: {targets.shape}") + + +""" +## Building the model + +Now, let's move on to the exciting part - defining our model! +We first need an embedding layer, i.e., a vector for every token in our input sequence. +This embedding layer can be initialised randomly. We also need a positional +embedding layer which encodes the word order in the sequence. The convention is +to add these two embeddings. KerasHub has a `keras_hub.layers.TokenAndPositionEmbedding ` +layer which does all of the above steps for us. + +Our sequence-to-sequence Transformer consists of a `keras_hub.layers.TransformerEncoder` +layer and a `keras_hub.layers.TransformerDecoder` layer chained together. + +The source sequence will be passed to `keras_hub.layers.TransformerEncoder`, which +will produce a new representation of it. This new representation will then be passed +to the `keras_hub.layers.TransformerDecoder`, together with the target sequence +so far (target words 0 to N). The `keras_hub.layers.TransformerDecoder` will +then seek to predict the next words in the target sequence (N+1 and beyond). + +A key detail that makes this possible is causal masking. +The `keras_hub.layers.TransformerDecoder` sees the entire sequence at once, and +thus we must make sure that it only uses information from target tokens 0 to N +when predicting token N+1 (otherwise, it could use information from the future, +which would result in a model that cannot be used at inference time). Causal masking +is enabled by default in `keras_hub.layers.TransformerDecoder`. + +We also need to mask the padding tokens (`"[PAD]"`). For this, we can set the +`mask_zero` argument of the `keras_hub.layers.TokenAndPositionEmbedding` layer +to True. This will then be propagated to all subsequent layers. 
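+To make the causal masking described above concrete, here is a tiny sketch
+(illustrative only; `keras_hub.layers.TransformerDecoder` builds this mask for
+you): for a 4-token target sequence the mask is lower-triangular, so position
+`i` can only attend to positions `0` to `i`.
+
+```python
+import numpy as np
+
+# Row i has ones only at columns j <= i, i.e. no attention to future tokens.
+print(np.tril(np.ones((4, 4), dtype="int32")))
+```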
+""" + +# Encoder +encoder_inputs = keras.Input(shape=(None,), name="encoder_inputs") + +x = keras_hub.layers.TokenAndPositionEmbedding( + vocabulary_size=ENG_VOCAB_SIZE, + sequence_length=MAX_SEQUENCE_LENGTH, + embedding_dim=EMBED_DIM, +)(encoder_inputs) + +encoder_outputs = keras_hub.layers.TransformerEncoder( + intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS +)(inputs=x) +encoder = keras.Model(encoder_inputs, encoder_outputs) + + +# Decoder +decoder_inputs = keras.Input(shape=(None,), name="decoder_inputs") +encoded_seq_inputs = keras.Input(shape=(None, EMBED_DIM), name="decoder_state_inputs") + +x = keras_hub.layers.TokenAndPositionEmbedding( + vocabulary_size=SPA_VOCAB_SIZE, + sequence_length=MAX_SEQUENCE_LENGTH, + embedding_dim=EMBED_DIM, +)(decoder_inputs) + +x = keras_hub.layers.TransformerDecoder( + intermediate_dim=INTERMEDIATE_DIM, num_heads=NUM_HEADS +)(decoder_sequence=x, encoder_sequence=encoded_seq_inputs) +x = keras.layers.Dropout(0.5)(x) +decoder_outputs = keras.layers.Dense(SPA_VOCAB_SIZE, activation="softmax")(x) +decoder = keras.Model( + [ + decoder_inputs, + encoded_seq_inputs, + ], + decoder_outputs, +) +decoder_outputs = decoder([decoder_inputs, encoder_outputs]) + +transformer = keras.Model( + [encoder_inputs, decoder_inputs], + decoder_outputs, + name="transformer", +) + +""" +## Training our model + +We'll use accuracy as a quick way to monitor training progress on the validation data. +Note that machine translation typically uses BLEU scores as well as other metrics, +rather than accuracy. However, in order to use metrics like ROUGE, BLEU, etc. we +will have decode the probabilities and generate the text. Text generation is +computationally expensive, and performing this during training is not recommended. + +Here we only train for 1 epoch, but to get the model to actually converge +you should train for at least 10 epochs. +""" + +transformer.summary() +transformer.compile( + "rmsprop", loss="sparse_categorical_crossentropy", metrics=["accuracy"] +) +transformer.fit(train_ds, epochs=EPOCHS, validation_data=val_ds) + +""" +## Decoding test sentences (qualitative analysis) + +Finally, let's demonstrate how to translate brand new English sentences. +We simply feed into the model the tokenized English sentence +as well as the target token `"[START]"`. The model outputs probabilities of the +next token. We then we repeatedly generated the next token conditioned on the +tokens generated so far, until we hit the token `"[END]"`. + +For decoding, we will use the `keras_hub.samplers` module from +KerasHub. Greedy Decoding is a text decoding method which outputs the most +likely next token at each time step, i.e., the token with the highest probability. +""" + + +def decode_sequences(input_sentences): + batch_size = 1 + + # Tokenize the encoder input. + encoder_input_tokens = ops.convert_to_tensor(eng_tokenizer(input_sentences)) + if len(encoder_input_tokens[0]) < MAX_SEQUENCE_LENGTH: + pads = ops.full((1, MAX_SEQUENCE_LENGTH - len(encoder_input_tokens[0])), 0) + encoder_input_tokens = ops.concatenate( + [encoder_input_tokens.to_tensor(), pads], 1 + ) + + # Define a function that outputs the next token's probability given the + # input sequence. + def next(prompt, cache, index): + logits = transformer([encoder_input_tokens, prompt])[:, index - 1, :] + # Ignore hidden states for now; only needed for contrastive search. + hidden_states = None + return logits, hidden_states, cache + + # Build a prompt of length 40 with a start token and padding tokens. 
+ length = 40 + start = ops.full((batch_size, 1), spa_tokenizer.token_to_id("[START]")) + pad = ops.full((batch_size, length - 1), spa_tokenizer.token_to_id("[PAD]")) + prompt = ops.concatenate((start, pad), axis=-1) + + generated_tokens = keras_hub.samplers.GreedySampler()( + next, + prompt, + stop_token_ids=[spa_tokenizer.token_to_id("[END]")], + index=1, # Start sampling after start token. + ) + generated_sentences = spa_tokenizer.detokenize(generated_tokens) + return generated_sentences + + +test_eng_texts = [pair[0] for pair in test_pairs] +for i in range(2): + input_sentence = random.choice(test_eng_texts) + translated = decode_sequences([input_sentence]) + translated = translated.numpy()[0].decode("utf-8") + translated = ( + translated.replace("[PAD]", "") + .replace("[START]", "") + .replace("[END]", "") + .strip() + ) + print(f"** Example {i} **") + print(input_sentence) + print(translated) + print() + +""" +## Evaluating our model (quantitative analysis) + +There are many metrics which are used for text generation tasks. Here, to +evaluate translations generated by our model, let's compute the ROUGE-1 and +ROUGE-2 scores. Essentially, ROUGE-N is a score based on the number of common +n-grams between the reference text and the generated text. ROUGE-1 and ROUGE-2 +use the number of common unigrams and bigrams, respectively. + +We will calculate the score over 30 test samples (since decoding is an +expensive process). +""" + +rouge_1 = keras_hub.metrics.RougeN(order=1) +rouge_2 = keras_hub.metrics.RougeN(order=2) + +for test_pair in test_pairs[:30]: + input_sentence = test_pair[0] + reference_sentence = test_pair[1] + + translated_sentence = decode_sequences([input_sentence]) + translated_sentence = translated_sentence.numpy()[0].decode("utf-8") + translated_sentence = ( + translated_sentence.replace("[PAD]", "") + .replace("[START]", "") + .replace("[END]", "") + .strip() + ) + + rouge_1(reference_sentence, translated_sentence) + rouge_2(reference_sentence, translated_sentence) + +print("ROUGE-1 Score: ", rouge_1.result()) +print("ROUGE-2 Score: ", rouge_2.result()) + +""" +After 10 epochs, the scores are as follows: + +| | **ROUGE-1** | **ROUGE-2** | +|:-------------:|:-----------:|:-----------:| +| **Precision** | 0.568 | 0.374 | +| **Recall** | 0.615 | 0.394 | +| **F1 Score** | 0.579 | 0.381 | +""" diff --git a/knowledge_base/nlp/neural_machine_translation_with_transformer.py b/knowledge_base/nlp/neural_machine_translation_with_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..54558533a61aa4f287e83ce5047f58b3e5a36b4c --- /dev/null +++ b/knowledge_base/nlp/neural_machine_translation_with_transformer.py @@ -0,0 +1,507 @@ +""" +Title: English-to-Spanish translation with a sequence-to-sequence Transformer +Author: [fchollet](https://twitter.com/fchollet) +Date created: 2021/05/26 +Last modified: 2024/11/18 +Description: Implementing a sequence-to-sequence Transformer and training it on a machine translation task. +Accelerator: GPU +""" + +""" +## Introduction + +In this example, we'll build a sequence-to-sequence Transformer model, which +we'll train on an English-to-Spanish machine translation task. + +You'll learn how to: + +- Vectorize text using the Keras `TextVectorization` layer. +- Implement a `TransformerEncoder` layer, a `TransformerDecoder` layer, +and a `PositionalEmbedding` layer. +- Prepare data for training a sequence-to-sequence model. 
+- Use the trained model to generate translations of never-seen-before +input sentences (sequence-to-sequence inference). + +The code featured here is adapted from the book +[Deep Learning with Python, Second Edition](https://www.manning.com/books/deep-learning-with-python-second-edition) +(chapter 11: Deep learning for text). +The present example is fairly barebones, so for detailed explanations of +how each building block works, as well as the theory behind Transformers, +I recommend reading the book. +""" +""" +## Setup +""" + +# We set the backend to TensorFlow. The code works with +# both `tensorflow` and `torch`. It does not work with JAX +# due to the behavior of `jax.numpy.tile` in a jit scope +# (used in `TransformerDecoder.get_causal_attention_mask()`: +# `tile` in JAX does not support a dynamic `reps` argument. +# You can make the code work in JAX by wrapping the +# inside of the `get_causal_attention_mask` method in +# a decorator to prevent jit compilation: +# `with jax.ensure_compile_time_eval():`. +import os + +os.environ["KERAS_BACKEND"] = "tensorflow" + +import pathlib +import random +import string +import re +import numpy as np + +import tensorflow.data as tf_data +import tensorflow.strings as tf_strings + +import keras +from keras import layers +from keras import ops +from keras.layers import TextVectorization + +""" +## Downloading the data + +We'll be working with an English-to-Spanish translation dataset +provided by [Anki](https://www.manythings.org/anki/). Let's download it: +""" + +text_file = keras.utils.get_file( + fname="spa-eng.zip", + origin="http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip", + extract=True, +) +text_file = pathlib.Path(text_file).parent / "spa-eng" / "spa.txt" + +""" +## Parsing the data + +Each line contains an English sentence and its corresponding Spanish sentence. +The English sentence is the *source sequence* and Spanish one is the *target sequence*. +We prepend the token `"[start]"` and we append the token `"[end]"` to the Spanish sentence. +""" + +with open(text_file) as f: + lines = f.read().split("\n")[:-1] +text_pairs = [] +for line in lines: + eng, spa = line.split("\t") + spa = "[start] " + spa + " [end]" + text_pairs.append((eng, spa)) + +""" +Here's what our sentence pairs look like: +""" + +for _ in range(5): + print(random.choice(text_pairs)) + +""" +Now, let's split the sentence pairs into a training set, a validation set, +and a test set. +""" + +random.shuffle(text_pairs) +num_val_samples = int(0.15 * len(text_pairs)) +num_train_samples = len(text_pairs) - 2 * num_val_samples +train_pairs = text_pairs[:num_train_samples] +val_pairs = text_pairs[num_train_samples : num_train_samples + num_val_samples] +test_pairs = text_pairs[num_train_samples + num_val_samples :] + +print(f"{len(text_pairs)} total pairs") +print(f"{len(train_pairs)} training pairs") +print(f"{len(val_pairs)} validation pairs") +print(f"{len(test_pairs)} test pairs") + +""" +## Vectorizing the text data + +We'll use two instances of the `TextVectorization` layer to vectorize the text +data (one for English and one for Spanish), +that is to say, to turn the original strings into integer sequences +where each integer represents the index of a word in a vocabulary. 
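+
+As a quick illustration (a toy vocabulary, not the one used below), this is the
+kind of string-to-integers mapping a `TextVectorization` layer learns once it
+has been adapted to a corpus:
+
+```python
+from keras.layers import TextVectorization
+
+toy_vectorizer = TextVectorization(output_mode="int")
+toy_vectorizer.adapt(["i love you", "you love me"])
+# Prints a batch of integer token indices, e.g. [[4 2 3]]; the exact indices
+# depend on the adapted vocabulary.
+print(toy_vectorizer(["i love you"]))
+```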
+ +The English layer will use the default string standardization (strip punctuation characters) +and splitting scheme (split on whitespace), while +the Spanish layer will use a custom standardization, where we add the character +`"ยฟ"` to the set of punctuation characters to be stripped. + +Note: in a production-grade machine translation model, I would not recommend +stripping the punctuation characters in either language. Instead, I would recommend turning +each punctuation character into its own token, +which you could achieve by providing a custom `split` function to the `TextVectorization` layer. +""" + +strip_chars = string.punctuation + "ยฟ" +strip_chars = strip_chars.replace("[", "") +strip_chars = strip_chars.replace("]", "") + +vocab_size = 15000 +sequence_length = 20 +batch_size = 64 + + +def custom_standardization(input_string): + lowercase = tf_strings.lower(input_string) + return tf_strings.regex_replace(lowercase, "[%s]" % re.escape(strip_chars), "") + + +eng_vectorization = TextVectorization( + max_tokens=vocab_size, + output_mode="int", + output_sequence_length=sequence_length, +) +spa_vectorization = TextVectorization( + max_tokens=vocab_size, + output_mode="int", + output_sequence_length=sequence_length + 1, + standardize=custom_standardization, +) +train_eng_texts = [pair[0] for pair in train_pairs] +train_spa_texts = [pair[1] for pair in train_pairs] +eng_vectorization.adapt(train_eng_texts) +spa_vectorization.adapt(train_spa_texts) + +""" +Next, we'll format our datasets. + +At each training step, the model will seek to predict target words N+1 (and beyond) +using the source sentence and the target words 0 to N. + +As such, the training dataset will yield a tuple `(inputs, targets)`, where: + +- `inputs` is a dictionary with the keys `encoder_inputs` and `decoder_inputs`. +`encoder_inputs` is the vectorized source sentence and `decoder_inputs` is the target sentence "so far", +that is to say, the words 0 to N used to predict word N+1 (and beyond) in the target sentence. +- `target` is the target sentence offset by one step: +it provides the next words in the target sentence -- what the model will try to predict. +""" + + +def format_dataset(eng, spa): + eng = eng_vectorization(eng) + spa = spa_vectorization(spa) + return ( + { + "encoder_inputs": eng, + "decoder_inputs": spa[:, :-1], + }, + spa[:, 1:], + ) + + +def make_dataset(pairs): + eng_texts, spa_texts = zip(*pairs) + eng_texts = list(eng_texts) + spa_texts = list(spa_texts) + dataset = tf_data.Dataset.from_tensor_slices((eng_texts, spa_texts)) + dataset = dataset.batch(batch_size) + dataset = dataset.map(format_dataset) + return dataset.cache().shuffle(2048).prefetch(16) + + +train_ds = make_dataset(train_pairs) +val_ds = make_dataset(val_pairs) + +""" +Let's take a quick look at the sequence shapes +(we have batches of 64 pairs, and all sequences are 20 steps long): +""" + +for inputs, targets in train_ds.take(1): + print(f'inputs["encoder_inputs"].shape: {inputs["encoder_inputs"].shape}') + print(f'inputs["decoder_inputs"].shape: {inputs["decoder_inputs"].shape}') + print(f"targets.shape: {targets.shape}") + +""" +## Building the model + +Our sequence-to-sequence Transformer consists of a `TransformerEncoder` +and a `TransformerDecoder` chained together. To make the model aware of word order, +we also use a `PositionalEmbedding` layer. + +The source sequence will be pass to the `TransformerEncoder`, +which will produce a new representation of it. 
+This new representation will then be passed +to the `TransformerDecoder`, together with the target sequence so far (target words 0 to N). +The `TransformerDecoder` will then seek to predict the next words in the target sequence (N+1 and beyond). + +A key detail that makes this possible is causal masking +(see method `get_causal_attention_mask()` on the `TransformerDecoder`). +The `TransformerDecoder` sees the entire sequences at once, and thus we must make +sure that it only uses information from target tokens 0 to N when predicting token N+1 +(otherwise, it could use information from the future, which would +result in a model that cannot be used at inference time). +""" +import keras.ops as ops + + +class TransformerEncoder(layers.Layer): + def __init__(self, embed_dim, dense_dim, num_heads, **kwargs): + super().__init__(**kwargs) + self.embed_dim = embed_dim + self.dense_dim = dense_dim + self.num_heads = num_heads + self.attention = layers.MultiHeadAttention( + num_heads=num_heads, key_dim=embed_dim + ) + self.dense_proj = keras.Sequential( + [ + layers.Dense(dense_dim, activation="relu"), + layers.Dense(embed_dim), + ] + ) + self.layernorm_1 = layers.LayerNormalization() + self.layernorm_2 = layers.LayerNormalization() + self.supports_masking = True + + def call(self, inputs, mask=None): + if mask is not None: + padding_mask = ops.cast(mask[:, None, :], dtype="int32") + else: + padding_mask = None + + attention_output = self.attention( + query=inputs, value=inputs, key=inputs, attention_mask=padding_mask + ) + proj_input = self.layernorm_1(inputs + attention_output) + proj_output = self.dense_proj(proj_input) + return self.layernorm_2(proj_input + proj_output) + + def get_config(self): + config = super().get_config() + config.update( + { + "embed_dim": self.embed_dim, + "dense_dim": self.dense_dim, + "num_heads": self.num_heads, + } + ) + return config + + +class PositionalEmbedding(layers.Layer): + def __init__(self, sequence_length, vocab_size, embed_dim, **kwargs): + super().__init__(**kwargs) + self.token_embeddings = layers.Embedding( + input_dim=vocab_size, output_dim=embed_dim + ) + self.position_embeddings = layers.Embedding( + input_dim=sequence_length, output_dim=embed_dim + ) + self.sequence_length = sequence_length + self.vocab_size = vocab_size + self.embed_dim = embed_dim + + def call(self, inputs): + length = ops.shape(inputs)[-1] + positions = ops.arange(0, length, 1) + embedded_tokens = self.token_embeddings(inputs) + embedded_positions = self.position_embeddings(positions) + return embedded_tokens + embedded_positions + + def compute_mask(self, inputs, mask=None): + return ops.not_equal(inputs, 0) + + def get_config(self): + config = super().get_config() + config.update( + { + "sequence_length": self.sequence_length, + "vocab_size": self.vocab_size, + "embed_dim": self.embed_dim, + } + ) + return config + + +class TransformerDecoder(layers.Layer): + def __init__(self, embed_dim, latent_dim, num_heads, **kwargs): + super().__init__(**kwargs) + self.embed_dim = embed_dim + self.latent_dim = latent_dim + self.num_heads = num_heads + self.attention_1 = layers.MultiHeadAttention( + num_heads=num_heads, key_dim=embed_dim + ) + self.attention_2 = layers.MultiHeadAttention( + num_heads=num_heads, key_dim=embed_dim + ) + self.dense_proj = keras.Sequential( + [ + layers.Dense(latent_dim, activation="relu"), + layers.Dense(embed_dim), + ] + ) + self.layernorm_1 = layers.LayerNormalization() + self.layernorm_2 = layers.LayerNormalization() + self.layernorm_3 = 
layers.LayerNormalization() + self.supports_masking = True + + def call(self, inputs, mask=None): + inputs, encoder_outputs = inputs + causal_mask = self.get_causal_attention_mask(inputs) + + if mask is None: + inputs_padding_mask, encoder_outputs_padding_mask = None, None + else: + inputs_padding_mask, encoder_outputs_padding_mask = mask + + attention_output_1 = self.attention_1( + query=inputs, + value=inputs, + key=inputs, + attention_mask=causal_mask, + query_mask=inputs_padding_mask, + ) + out_1 = self.layernorm_1(inputs + attention_output_1) + + attention_output_2 = self.attention_2( + query=out_1, + value=encoder_outputs, + key=encoder_outputs, + query_mask=inputs_padding_mask, + key_mask=encoder_outputs_padding_mask, + ) + out_2 = self.layernorm_2(out_1 + attention_output_2) + + proj_output = self.dense_proj(out_2) + return self.layernorm_3(out_2 + proj_output) + + def get_causal_attention_mask(self, inputs): + input_shape = ops.shape(inputs) + batch_size, sequence_length = input_shape[0], input_shape[1] + i = ops.arange(sequence_length)[:, None] + j = ops.arange(sequence_length) + mask = ops.cast(i >= j, dtype="int32") + mask = ops.reshape(mask, (1, input_shape[1], input_shape[1])) + mult = ops.concatenate( + [ops.expand_dims(batch_size, -1), ops.convert_to_tensor([1, 1])], + axis=0, + ) + return ops.tile(mask, mult) + + def get_config(self): + config = super().get_config() + config.update( + { + "embed_dim": self.embed_dim, + "latent_dim": self.latent_dim, + "num_heads": self.num_heads, + } + ) + return config + + +""" +Next, we assemble the end-to-end model. +""" + +embed_dim = 256 +latent_dim = 2048 +num_heads = 8 + +encoder_inputs = keras.Input(shape=(None,), dtype="int64", name="encoder_inputs") +x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(encoder_inputs) +encoder_outputs = TransformerEncoder(embed_dim, latent_dim, num_heads)(x) +encoder = keras.Model(encoder_inputs, encoder_outputs) + +decoder_inputs = keras.Input(shape=(None,), dtype="int64", name="decoder_inputs") +encoded_seq_inputs = keras.Input(shape=(None, embed_dim), name="decoder_state_inputs") +x = PositionalEmbedding(sequence_length, vocab_size, embed_dim)(decoder_inputs) +x = TransformerDecoder(embed_dim, latent_dim, num_heads)([x, encoder_outputs]) +x = layers.Dropout(0.5)(x) +decoder_outputs = layers.Dense(vocab_size, activation="softmax")(x) +decoder = keras.Model([decoder_inputs, encoded_seq_inputs], decoder_outputs) + +transformer = keras.Model( + {"encoder_inputs": encoder_inputs, "decoder_inputs": decoder_inputs}, + decoder_outputs, + name="transformer", +) + +""" +## Training our model + +We'll use accuracy as a quick way to monitor training progress on the validation data. +Note that machine translation typically uses BLEU scores as well as other metrics, rather than accuracy. + +Here we only train for 1 epoch, but to get the model to actually converge +you should train for at least 30 epochs. +""" + +epochs = 1 # This should be at least 30 for convergence + +transformer.summary() +transformer.compile( + "rmsprop", + loss=keras.losses.SparseCategoricalCrossentropy(ignore_class=0), + metrics=["accuracy"], +) +transformer.fit(train_ds, epochs=epochs, validation_data=val_ds) + +""" +## Decoding test sentences + +Finally, let's demonstrate how to translate brand new English sentences. +We simply feed into the model the vectorized English sentence +as well as the target token `"[start]"`, then we repeatedly generated the next token, until +we hit the token `"[end]"`. 
+""" + +spa_vocab = spa_vectorization.get_vocabulary() +spa_index_lookup = dict(zip(range(len(spa_vocab)), spa_vocab)) +max_decoded_sentence_length = 20 + + +def decode_sequence(input_sentence): + tokenized_input_sentence = eng_vectorization([input_sentence]) + decoded_sentence = "[start]" + for i in range(max_decoded_sentence_length): + tokenized_target_sentence = spa_vectorization([decoded_sentence])[:, :-1] + predictions = transformer( + { + "encoder_inputs": tokenized_input_sentence, + "decoder_inputs": tokenized_target_sentence, + } + ) + + # ops.argmax(predictions[0, i, :]) is not a concrete value for jax here + sampled_token_index = ops.convert_to_numpy( + ops.argmax(predictions[0, i, :]) + ).item(0) + sampled_token = spa_index_lookup[sampled_token_index] + decoded_sentence += " " + sampled_token + + if sampled_token == "[end]": + break + return decoded_sentence + + +test_eng_texts = [pair[0] for pair in test_pairs] +for _ in range(30): + input_sentence = random.choice(test_eng_texts) + translated = decode_sequence(input_sentence) + +""" +After 30 epochs, we get results such as: + +> She handed him the money. +> [start] ella le pasรณ el dinero [end] + +> Tom has never heard Mary sing. +> [start] tom nunca ha oรญdo cantar a mary [end] + +> Perhaps she will come tomorrow. +> [start] tal vez ella vendrรก maรฑana [end] + +> I love to write. +> [start] me encanta escribir [end] + +> His French is improving little by little. +> [start] su francรฉs va a [UNK] sรณlo un poco [end] + +> My hotel told me to call you. +> [start] mi hotel me dijo que te [UNK] [end] +""" diff --git a/knowledge_base/nlp/parameter_efficient_finetuning_of_gpt2_with_lora.py b/knowledge_base/nlp/parameter_efficient_finetuning_of_gpt2_with_lora.py new file mode 100644 index 0000000000000000000000000000000000000000..830cdc09c65b7b3fc37db883387f409dc3fdad86 --- /dev/null +++ b/knowledge_base/nlp/parameter_efficient_finetuning_of_gpt2_with_lora.py @@ -0,0 +1,607 @@ +""" +Title: Parameter-efficient fine-tuning of GPT-2 with LoRA +Author: [Abheesht Sharma](https://github.com/abheesht17/), [Matthew Watson](https://github.com/mattdangerw/) +Date created: 2023/05/27 +Last modified: 2023/05/27 +Description: Use KerasHub to fine-tune a GPT-2 LLM with LoRA. +Accelerator: GPU +""" + +""" +## Introduction + +Large Language Models (LLMs) have been shown to be effective at a variety of NLP +tasks. An LLM is first pre-trained on a large corpus of text in a +self-supervised fashion. Pre-training helps LLMs learn general-purpose knowledge, +such as statistical relationships between words. An LLM can then be fine-tuned +on a downstream task of interest (such as sentiment analysis). + +However, LLMs are extremely large in size, and we don't need to train all the +parameters in the model while fine-tuning, especially because datasets on which +the model is fine-tuned are relatively small. Another way of saying this is +that LLMs are over-parametrized for fine-tuning. This is where +[Low-Rank Adaptation (LoRA)](https://arxiv.org/abs/2106.09685) comes in; it +significantly reduces the number of trainable parameters. This results in a +decrease in training time and GPU memory usage, while maintaining the quality +of the outputs. + +In this example, we will explain LoRA in technical terms, show how the technical +explanation translates to code, hack KerasHub's +[GPT-2 model](https://keras.io/api/keras_hub/models/gpt2/) and fine-tune +it on the next token prediction task using LoRA. 
We will compare LoRA GPT-2 +with a fully fine-tuned GPT-2 in terms of the quality of the generated text, +training time and GPU memory usage. + +Note: This example runs on the TensorFlow backend purely for the +`tf.config.experimental.get_memory_info` API to easily plot memory usage. +Outside of the memory usage callback, this example will run on `jax` and `torch` +backends. +""" + +""" +## Setup + +Before we start implementing the pipeline, let's install and import all the +libraries we need. We'll be using the KerasHub library. + +Secondly, let's enable mixed precision training. This will help us reduce the +training time. +""" + +"""shell +pip install -q --upgrade keras-hub +pip install -q --upgrade keras # Upgrade to Keras 3. +""" + +import os + +os.environ["KERAS_BACKEND"] = "tensorflow" + +import keras_hub +import keras +import matplotlib.pyplot as plt +import tensorflow as tf +import tensorflow_datasets as tfds +import time + +keras.mixed_precision.set_global_policy("mixed_float16") + +""" +Let's also define our hyperparameters. +""" + +# General hyperparameters +BATCH_SIZE = 32 +NUM_BATCHES = 500 +EPOCHS = 1 # Can be set to a higher value for better results +MAX_SEQUENCE_LENGTH = 128 +MAX_GENERATION_LENGTH = 200 + +GPT2_PRESET = "gpt2_base_en" + +# LoRA-specific hyperparameters +RANK = 4 +ALPHA = 32.0 + + +""" +## Dataset + +Let's load a Reddit dataset. We will fine-tune both the GPT-2 model and the +LoRA GPT-2 model on a subset of this dataset. The aim is to produce text similar +in style to Reddit posts. +""" + +reddit_ds = tfds.load("reddit_tifu", split="train", as_supervised=True) + +""" +The dataset has two fields: `document` and `title`. +""" + +for document, title in reddit_ds: + print(document.numpy()) + print(title.numpy()) + break + +""" +We'll now batch the dataset and retain only the `document` field because we are +fine-tuning the model on the next word prediction task. Take a subset +of the dataset for the purpose of this example. +""" + +train_ds = ( + reddit_ds.map(lambda document, _: document) + .batch(BATCH_SIZE) + .cache() + .prefetch(tf.data.AUTOTUNE) +) +train_ds = train_ds.take(NUM_BATCHES) + +""" +## Helper functions + +Before we begin fine-tuning the models, let's define a few helper functions and +classes. +""" + +""" +### Callback for tracking GPU memory usage + +We'll define a custom callback function which tracks GPU memory usage. The +callback function uses TensorFlow's `tf.config.experimental.get_memory_info` +API. + +Here, we assume that we are using a single GPU, `GPU:0`. +""" + + +class GPUMemoryCallback(keras.callbacks.Callback): + def __init__( + self, + target_batches, + print_stats=False, + **kwargs, + ): + super().__init__(**kwargs) + self.target_batches = target_batches + self.print_stats = print_stats + + self.memory_usage = [] + self.labels = [] + + def _compute_memory_usage(self): + memory_stats = tf.config.experimental.get_memory_info("GPU:0") + # Convert bytes to GB and store in list. 
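+        # `get_memory_info` returns a dict with "current" and "peak" usage, in bytes.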
+ peak_usage = round(memory_stats["peak"] / (2**30), 3) + self.memory_usage.append(peak_usage) + + def on_epoch_begin(self, epoch, logs=None): + self._compute_memory_usage() + self.labels.append(f"epoch {epoch} start") + + def on_train_batch_begin(self, batch, logs=None): + if batch in self.target_batches: + self._compute_memory_usage() + self.labels.append(f"batch {batch}") + + def on_epoch_end(self, epoch, logs=None): + self._compute_memory_usage() + self.labels.append(f"epoch {epoch} end") + + +""" +### Function for text generation + +Here is a helper function to generate text. +""" + + +def generate_text(model, input_text, max_length=200): + start = time.time() + + output = model.generate(input_text, max_length=max_length) + print("\nOutput:") + print(output) + + end = time.time() + print(f"Total Time Elapsed: {end - start:.2f}s") + + +""" +### Define optimizer and loss + +We will use AdamW optimizer and cross-entropy loss for training both models. +""" + + +def get_optimizer_and_loss(): + optimizer = keras.optimizers.AdamW( + learning_rate=5e-5, + weight_decay=0.01, + epsilon=1e-6, + global_clipnorm=1.0, # Gradient clipping. + ) + # Exclude layernorm and bias terms from weight decay. + optimizer.exclude_from_weight_decay(var_names=["bias"]) + optimizer.exclude_from_weight_decay(var_names=["gamma"]) + optimizer.exclude_from_weight_decay(var_names=["beta"]) + + loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True) + return optimizer, loss + + +""" +## Fine-tune GPT-2 + +Let's load the model and preprocessor first. We use a sequence length of 128 +instead of 1024 (which is the default sequence length). This will limit our +ability to predict long sequences, but will allow us to run this example quickly +on Colab. +""" + +preprocessor = keras_hub.models.GPT2CausalLMPreprocessor.from_preset( + "gpt2_base_en", + sequence_length=MAX_SEQUENCE_LENGTH, +) +gpt2_lm = keras_hub.models.GPT2CausalLM.from_preset( + "gpt2_base_en", preprocessor=preprocessor +) + +gpt2_lm.summary() + +""" +Initialize the GPU memory tracker callback object, and compile the model. We +use the Adam optimizer with a linearly decaying learning rate. +""" + +gpu_memory_callback = GPUMemoryCallback( + target_batches=[5, 10, 25, 50, 100, 150, 200, 300, 400, 500], + print_stats=True, +) + +optimizer, loss = get_optimizer_and_loss() + +gpt2_lm.compile( + optimizer=optimizer, + loss=loss, + weighted_metrics=["accuracy"], +) + +""" +We are all set to train the model! +""" + +gpt2_lm.fit(train_ds, epochs=EPOCHS, callbacks=[gpu_memory_callback]) +gpt2_lm_memory_usage = gpu_memory_callback.memory_usage + +""" +As a final step, let's generate some text. We will harness the power of XLA. The +first call to `generate()` will be slow because of XLA compilation, but +subsequent calls will be super-fast. :) +""" + +generate_text(gpt2_lm, "I like basketball", max_length=MAX_GENERATION_LENGTH) +generate_text(gpt2_lm, "That Italian restaurant is", max_length=MAX_GENERATION_LENGTH) + +""" +## LoRA GPT-2 + +In this section, we discuss the technical details of LoRA, build a LoRA GPT-2 +model, fine-tune it and generate text. + +### What exactly is LoRA? + +LoRA is a parameter-efficient fine-tuning technique for LLMs. It freezes the +weights of the LLM, and injects trainable rank-decomposition matrices. Let's +understand this more clearly. + +Assume we have an `n x n` pre-trained dense layer (or weight matrix), `W0`. We +initialize two dense layers, `A` and `B`, of shapes `n x rank`, and `rank x n`, +respectively. 
`rank` is much smaller than `n`. In the paper, values between 1 +and 4 are shown to work well. + + +#### LoRA equation + +The original equation is `output = W0x + b0`, where `x` is the input, `W0` and +`b0` are the weight matrix and bias terms of the original dense layer (frozen). +The LoRA equation is: `output = W0x + b0 + BAx`, where `A` and `B` are the +rank-decomposition matrices. + +LoRA is based on the idea that updates to the weights of the pre-trained +language model have a low "intrinsic rank" since pre-trained language models are +over-parametrized. Predictive performance of full fine-tuning can be replicated +even by constraining `W0`'s updates to low-rank decomposition matrices. + +
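+
+To make this concrete, here is a minimal NumPy sketch (illustrative only, not
+part of the model) of the low-rank update, using the same shapes as later in
+this example (`n = 768`, `rank = 4`, `alpha = 32`):
+
+```python
+import numpy as np
+
+n, rank, alpha = 768, 4, 32.0
+x = np.random.randn(1, n)     # one input vector of width n
+
+W0 = np.random.randn(n, n)    # frozen pre-trained kernel
+b0 = np.zeros(n)              # frozen bias
+A = np.random.randn(n, rank)  # trainable, maps n -> rank
+B = np.zeros((rank, n))       # trainable, maps rank -> n (initialized to zeros)
+
+frozen_output = x @ W0 + b0
+lora_output = frozen_output + (x @ A) @ B * (alpha / rank)
+
+# With B initialized to zeros, the adapted layer starts out computing exactly
+# the same function as the frozen layer.
+assert np.allclose(frozen_output, lora_output)
+print(W0.size, A.size + B.size)  # 589824 frozen vs. 6144 trainable parameters
+```
+
+The two parameter counts printed at the end are worked out in more detail below.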

+*(figure: lora_diagram)*
+
+ +#### Number of trainable parameters + +Let's do some quick math. Suppose `n` is 768, and `rank` is 4. `W0` has +`768 x 768 = 589,824` parameters, whereas the LoRA layers, `A` and `B` together +have `768 x 4 + 4 x 768 = 6,144` parameters. So, for the dense layer, we go from +`589,824` trainable parameters to `6,144` trainable parameters! + +#### Why does LoRA reduce memory footprint? + +Even though the total number of parameters increase (since we are adding LoRA +layers), the memory footprint reduces, because the number of trainable +parameters reduces. Let's dive deeper into this. + +The memory usage of a model can be split into four parts: + +- Model memory: This is the memory required to store the model weights. This +will be slightly higher for LoRA than GPT-2. +- Forward pass memory: This mostly depends on batch size, sequence length, etc. +We keep this constant for both models for a fair comparison. +- Backward pass memory: This is the memory required to store the gradients. +Note that the gradients are computed only for the trainable parameters. +- Optimizer memory: This is the memory required to store the optimizer state. +For example, the Adam optimizer stores the "1st moment vectors" and +"2nd moment vectors" for the trainable parameters. + +Since, with LoRA, there is a huge reduction in the number of trainable +parameters, the optimizer memory and the memory required to store the gradients +for LoRA is much less than GPT-2. This is where most of the memory savings +happen. + +#### Why is LoRA so popular? + +- Reduces GPU memory usage; +- Faster training; and +- No additional inference latency. + +### Create LoRA layer + +According to the technical description above, let's create a LoRA layer. In +a transformer model, the LoRA layer is created and injected for the query and +value projection matrices. In `keras.layers.MultiHeadAttention`, the query/value +projection layers are `keras.layers.EinsumDense` layers. +""" + +import math + + +class LoraLayer(keras.layers.Layer): + def __init__( + self, + original_layer, + rank=8, + alpha=32, + trainable=False, + **kwargs, + ): + # We want to keep the name of this layer the same as the original + # dense layer. + original_layer_config = original_layer.get_config() + name = original_layer_config["name"] + + kwargs.pop("name", None) + + super().__init__(name=name, trainable=trainable, **kwargs) + + self.rank = rank + self.alpha = alpha + + self._scale = alpha / rank + + self._num_heads = original_layer_config["output_shape"][-2] + self._hidden_dim = self._num_heads * original_layer_config["output_shape"][-1] + + # Layers. + + # Original dense layer. + self.original_layer = original_layer + # No matter whether we are training the model or are in inference mode, + # this layer should be frozen. + self.original_layer.trainable = False + + # LoRA dense layers. + self.A = keras.layers.Dense( + units=rank, + use_bias=False, + # Note: the original paper mentions that normal distribution was + # used for initialization. However, the official LoRA implementation + # uses "Kaiming/He Initialization". + kernel_initializer=keras.initializers.VarianceScaling( + scale=math.sqrt(5), mode="fan_in", distribution="uniform" + ), + trainable=trainable, + name=f"lora_A", + ) + # B has the same `equation` and `output_shape` as the original layer. + # `equation = abc,cde->abde`, where `a`: batch size, `b`: sequence + # length, `c`: `hidden_dim`, `d`: `num_heads`, + # `e`: `hidden_dim//num_heads`. 
The only difference is that in layer `B`, + # `c` represents `rank`. + self.B = keras.layers.EinsumDense( + equation=original_layer_config["equation"], + output_shape=original_layer_config["output_shape"], + kernel_initializer="zeros", + trainable=trainable, + name=f"lora_B", + ) + + def call(self, inputs): + original_output = self.original_layer(inputs) + if self.trainable: + # If we are fine-tuning the model, we will add LoRA layers' output + # to the original layer's output. + lora_output = self.B(self.A(inputs)) * self._scale + return original_output + lora_output + + # If we are in inference mode, we "merge" the LoRA layers' weights into + # the original layer's weights - more on this in the text generation + # section! + return original_output + + +""" +### Inject LoRA layer into the model + +We will now hack the original GPT-2 model and inject LoRA layers into it. Let's +do a couple of things before doing that: + +- Delete previous model; +- Reset "peak" GPU memory usage using `tf.config.experimental.reset_memory_stats`; +- Load a new GPT-2 model. +""" + +del gpt2_lm +del optimizer +del loss + +# This resets "peak" memory usage to "current" memory usage. +tf.config.experimental.reset_memory_stats("GPU:0") + +# Load the original model. +preprocessor = keras_hub.models.GPT2CausalLMPreprocessor.from_preset( + "gpt2_base_en", + sequence_length=128, +) +lora_model = keras_hub.models.GPT2CausalLM.from_preset( + "gpt2_base_en", + preprocessor=preprocessor, +) + +""" +We will now override the original query/value projection matrices with our +new LoRA layers. +""" + +for layer_idx in range(lora_model.backbone.num_layers): + # Change query dense layer. + decoder_layer = lora_model.backbone.get_layer(f"transformer_layer_{layer_idx}") + self_attention_layer = decoder_layer._self_attention_layer + # Allow mutation to Keras layer state. + self_attention_layer._tracker.locked = False + + # Change query dense layer. + self_attention_layer._query_dense = LoraLayer( + self_attention_layer._query_dense, + rank=RANK, + alpha=ALPHA, + trainable=True, + ) + + # Change value dense layer. + self_attention_layer._value_dense = LoraLayer( + self_attention_layer._value_dense, + rank=RANK, + alpha=ALPHA, + trainable=True, + ) + +""" +Let's now do a forward pass to make sure we still have a valid chain of +computation. +""" + +lora_model(preprocessor(["LoRA is very useful for quick LLM finetuning"])[0]) +pass + +""" +Freeze the entire LLM, only the LoRA layers should be trainable. +""" + +for layer in lora_model._flatten_layers(): + lst_of_sublayers = list(layer._flatten_layers()) + + if len(lst_of_sublayers) == 1: # "leaves of the model" + if layer.name in ["lora_A", "lora_B"]: + layer.trainable = True + else: + layer.trainable = False + +""" +Print the model's summary and see if the number of non-trainable parameters and +total parameters are correct. + +In a previous section, we had calculated the number of parameters associated with +the LoRA layers to be 6,144. The total trainable parameters in the model should +be `num_layers * (query, value) * 6,144 = 12 * 2 * 6,144 = 147,456`. The +number of non-trainable parameters should be the same as the total number of +parameters in the original GPT-2 model, which is `124,439,808`. +""" + +lora_model.summary() + +""" +### Fine-tune LoRA GPT-2 + +Now that we have hacked and verified the LoRA GPT-2 model, let's train it! 
+""" + +gpu_memory_callback = GPUMemoryCallback( + target_batches=[5, 10, 25, 50, 100, 150, 200, 300, 400, 500], + print_stats=True, +) + +optimizer, loss = get_optimizer_and_loss() + +lora_model.compile( + optimizer=optimizer, + loss=loss, + weighted_metrics=["accuracy"], +) + +lora_model.fit( + train_ds, + epochs=EPOCHS, + callbacks=[gpu_memory_callback], +) +lora_model_memory_usage = gpu_memory_callback.memory_usage + +""" +And we are done fine-tuning the model! Before we generate text, let's compare +the training time and memory usage of the two models. The training time of GPT-2 +on a 16 GB Tesla T4 (Colab) is 7 minutes, and for LoRA, it is 5 minutes, a 30% +decrease. The memory usage of LoRA GPT-2 is roughly 35% times less than GPT-2. +""" + +plt.bar( + ["GPT-2", "LoRA GPT-2"], + [max(gpt2_lm_memory_usage), max(lora_model_memory_usage)], + color=["red", "blue"], +) + +plt.xlabel("Time") +plt.ylabel("GPU Memory Usage (in GB)") + +plt.title("GPU Memory Usage Comparison") +plt.legend() +plt.show() + +""" +### Merge weights and generate text! + +One of the biggest advantages of LoRA over other adapter methods is that it +does not incur any additional inference latency. Let's understand why. + +Recall our LoRA equation: `output = W0x + b0 + BAx`. We can rewrite this as: +`output = = Wx + b0 = (W0 + BA)x + b0`, where `W = W0 + BA`. This means that if +we merge the weights of the original model and the adapter, we will be essentially +doing the same computation as the original model! +""" + +for layer_idx in range(lora_model.backbone.num_layers): + self_attention_layer = lora_model.backbone.get_layer( + f"transformer_layer_{layer_idx}" + )._self_attention_layer + + # Merge query dense layer. + query_lora_layer = self_attention_layer._query_dense + + A_weights = query_lora_layer.A.kernel # (768, 1) (a, b) + B_weights = query_lora_layer.B.kernel # (1, 12, 64) (b, c, d) + increment_weights = tf.einsum("ab,bcd->acd", A_weights, B_weights) * (ALPHA / RANK) + query_lora_layer.original_layer.kernel.assign_add(increment_weights) + + # Merge value dense layer. + value_lora_layer = self_attention_layer._value_dense + + A_weights = value_lora_layer.A.kernel # (768, 1) (a, b) + B_weights = value_lora_layer.B.kernel # (1, 12, 64) (b, c, d) + increment_weights = tf.einsum("ab,bcd->acd", A_weights, B_weights) * (ALPHA / RANK) + value_lora_layer.original_layer.kernel.assign_add(increment_weights) + + # Put back in place the original layers with updated weights + self_attention_layer._query_dense = query_lora_layer.original_layer + self_attention_layer._value_dense = value_lora_layer.original_layer + +""" +We are now all set to generate text with our LoRA model :). +""" + +# Freezing weights not necessary during generation since no weights are updated. +generate_text(lora_model, "I like basketball", max_length=MAX_GENERATION_LENGTH) +generate_text( + lora_model, "That Italian restaurant is", max_length=MAX_GENERATION_LENGTH +) + +""" +And we're all done! 
+""" diff --git a/knowledge_base/nlp/pretrained_word_embeddings.py b/knowledge_base/nlp/pretrained_word_embeddings.py new file mode 100644 index 0000000000000000000000000000000000000000..8a717cd8af9fb495dc18982bd1b6de21e7bd023a --- /dev/null +++ b/knowledge_base/nlp/pretrained_word_embeddings.py @@ -0,0 +1,305 @@ +""" +Title: Using pre-trained word embeddings +Author: [fchollet](https://twitter.com/fchollet) +Date created: 2020/05/05 +Last modified: 2020/05/05 +Description: Text classification on the Newsgroup20 dataset using pre-trained GloVe word embeddings. +Accelerator: GPU +""" + +""" +## Setup +""" + +import os + +# Only the TensorFlow backend supports string inputs. +os.environ["KERAS_BACKEND"] = "tensorflow" + +import pathlib +import numpy as np +import tensorflow.data as tf_data +import keras +from keras import layers + +""" +## Introduction + +In this example, we show how to train a text classification model that uses pre-trained +word embeddings. + +We'll work with the Newsgroup20 dataset, a set of 20,000 message board messages +belonging to 20 different topic categories. + +For the pre-trained word embeddings, we'll use +[GloVe embeddings](http://nlp.stanford.edu/projects/glove/). +""" + +""" +## Download the Newsgroup20 data +""" + +data_path = keras.utils.get_file( + "news20.tar.gz", + "http://www.cs.cmu.edu/afs/cs.cmu.edu/project/theo-20/www/data/news20.tar.gz", + untar=True, +) + +""" +## Let's take a look at the data +""" + +data_dir = pathlib.Path(data_path).parent / "20_newsgroup" +dirnames = os.listdir(data_dir) +print("Number of directories:", len(dirnames)) +print("Directory names:", dirnames) + +fnames = os.listdir(data_dir / "comp.graphics") +print("Number of files in comp.graphics:", len(fnames)) +print("Some example filenames:", fnames[:5]) + +""" +Here's a example of what one file contains: +""" + +print(open(data_dir / "comp.graphics" / "38987").read()) + +""" +As you can see, there are header lines that are leaking the file's category, either +explicitly (the first line is literally the category name), or implicitly, e.g. via the +`Organization` filed. Let's get rid of the headers: +""" + +samples = [] +labels = [] +class_names = [] +class_index = 0 +for dirname in sorted(os.listdir(data_dir)): + class_names.append(dirname) + dirpath = data_dir / dirname + fnames = os.listdir(dirpath) + print("Processing %s, %d files found" % (dirname, len(fnames))) + for fname in fnames: + fpath = dirpath / fname + f = open(fpath, encoding="latin-1") + content = f.read() + lines = content.split("\n") + lines = lines[10:] + content = "\n".join(lines) + samples.append(content) + labels.append(class_index) + class_index += 1 + +print("Classes:", class_names) +print("Number of samples:", len(samples)) + +""" +There's actually one category that doesn't have the expected number of files, but the +difference is small enough that the problem remains a balanced classification problem. 
+""" + +""" +## Shuffle and split the data into training & validation sets +""" + +# Shuffle the data +seed = 1337 +rng = np.random.RandomState(seed) +rng.shuffle(samples) +rng = np.random.RandomState(seed) +rng.shuffle(labels) + +# Extract a training & validation split +validation_split = 0.2 +num_validation_samples = int(validation_split * len(samples)) +train_samples = samples[:-num_validation_samples] +val_samples = samples[-num_validation_samples:] +train_labels = labels[:-num_validation_samples] +val_labels = labels[-num_validation_samples:] + +""" +## Create a vocabulary index + +Let's use the `TextVectorization` to index the vocabulary found in the dataset. +Later, we'll use the same layer instance to vectorize the samples. + +Our layer will only consider the top 20,000 words, and will truncate or pad sequences to +be actually 200 tokens long. +""" + +vectorizer = layers.TextVectorization(max_tokens=20000, output_sequence_length=200) +text_ds = tf_data.Dataset.from_tensor_slices(train_samples).batch(128) +vectorizer.adapt(text_ds) + +""" +You can retrieve the computed vocabulary used via `vectorizer.get_vocabulary()`. Let's +print the top 5 words: +""" + +vectorizer.get_vocabulary()[:5] + +""" +Let's vectorize a test sentence: +""" + +output = vectorizer([["the cat sat on the mat"]]) +output.numpy()[0, :6] + +""" +As you can see, "the" gets represented as "2". Why not 0, given that "the" was the first +word in the vocabulary? That's because index 0 is reserved for padding and index 1 is +reserved for "out of vocabulary" tokens. + +Here's a dict mapping words to their indices: +""" + +voc = vectorizer.get_vocabulary() +word_index = dict(zip(voc, range(len(voc)))) + +""" +As you can see, we obtain the same encoding as above for our test sentence: +""" + +test = ["the", "cat", "sat", "on", "the", "mat"] +[word_index[w] for w in test] + +""" +## Load pre-trained word embeddings +""" + +""" +Let's download pre-trained GloVe embeddings (a 822M zip file). + +You'll need to run the following commands: +""" + +"""shell +wget https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip +unzip -q glove.6B.zip +""" + +""" +The archive contains text-encoded vectors of various sizes: 50-dimensional, +100-dimensional, 200-dimensional, 300-dimensional. We'll use the 100D ones. + +Let's make a dict mapping words (strings) to their NumPy vector representation: +""" + +path_to_glove_file = "glove.6B.100d.txt" + +embeddings_index = {} +with open(path_to_glove_file) as f: + for line in f: + word, coefs = line.split(maxsplit=1) + coefs = np.fromstring(coefs, "f", sep=" ") + embeddings_index[word] = coefs + +print("Found %s word vectors." % len(embeddings_index)) + +""" +Now, let's prepare a corresponding embedding matrix that we can use in a Keras +`Embedding` layer. It's a simple NumPy matrix where entry at index `i` is the pre-trained +vector for the word of index `i` in our `vectorizer`'s vocabulary. +""" + +num_tokens = len(voc) + 2 +embedding_dim = 100 +hits = 0 +misses = 0 + +# Prepare embedding matrix +embedding_matrix = np.zeros((num_tokens, embedding_dim)) +for word, i in word_index.items(): + embedding_vector = embeddings_index.get(word) + if embedding_vector is not None: + # Words not found in embedding index will be all-zeros. 
+ # This includes the representation for "padding" and "OOV" + embedding_matrix[i] = embedding_vector + hits += 1 + else: + misses += 1 +print("Converted %d words (%d misses)" % (hits, misses)) + + +""" +Next, we load the pre-trained word embeddings matrix into an `Embedding` layer. + +Note that we set `trainable=False` so as to keep the embeddings fixed (we don't want to +update them during training). +""" + +from keras.layers import Embedding + +embedding_layer = Embedding( + num_tokens, + embedding_dim, + trainable=False, +) +embedding_layer.build((1,)) +embedding_layer.set_weights([embedding_matrix]) + +""" +## Build the model + +A simple 1D convnet with global max pooling and a classifier at the end. +""" + +int_sequences_input = keras.Input(shape=(None,), dtype="int32") +embedded_sequences = embedding_layer(int_sequences_input) +x = layers.Conv1D(128, 5, activation="relu")(embedded_sequences) +x = layers.MaxPooling1D(5)(x) +x = layers.Conv1D(128, 5, activation="relu")(x) +x = layers.MaxPooling1D(5)(x) +x = layers.Conv1D(128, 5, activation="relu")(x) +x = layers.GlobalMaxPooling1D()(x) +x = layers.Dense(128, activation="relu")(x) +x = layers.Dropout(0.5)(x) +preds = layers.Dense(len(class_names), activation="softmax")(x) +model = keras.Model(int_sequences_input, preds) +model.summary() + +""" +## Train the model + +First, convert our list-of-strings data to NumPy arrays of integer indices. The arrays +are right-padded. +""" + +x_train = vectorizer(np.array([[s] for s in train_samples])).numpy() +x_val = vectorizer(np.array([[s] for s in val_samples])).numpy() + +y_train = np.array(train_labels) +y_val = np.array(val_labels) + +""" +We use categorical crossentropy as our loss since we're doing softmax classification. +Moreover, we use `sparse_categorical_crossentropy` since our labels are integers. +""" + +model.compile( + loss="sparse_categorical_crossentropy", optimizer="rmsprop", metrics=["acc"] +) +model.fit(x_train, y_train, batch_size=128, epochs=20, validation_data=(x_val, y_val)) + +""" +## Export an end-to-end model + +Now, we may want to export a `Model` object that takes as input a string of arbitrary +length, rather than a sequence of indices. It would make the model much more portable, +since you wouldn't have to worry about the input preprocessing pipeline. + +Our `vectorizer` is actually a Keras layer, so it's simple: +""" + +string_input = keras.Input(shape=(1,), dtype="string") +x = vectorizer(string_input) +preds = model(x) +end_to_end_model = keras.Model(string_input, preds) + +probabilities = end_to_end_model( + keras.ops.convert_to_tensor( + [["this message is about computer graphics and 3D modeling"]] + ) +) + +print(class_names[np.argmax(probabilities[0])]) diff --git a/knowledge_base/nlp/semantic_similarity_with_bert.py b/knowledge_base/nlp/semantic_similarity_with_bert.py new file mode 100644 index 0000000000000000000000000000000000000000..51ef00e66d22d9be941831cd53ccaeb3a7e81503 --- /dev/null +++ b/knowledge_base/nlp/semantic_similarity_with_bert.py @@ -0,0 +1,397 @@ +""" +Title: Semantic Similarity with BERT +Author: [Mohamad Merchant](https://twitter.com/mohmadmerchant1) +Date created: 2020/08/15 +Last modified: 2020/08/29 +Description: Natural Language Inference by fine-tuning BERT model on SNLI Corpus. +Accelerator: GPU +""" + +""" +## Introduction + +Semantic Similarity is the task of determining how similar +two sentences are, in terms of what they mean. 
+This example demonstrates the use of SNLI (Stanford Natural Language Inference) Corpus +to predict sentence semantic similarity with Transformers. +We will fine-tune a BERT model that takes two sentences as inputs +and that outputs a similarity score for these two sentences. + +### References + +* [BERT](https://arxiv.org/pdf/1810.04805.pdf) +* [SNLI](https://nlp.stanford.edu/projects/snli/) +""" + +""" +## Setup + +Note: install HuggingFace `transformers` via `pip install transformers` (version >= 2.11.0). +""" +import numpy as np +import pandas as pd +import tensorflow as tf +import transformers + +""" +## Configuration +""" + +max_length = 128 # Maximum length of input sentence to the model. +batch_size = 32 +epochs = 2 + +# Labels in our dataset. +labels = ["contradiction", "entailment", "neutral"] + +""" +## Load the Data +""" + +"""shell +curl -LO https://raw.githubusercontent.com/MohamadMerchant/SNLI/master/data.tar.gz +tar -xvzf data.tar.gz +""" +# There are more than 550k samples in total; we will use 100k for this example. +train_df = pd.read_csv("SNLI_Corpus/snli_1.0_train.csv", nrows=100000) +valid_df = pd.read_csv("SNLI_Corpus/snli_1.0_dev.csv") +test_df = pd.read_csv("SNLI_Corpus/snli_1.0_test.csv") + +# Shape of the data +print(f"Total train samples : {train_df.shape[0]}") +print(f"Total validation samples: {valid_df.shape[0]}") +print(f"Total test samples: {valid_df.shape[0]}") + +""" +Dataset Overview: + +- sentence1: The premise caption that was supplied to the author of the pair. +- sentence2: The hypothesis caption that was written by the author of the pair. +- similarity: This is the label chosen by the majority of annotators. +Where no majority exists, the label "-" is used (we will skip such samples here). + +Here are the "similarity" label values in our dataset: + +- Contradiction: The sentences share no similarity. +- Entailment: The sentences have similar meaning. +- Neutral: The sentences are neutral. +""" + +""" +Let's look at one sample from the dataset: +""" +print(f"Sentence1: {train_df.loc[1, 'sentence1']}") +print(f"Sentence2: {train_df.loc[1, 'sentence2']}") +print(f"Similarity: {train_df.loc[1, 'similarity']}") + +""" +## Preprocessing +""" + +# We have some NaN entries in our train data, we will simply drop them. +print("Number of missing values") +print(train_df.isnull().sum()) +train_df.dropna(axis=0, inplace=True) + +""" +Distribution of our training targets. +""" +print("Train Target Distribution") +print(train_df.similarity.value_counts()) + +""" +Distribution of our validation targets. +""" +print("Validation Target Distribution") +print(valid_df.similarity.value_counts()) + +""" +The value "-" appears as part of our training and validation targets. +We will skip these samples. +""" +train_df = ( + train_df[train_df.similarity != "-"] + .sample(frac=1.0, random_state=42) + .reset_index(drop=True) +) +valid_df = ( + valid_df[valid_df.similarity != "-"] + .sample(frac=1.0, random_state=42) + .reset_index(drop=True) +) + +""" +One-hot encode training, validation, and test labels. 
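+
+As a quick standalone illustration (not part of the pipeline itself), here is what
+`to_categorical` produces for the three integer class ids used below:
+
+```python
+import tensorflow as tf
+
+print(tf.keras.utils.to_categorical([0, 1, 2], num_classes=3))
+# [[1. 0. 0.]
+#  [0. 1. 0.]
+#  [0. 0. 1.]]
+```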
+""" +train_df["label"] = train_df["similarity"].apply( + lambda x: 0 if x == "contradiction" else 1 if x == "entailment" else 2 +) +y_train = tf.keras.utils.to_categorical(train_df.label, num_classes=3) + +valid_df["label"] = valid_df["similarity"].apply( + lambda x: 0 if x == "contradiction" else 1 if x == "entailment" else 2 +) +y_val = tf.keras.utils.to_categorical(valid_df.label, num_classes=3) + +test_df["label"] = test_df["similarity"].apply( + lambda x: 0 if x == "contradiction" else 1 if x == "entailment" else 2 +) +y_test = tf.keras.utils.to_categorical(test_df.label, num_classes=3) + +""" +## Create a custom data generator +""" + + +class BertSemanticDataGenerator(tf.keras.utils.Sequence): + """Generates batches of data. + + Args: + sentence_pairs: Array of premise and hypothesis input sentences. + labels: Array of labels. + batch_size: Integer batch size. + shuffle: boolean, whether to shuffle the data. + include_targets: boolean, whether to include the labels. + + Returns: + Tuples `([input_ids, attention_mask, `token_type_ids], labels)` + (or just `[input_ids, attention_mask, `token_type_ids]` + if `include_targets=False`) + """ + + def __init__( + self, + sentence_pairs, + labels, + batch_size=batch_size, + shuffle=True, + include_targets=True, + ): + self.sentence_pairs = sentence_pairs + self.labels = labels + self.shuffle = shuffle + self.batch_size = batch_size + self.include_targets = include_targets + # Load our BERT Tokenizer to encode the text. + # We will use base-base-uncased pretrained model. + self.tokenizer = transformers.BertTokenizer.from_pretrained( + "bert-base-uncased", do_lower_case=True + ) + self.indexes = np.arange(len(self.sentence_pairs)) + self.on_epoch_end() + + def __len__(self): + # Denotes the number of batches per epoch. + return len(self.sentence_pairs) // self.batch_size + + def __getitem__(self, idx): + # Retrieves the batch of index. + indexes = self.indexes[idx * self.batch_size : (idx + 1) * self.batch_size] + sentence_pairs = self.sentence_pairs[indexes] + + # With BERT tokenizer's batch_encode_plus batch of both the sentences are + # encoded together and separated by [SEP] token. + encoded = self.tokenizer.batch_encode_plus( + sentence_pairs.tolist(), + add_special_tokens=True, + max_length=max_length, + return_attention_mask=True, + return_token_type_ids=True, + pad_to_max_length=True, + return_tensors="tf", + ) + + # Convert batch of encoded features to numpy array. + input_ids = np.array(encoded["input_ids"], dtype="int32") + attention_masks = np.array(encoded["attention_mask"], dtype="int32") + token_type_ids = np.array(encoded["token_type_ids"], dtype="int32") + + # Set to true if data generator is used for training/validation. + if self.include_targets: + labels = np.array(self.labels[indexes], dtype="int32") + return [input_ids, attention_masks, token_type_ids], labels + else: + return [input_ids, attention_masks, token_type_ids] + + def on_epoch_end(self): + # Shuffle indexes after each epoch if shuffle is set to True. + if self.shuffle: + np.random.RandomState(42).shuffle(self.indexes) + + +""" +## Build the model +""" +# Create the model under a distribution strategy scope. +strategy = tf.distribute.MirroredStrategy() + +with strategy.scope(): + # Encoded token ids from BERT tokenizer. + input_ids = tf.keras.layers.Input( + shape=(max_length,), dtype=tf.int32, name="input_ids" + ) + # Attention masks indicates to the model which tokens should be attended to. 
+ attention_masks = tf.keras.layers.Input( + shape=(max_length,), dtype=tf.int32, name="attention_masks" + ) + # Token type ids are binary masks identifying different sequences in the model. + token_type_ids = tf.keras.layers.Input( + shape=(max_length,), dtype=tf.int32, name="token_type_ids" + ) + # Loading pretrained BERT model. + bert_model = transformers.TFBertModel.from_pretrained("bert-base-uncased") + # Freeze the BERT model to reuse the pretrained features without modifying them. + bert_model.trainable = False + + bert_output = bert_model.bert( + input_ids, attention_mask=attention_masks, token_type_ids=token_type_ids + ) + sequence_output = bert_output.last_hidden_state + pooled_output = bert_output.pooler_output + + # Add trainable layers on top of frozen layers to adapt the pretrained features on the new data. + bi_lstm = tf.keras.layers.Bidirectional( + tf.keras.layers.LSTM(64, return_sequences=True) + )(sequence_output) + # Applying hybrid pooling approach to bi_lstm sequence output. + avg_pool = tf.keras.layers.GlobalAveragePooling1D()(bi_lstm) + max_pool = tf.keras.layers.GlobalMaxPooling1D()(bi_lstm) + concat = tf.keras.layers.concatenate([avg_pool, max_pool]) + dropout = tf.keras.layers.Dropout(0.3)(concat) + output = tf.keras.layers.Dense(3, activation="softmax")(dropout) + model = tf.keras.models.Model( + inputs=[input_ids, attention_masks, token_type_ids], outputs=output + ) + + model.compile( + optimizer=tf.keras.optimizers.Adam(), + loss="categorical_crossentropy", + metrics=["acc"], + ) + + +print(f"Strategy: {strategy}") +model.summary() + +""" +Create train and validation data generators +""" +train_data = BertSemanticDataGenerator( + train_df[["sentence1", "sentence2"]].values.astype("str"), + y_train, + batch_size=batch_size, + shuffle=True, +) +valid_data = BertSemanticDataGenerator( + valid_df[["sentence1", "sentence2"]].values.astype("str"), + y_val, + batch_size=batch_size, + shuffle=False, +) + +""" +## Train the Model + +Training is done only for the top layers to perform "feature extraction", +which will allow the model to use the representations of the pretrained model. +""" +history = model.fit( + train_data, + validation_data=valid_data, + epochs=epochs, + use_multiprocessing=True, + workers=-1, +) + +""" +## Fine-tuning + +This step must only be performed after the feature extraction model has +been trained to convergence on the new data. + +This is an optional last step where `bert_model` is unfreezed and retrained +with a very low learning rate. This can deliver meaningful improvement by +incrementally adapting the pretrained features to the new data. +""" + +# Unfreeze the bert_model. +bert_model.trainable = True +# Recompile the model to make the change effective. 
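+# (In Keras, changes to `trainable` only take effect the next time `compile()` is called.)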
+model.compile( + optimizer=tf.keras.optimizers.Adam(1e-5), + loss="categorical_crossentropy", + metrics=["accuracy"], +) +model.summary() + +""" +## Train the entire model end-to-end +""" +history = model.fit( + train_data, + validation_data=valid_data, + epochs=epochs, + use_multiprocessing=True, + workers=-1, +) + +""" +## Evaluate model on the test set +""" +test_data = BertSemanticDataGenerator( + test_df[["sentence1", "sentence2"]].values.astype("str"), + y_test, + batch_size=batch_size, + shuffle=False, +) +model.evaluate(test_data, verbose=1) + +""" +## Inference on custom sentences +""" + + +def check_similarity(sentence1, sentence2): + sentence_pairs = np.array([[str(sentence1), str(sentence2)]]) + test_data = BertSemanticDataGenerator( + sentence_pairs, + labels=None, + batch_size=1, + shuffle=False, + include_targets=False, + ) + + proba = model.predict(test_data[0])[0] + idx = np.argmax(proba) + proba = f"{proba[idx]: .2f}%" + pred = labels[idx] + return pred, proba + + +""" +Check results on some example sentence pairs. +""" +sentence1 = "Two women are observing something together." +sentence2 = "Two women are standing with their eyes closed." +check_similarity(sentence1, sentence2) +""" +Check results on some example sentence pairs. +""" +sentence1 = "A smiling costumed woman is holding an umbrella" +sentence2 = "A happy woman in a fairy costume holds an umbrella" +check_similarity(sentence1, sentence2) + +""" +Check results on some example sentence pairs +""" +sentence1 = "A soccer game with multiple males playing" +sentence2 = "Some men are playing a sport" +check_similarity(sentence1, sentence2) + +""" +Example available on HuggingFace + +| Trained Model | Demo | +| :--: | :--: | +| [![Generic badge](https://img.shields.io/badge/%F0%9F%A4%97%20Model-semantic%20similarity%20with%20bert-black.svg)](https://huggingface.co/keras-io/bert-semantic-similarity) | [![Generic badge](https://img.shields.io/badge/%F0%9F%A4%97%20Spaces-semantic%20similarity%20with%20bert-black.svg)](https://huggingface.co/spaces/keras-io/bert-semantic-similarity) | +""" diff --git a/knowledge_base/nlp/semantic_similarity_with_keras_hub.py b/knowledge_base/nlp/semantic_similarity_with_keras_hub.py new file mode 100644 index 0000000000000000000000000000000000000000..409177406eb8b0f24adefcaf45b53be3b9531f9d --- /dev/null +++ b/knowledge_base/nlp/semantic_similarity_with_keras_hub.py @@ -0,0 +1,323 @@ +""" +Title: Semantic Similarity with KerasHub +Author: [Anshuman Mishra](https://github.com/shivance/) +Date created: 2023/02/25 +Last modified: 2023/02/25 +Description: Use pretrained models from KerasHub for the Semantic Similarity Task. +Accelerator: GPU +""" + +""" +## Introduction + +Semantic similarity refers to the task of determining the degree of similarity between two +sentences in terms of their meaning. We already saw in [this](https://keras.io/examples/nlp/semantic_similarity_with_bert/) +example how to use SNLI (Stanford Natural Language Inference) corpus to predict sentence +semantic similarity with the HuggingFace Transformers library. In this tutorial we will +learn how to use [KerasHub](https://keras.io/keras_hub/), an extension of the core Keras API, +for the same task. Furthermore, we will discover how KerasHub effectively reduces boilerplate +code and simplifies the process of building and utilizing models. For more information on KerasHub, +please refer to [KerasHub's official documentation](https://keras.io/keras_hub/). 
+ +This guide is broken down into the following parts: + +1. *Setup*, task definition, and establishing a baseline. +2. *Establishing baseline* with BERT. +3. *Saving and Reloading* the model. +4. *Performing inference* with the model. +5 *Improving accuracy* with RoBERTa + +## Setup + +The following guide uses [Keras Core](https://keras.io/keras_core/) to work in +any of `tensorflow`, `jax` or `torch`. Support for Keras Core is baked into +KerasHub, simply change the `KERAS_BACKEND` environment variable below to change +the backend you would like to use. We select the `jax` backend below, which will +give us a particularly fast train step below. +""" + +"""shell +pip install -q --upgrade keras-hub +pip install -q --upgrade keras # Upgrade to Keras 3. +""" + +import numpy as np +import tensorflow as tf +import keras +import keras_hub +import tensorflow_datasets as tfds + +""" +To load the SNLI dataset, we use the tensorflow-datasets library, which +contains over 550,000 samples in total. However, to ensure that this example runs +quickly, we use only 20% of the training samples. + +## Overview of SNLI Dataset + +Every sample in the dataset contains three components: `hypothesis`, `premise`, +and `label`. epresents the original caption provided to the author of the pair, +while the hypothesis refers to the hypothesis caption created by the author of +the pair. The label is assigned by annotators to indicate the similarity between +the two sentences. + +The dataset contains three possible similarity label values: Contradiction, Entailment, +and Neutral. Contradiction represents completely dissimilar sentences, while Entailment +denotes similar meaning sentences. Lastly, Neutral refers to sentences where no clear +similarity or dissimilarity can be established between them. +""" + +snli_train = tfds.load("snli", split="train[:20%]") +snli_val = tfds.load("snli", split="validation") +snli_test = tfds.load("snli", split="test") + +# Here's an example of how our training samples look like, where we randomly select +# four samples: +sample = snli_test.batch(4).take(1).get_single_element() +sample + +""" +### Preprocessing + +In our dataset, we have identified that some samples have missing or incorrectly labeled +data, which is denoted by a value of -1. To ensure the accuracy and reliability of our model, +we simply filter out these samples from our dataset. +""" + + +def filter_labels(sample): + return sample["label"] >= 0 + + +""" +Here's a utility function that splits the example into an `(x, y)` tuple that is suitable +for `model.fit()`. By default, `keras_hub.models.BertClassifier` will tokenize and pack +together raw strings using a `"[SEP]"` token during training. Therefore, this label +splitting is all the data preparation that we need to perform. +""" + + +def split_labels(sample): + x = (sample["hypothesis"], sample["premise"]) + y = sample["label"] + return x, y + + +train_ds = ( + snli_train.filter(filter_labels) + .map(split_labels, num_parallel_calls=tf.data.AUTOTUNE) + .batch(16) +) +val_ds = ( + snli_val.filter(filter_labels) + .map(split_labels, num_parallel_calls=tf.data.AUTOTUNE) + .batch(16) +) +test_ds = ( + snli_test.filter(filter_labels) + .map(split_labels, num_parallel_calls=tf.data.AUTOTUNE) + .batch(16) +) + + +""" +## Establishing baseline with BERT. + +We use the BERT model from KerasHub to establish a baseline for our semantic similarity +task. 
The `keras_hub.models.BertClassifier` class attaches a classification head to the BERT +Backbone, mapping the backbone outputs to a logit output suitable for a classification task. +This significantly reduces the need for custom code. + +KerasHub models have built-in tokenization capabilities that handle tokenization by default +based on the selected model. However, users can also use custom preprocessing techniques +as per their specific needs. If we pass a tuple as input, the model will tokenize all the +strings and concatenate them with a `"[SEP]"` separator. + +We use this model with pretrained weights, and we can use the `from_preset()` method +to use our own preprocessor. For the SNLI dataset, we set `num_classes` to 3. +""" + +bert_classifier = keras_hub.models.BertClassifier.from_preset( + "bert_tiny_en_uncased", num_classes=3 +) + +""" +Please note that the BERT Tiny model has only 4,386,307 trainable parameters. + +KerasHub task models come with compilation defaults. We can now train the model we just +instantiated by calling the `fit()` method. +""" + +bert_classifier.fit(train_ds, validation_data=val_ds, epochs=1) + +""" +Our BERT classifier achieved an accuracy of around 76% on the validation split. Now, +let's evaluate its performance on the test split. + +### Evaluate the performance of the trained model on test data. +""" + +bert_classifier.evaluate(test_ds) + +""" +Our baseline BERT model achieved a similar accuracy of around 76% on the test split. +Now, let's try to improve its performance by recompiling the model with a slightly +higher learning rate. +""" + +bert_classifier = keras_hub.models.BertClassifier.from_preset( + "bert_tiny_en_uncased", num_classes=3 +) +bert_classifier.compile( + loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), + optimizer=keras.optimizers.Adam(5e-5), + metrics=["accuracy"], +) + +bert_classifier.fit(train_ds, validation_data=val_ds, epochs=1) +bert_classifier.evaluate(test_ds) + +""" +Just tweaking the learning rate alone was not enough to boost performance, which +stayed right around 76%. Let's try again, but this time with +`keras.optimizers.AdamW`, and a learning rate schedule. +""" + + +class TriangularSchedule(keras.optimizers.schedules.LearningRateSchedule): + """Linear ramp up for `warmup` steps, then linear decay to zero at `total` steps.""" + + def __init__(self, rate, warmup, total): + self.rate = rate + self.warmup = warmup + self.total = total + + def get_config(self): + config = {"rate": self.rate, "warmup": self.warmup, "total": self.total} + return config + + def __call__(self, step): + step = keras.ops.cast(step, dtype="float32") + rate = keras.ops.cast(self.rate, dtype="float32") + warmup = keras.ops.cast(self.warmup, dtype="float32") + total = keras.ops.cast(self.total, dtype="float32") + + warmup_rate = rate * step / self.warmup + cooldown_rate = rate * (total - step) / (total - warmup) + triangular_rate = keras.ops.minimum(warmup_rate, cooldown_rate) + return keras.ops.maximum(triangular_rate, 0.0) + + +bert_classifier = keras_hub.models.BertClassifier.from_preset( + "bert_tiny_en_uncased", num_classes=3 +) + +# Get the total count of training batches. +# This requires walking the dataset to filter all -1 labels. 
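+# (`train_ds.cardinality()` is unknown here because of the `filter()` call above,
+# so we count the batches by iterating over the dataset once.)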
+epochs = 3 +total_steps = sum(1 for _ in train_ds.as_numpy_iterator()) * epochs +warmup_steps = int(total_steps * 0.2) + +bert_classifier.compile( + loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), + optimizer=keras.optimizers.AdamW( + TriangularSchedule(1e-4, warmup_steps, total_steps) + ), + metrics=["accuracy"], +) + +bert_classifier.fit(train_ds, validation_data=val_ds, epochs=epochs) + +""" +Success! With the learning rate scheduler and the `AdamW` optimizer, our validation +accuracy improved to around 79%. + +Now, let's evaluate our final model on the test set and see how it performs. +""" + +bert_classifier.evaluate(test_ds) + +""" +Our Tiny BERT model achieved an accuracy of approximately 79% on the test set +with the use of a learning rate scheduler. This is a significant improvement over +our previous results. Fine-tuning a pretrained BERT +model can be a powerful tool in natural language processing tasks, and even a +small model like Tiny BERT can achieve impressive results. + +Let's save our model for now +and move on to learning how to perform inference with it. + +## Save and Reload the model +""" +bert_classifier.save("bert_classifier.keras") +restored_model = keras.models.load_model("bert_classifier.keras") +restored_model.evaluate(test_ds) + +""" +## Performing inference with the model. + +Let's see how to perform inference with KerasHub models +""" + +# Convert to Hypothesis-Premise pair, for forward pass through model +sample = (sample["hypothesis"], sample["premise"]) +sample + +""" +The default preprocessor in KerasHub models handles input tokenization automatically, +so we don't need to perform tokenization explicitly. +""" +predictions = bert_classifier.predict(sample) + + +def softmax(x): + return np.exp(x) / np.exp(x).sum(axis=0) + + +# Get the class predictions with maximum probabilities +predictions = softmax(predictions) + +""" +## Improving accuracy with RoBERTa + +Now that we have established a baseline, we can attempt to improve our results +by experimenting with different models. Thanks to KerasHub, fine-tuning a RoBERTa +checkpoint on the same dataset is easy with just a few lines of code. +""" + +# Inittializing a RoBERTa from preset +roberta_classifier = keras_hub.models.RobertaClassifier.from_preset( + "roberta_base_en", num_classes=3 +) + +roberta_classifier.fit(train_ds, validation_data=val_ds, epochs=1) + +roberta_classifier.evaluate(test_ds) + +""" +The RoBERTa base model has significantly more trainable parameters than the BERT +Tiny model, with almost 30 times as many at 124,645,635 parameters. As a result, it took +approximately 1.5 hours to train on a P100 GPU. However, the performance +improvement was substantial, with accuracy increasing to 88% on both the validation +and test splits. With RoBERTa, we were able to fit a maximum batch size of 16 on +our P100 GPU. + +Despite using a different model, the steps to perform inference with RoBERTa are +the same as with BERT! +""" + +predictions = roberta_classifier.predict(sample) +print(tf.math.argmax(predictions, axis=1).numpy()) + +""" +We hope this tutorial has been helpful in demonstrating the ease and effectiveness +of using KerasHub and BERT for semantic similarity tasks. + +Throughout this tutorial, we demonstrated how to use a pretrained BERT model to +establish a baseline and improve performance by training a larger RoBERTa model +using just a few lines of code. 
+ +The KerasHub toolbox provides a range of modular building blocks for preprocessing +text, including pretrained state-of-the-art models and low-level Transformer Encoder +layers. We believe that this makes experimenting with natural language solutions +more accessible and efficient. +""" diff --git a/knowledge_base/nlp/sentence_embeddings_with_sbert.py b/knowledge_base/nlp/sentence_embeddings_with_sbert.py new file mode 100644 index 0000000000000000000000000000000000000000..4867eb28bcb84638ddec4bda18bfb58d9102a537 --- /dev/null +++ b/knowledge_base/nlp/sentence_embeddings_with_sbert.py @@ -0,0 +1,466 @@ +""" +Title: Sentence embeddings using Siamese RoBERTa-networks +Author: [Mohammed Abu El-Nasr](https://github.com/abuelnasr0) +Date created: 2023/07/14 +Last modified: 2023/07/14 +Description: Fine-tune a RoBERTa model to generate sentence embeddings using KerasHub. +Accelerator: GPU +""" + +""" +## Introduction + +BERT and RoBERTa can be used for semantic textual similarity tasks, where two sentences +are passed to the model and the network predicts whether they are similar or not. But +what if we have a large collection of sentences and want to find the most similar pairs +in that collection? That will take n*(n-1)/2 inference computations, where n is the +number of sentences in the collection. For example, if n = 10000, the required time will +be 65 hours on a V100 GPU. + +A common method to overcome the time overhead issue is to pass one sentence to the model, +then average the output of the model, or take the first token (the [CLS] token) and use +them as a [sentence embedding](https://en.wikipedia.org/wiki/Sentence_embedding), then +use a vector similarity measure like cosine similarity or Manhatten / Euclidean distance +to find close sentences (semantically similar sentences). That will reduce the time to +find the most similar pairs in a collection of 10,000 sentences from 65 hours to 5 +seconds! + +If we use RoBERTa directly, that will yield rather bad sentence embeddings. But if we +fine-tune RoBERTa using a Siamese network, that will generate semantically meaningful +sentence embeddings. This will enable RoBERTa to be used for new tasks. These tasks +include: + +- Large-scale semantic similarity comparison. +- Clustering. +- Information retrieval via semantic search. + +In this example, we will show how to fine-tune a RoBERTa model using a Siamese network +such that it will be able to produce semantically meaningful sentence embeddings and use +them in a semantic search and clustering example. +This method of fine-tuning was introduced in +[Sentence-BERT](https://arxiv.org/abs/1908.10084) +""" + +""" +## Setup + +Let's install and import the libraries we need. We'll be using the KerasHub library in +this example. + +We will also enable [mixed precision](https://www.tensorflow.org/guide/mixed_precision) +training. This will help us reduce the training time. +""" + +"""shell +pip install -q --upgrade keras-hub +pip install -q --upgrade keras # Upgrade to Keras 3. +""" + +import os + +os.environ["KERAS_BACKEND"] = "tensorflow" + +import keras +import keras_hub +import tensorflow as tf +import tensorflow_datasets as tfds +import sklearn.cluster as cluster + +keras.mixed_precision.set_global_policy("mixed_float16") + +""" +## Fine-tune the model using siamese networks + +[Siamese network](https://en.wikipedia.org/wiki/Siamese_neural_network) is a neural +network architecture that contains two or more subnetworks. The subnetworks share the +same weights. 
It is used to generate feature vectors for each input and then compare them +for similarity. + +For our example, the subnetwork will be a RoBERTa model that has a pooling layer on top +of it to produce the embeddings of the input sentences. These embeddings will then be +compared to each other to learn to produce semantically meaningful embeddings. + +The pooling strategies used are mean, max, and CLS pooling. Mean pooling produces the +best results. We will use it in our examples. +""" + +""" +### Fine-tune using the regression objective function + +For building the siamese network with the regression objective function, the siamese +network is asked to predict the cosine similarity between the embeddings of the two input +sentences. + +Cosine similarity indicates the angle between the sentence embeddings. If the cosine +similarity is high, that means there is a small angle between the embeddings; hence, they +are semantically similar. +""" + +""" +#### Load the dataset + +We will use the STSB dataset to fine-tune the model for the regression objective. STSB +consists of a collection of sentence pairs that are labelled in the range [0, 5]. 0 +indicates the least semantic similarity between the two sentences, and 5 indicates the +most semantic similarity between the two sentences. + +The range of the cosine similarity is [-1, 1] and it's the output of the siamese network, +but the range of the labels in the dataset is [0, 5]. We need to unify the range between +the cosine similarity and the dataset labels, so while preparing the dataset, we will +divide the labels by 2.5 and subtract 1. +""" + +TRAIN_BATCH_SIZE = 6 +VALIDATION_BATCH_SIZE = 8 + +TRAIN_NUM_BATCHES = 300 +VALIDATION_NUM_BATCHES = 40 + +AUTOTUNE = tf.data.experimental.AUTOTUNE + + +def change_range(x): + return (x / 2.5) - 1 + + +def prepare_dataset(dataset, num_batches, batch_size): + dataset = dataset.map( + lambda z: ( + [z["sentence1"], z["sentence2"]], + [tf.cast(change_range(z["label"]), tf.float32)], + ), + num_parallel_calls=AUTOTUNE, + ) + dataset = dataset.batch(batch_size) + dataset = dataset.take(num_batches) + dataset = dataset.prefetch(AUTOTUNE) + return dataset + + +stsb_ds = tfds.load( + "glue/stsb", +) +stsb_train, stsb_valid = stsb_ds["train"], stsb_ds["validation"] + +stsb_train = prepare_dataset(stsb_train, TRAIN_NUM_BATCHES, TRAIN_BATCH_SIZE) +stsb_valid = prepare_dataset(stsb_valid, VALIDATION_NUM_BATCHES, VALIDATION_BATCH_SIZE) + +""" +Let's see examples from the dataset of two sentenses and their similarity. +""" + +for x, y in stsb_train: + for i, example in enumerate(x): + print(f"sentence 1 : {example[0]} ") + print(f"sentence 2 : {example[1]} ") + print(f"similarity : {y[i]} \n") + break + +""" +#### Build the encoder model. + +Now, we'll build the encoder model that will produce the sentence embeddings. It consists +of: + +- A preprocessor layer to tokenize and generate padding masks for the sentences. +- A backbone model that will generate the contextual representation of each token in the +sentence. +- A mean pooling layer to produce the embeddings. We will use `keras.layers.GlobalAveragePooling1D` +to apply the mean pooling to the backbone outputs. We will pass the padding mask to the +layer to exclude padded tokens from being averaged. +- A normalization layer to normalize the embeddings as we are using the cosine similarity. 
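+
+As a rough sketch of the masked mean-pooling step (toy tensors, not the actual layers
+used below), averaging only over non-padded positions looks like this:
+
+```python
+import keras
+
+token_embeddings = keras.random.normal((2, 4, 8))  # (batch, tokens, hidden)
+padding_mask = keras.ops.convert_to_tensor([[1, 1, 1, 0], [1, 1, 0, 0]])
+
+mask = keras.ops.expand_dims(keras.ops.cast(padding_mask, "float32"), -1)
+mean_pooled = keras.ops.sum(token_embeddings * mask, axis=1) / keras.ops.sum(mask, axis=1)
+print(mean_pooled.shape)  # (2, 8)
+```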
+""" + +preprocessor = keras_hub.models.RobertaPreprocessor.from_preset("roberta_base_en") +backbone = keras_hub.models.RobertaBackbone.from_preset("roberta_base_en") +inputs = keras.Input(shape=(1,), dtype="string", name="sentence") +x = preprocessor(inputs) +h = backbone(x) +embedding = keras.layers.GlobalAveragePooling1D(name="pooling_layer")( + h, x["padding_mask"] +) +n_embedding = keras.layers.UnitNormalization(axis=1)(embedding) +roberta_normal_encoder = keras.Model(inputs=inputs, outputs=n_embedding) + +roberta_normal_encoder.summary() + +""" +#### Build the Siamese network with the regression objective function. + +It's described above that the Siamese network has two or more subnetworks, and for this +Siamese model, we need two encoders. But we don't have two encoders; we have only one +encoder, but we will pass the two sentences through it. That way, we can have two paths +to get the embeddings and also shared weights between the two paths. + +After passing the two sentences to the model and getting the normalized embeddings, we +will multiply the two normalized embeddings to get the cosine similarity between the two +sentences. +""" + + +class RegressionSiamese(keras.Model): + def __init__(self, encoder, **kwargs): + inputs = keras.Input(shape=(2,), dtype="string", name="sentences") + sen1, sen2 = keras.ops.split(inputs, 2, axis=1) + u = encoder(sen1) + v = encoder(sen2) + cosine_similarity_scores = keras.ops.matmul(u, keras.ops.transpose(v)) + + super().__init__( + inputs=inputs, + outputs=cosine_similarity_scores, + **kwargs, + ) + + self.encoder = encoder + + def get_encoder(self): + return self.encoder + + +""" +#### Fit the model + +Let's try this example before training and compare it to the output after training. +""" + +sentences = [ + "Today is a very sunny day.", + "I am hungry, I will get my meal.", + "The dog is eating his food.", +] +query = ["The dog is enjoying his meal."] + +encoder = roberta_normal_encoder + +sentence_embeddings = encoder(tf.constant(sentences)) +query_embedding = encoder(tf.constant(query)) + +cosine_similarity_scores = tf.matmul(query_embedding, tf.transpose(sentence_embeddings)) +for i, sim in enumerate(cosine_similarity_scores[0]): + print(f"cosine similarity score between sentence {i+1} and the query = {sim} ") + +""" +For the training we will use `MeanSquaredError()` as loss function, and `Adam()` +optimizer with learning rate = 2e-5. +""" + +roberta_regression_siamese = RegressionSiamese(roberta_normal_encoder) + +roberta_regression_siamese.compile( + loss=keras.losses.MeanSquaredError(), + optimizer=keras.optimizers.Adam(2e-5), + jit_compile=False, +) + +roberta_regression_siamese.fit(stsb_train, validation_data=stsb_valid, epochs=1) + +""" +Let's try the model after training, we will notice a huge difference in the output. That +means that the model after fine-tuning is capable of producing semantically meaningful +embeddings. where the semantically similar sentences have a small angle between them. and +semantically dissimilar sentences have a large angle between them. 
+""" + +sentences = [ + "Today is a very sunny day.", + "I am hungry, I will get my meal.", + "The dog is eating his food.", +] +query = ["The dog is enjoying his food."] + +encoder = roberta_regression_siamese.get_encoder() + +sentence_embeddings = encoder(tf.constant(sentences)) +query_embedding = encoder(tf.constant(query)) + +cosine_simalarities = tf.matmul(query_embedding, tf.transpose(sentence_embeddings)) +for i, sim in enumerate(cosine_simalarities[0]): + print(f"cosine similarity between sentence {i+1} and the query = {sim} ") + +""" +### Fine-tune Using the triplet Objective Function + +For the Siamese network with the triplet objective function, three sentences are passed +to the Siamese network *anchor*, *positive*, and *negative* sentences. *anchor* and +*positive* sentences are semantically similar, and *anchor* and *negative* sentences are +semantically dissimilar. The objective is to minimize the distance between the *anchor* +sentence and the *positive* sentence, and to maximize the distance between the *anchor* +sentence and the *negative* sentence. +""" + +""" +#### Load the dataset + +We will use the Wikipedia-sections-triplets dataset for fine-tuning. This data set +consists of sentences derived from the Wikipedia website. It has a collection of 3 +sentences *anchor*, *positive*, *negative*. *anchor* and *positive* are derived from the +same section. *anchor* and *negative* are derived from different sections. + +This dataset has 1.8 million training triplets and 220,000 test triplets. In this +example, we will only use 1200 triplets for training and 300 for testing. +""" + +"""shell +wget https://sbert.net/datasets/wikipedia-sections-triplets.zip -q +unzip wikipedia-sections-triplets.zip -d wikipedia-sections-triplets +""" + +NUM_TRAIN_BATCHES = 200 +NUM_TEST_BATCHES = 75 +AUTOTUNE = tf.data.experimental.AUTOTUNE + + +def prepare_wiki_data(dataset, num_batches): + dataset = dataset.map( + lambda z: ((z["Sentence1"], z["Sentence2"], z["Sentence3"]), 0) + ) + dataset = dataset.batch(6) + dataset = dataset.take(num_batches) + dataset = dataset.prefetch(AUTOTUNE) + return dataset + + +wiki_train = tf.data.experimental.make_csv_dataset( + "wikipedia-sections-triplets/train.csv", + batch_size=1, + num_epochs=1, +) +wiki_test = tf.data.experimental.make_csv_dataset( + "wikipedia-sections-triplets/test.csv", + batch_size=1, + num_epochs=1, +) + +wiki_train = prepare_wiki_data(wiki_train, NUM_TRAIN_BATCHES) +wiki_test = prepare_wiki_data(wiki_test, NUM_TEST_BATCHES) + +""" +#### Build the encoder model + +For this encoder model, we will use RoBERTa with mean pooling and we will not normalize +the output embeddings. The encoder model consists of: + +- A preprocessor layer to tokenize and generate padding masks for the sentences. +- A backbone model that will generate the contextual representation of each token in the +sentence. +- A mean pooling layer to produce the embeddings. 
+""" + +preprocessor = keras_hub.models.RobertaPreprocessor.from_preset("roberta_base_en") +backbone = keras_hub.models.RobertaBackbone.from_preset("roberta_base_en") +input = keras.Input(shape=(1,), dtype="string", name="sentence") + +x = preprocessor(input) +h = backbone(x) +embedding = keras.layers.GlobalAveragePooling1D(name="pooling_layer")( + h, x["padding_mask"] +) + +roberta_encoder = keras.Model(inputs=input, outputs=embedding) + + +roberta_encoder.summary() + +""" +#### Build the Siamese network with the triplet objective function + +For the Siamese network with the triplet objective function, we will build the model with +an encoder, and we will pass the three sentences through that encoder. We will get an +embedding for each sentence, and we will calculate the `positive_dist` and +`negative_dist` that will be passed to the loss function described below. +""" + + +class TripletSiamese(keras.Model): + def __init__(self, encoder, **kwargs): + anchor = keras.Input(shape=(1,), dtype="string") + positive = keras.Input(shape=(1,), dtype="string") + negative = keras.Input(shape=(1,), dtype="string") + + ea = encoder(anchor) + ep = encoder(positive) + en = encoder(negative) + + positive_dist = keras.ops.sum(keras.ops.square(ea - ep), axis=1) + negative_dist = keras.ops.sum(keras.ops.square(ea - en), axis=1) + + positive_dist = keras.ops.sqrt(positive_dist) + negative_dist = keras.ops.sqrt(negative_dist) + + output = keras.ops.stack([positive_dist, negative_dist], axis=0) + + super().__init__(inputs=[anchor, positive, negative], outputs=output, **kwargs) + + self.encoder = encoder + + def get_encoder(self): + return self.encoder + + +""" +We will use a custom loss function for the triplet objective. The loss function will +receive the distance between the *anchor* and the *positive* embeddings `positive_dist`, +and the distance between the *anchor* and the *negative* embeddings `negative_dist`, +where they are stacked together in `y_pred`. + +We will use `positive_dist` and `negative_dist` to compute the loss such that +`negative_dist` is larger than `positive_dist` at least by a specific margin. +Mathematically, we will minimize this loss function: `max( positive_dist - negative_dist ++ margin, 0)`. + +There is no `y_true` used in this loss function. Note that we set the labels in the +dataset to zero, but they will not be used. +""" + + +class TripletLoss(keras.losses.Loss): + def __init__(self, margin=1, **kwargs): + super().__init__(**kwargs) + self.margin = margin + + def call(self, y_true, y_pred): + positive_dist, negative_dist = tf.unstack(y_pred, axis=0) + + losses = keras.ops.relu(positive_dist - negative_dist + self.margin) + return keras.ops.mean(losses, axis=0) + + +""" +#### Fit the model + +For the training, we will use the custom `TripletLoss()` loss function, and `Adam()` +optimizer with a learning rate = 2e-5. +""" + +roberta_triplet_siamese = TripletSiamese(roberta_encoder) + +roberta_triplet_siamese.compile( + loss=TripletLoss(), + optimizer=keras.optimizers.Adam(2e-5), + jit_compile=False, +) + +roberta_triplet_siamese.fit(wiki_train, validation_data=wiki_test, epochs=1) + +""" +Let's try this model in a clustering example. Here are 6 questions. first 3 questions +about learning English, and the last 3 questions about working online. Let's see if the +embeddings produced by our encoder will cluster them correctly. 
+""" + +questions = [ + "What should I do to improve my English writting?", + "How to be good at speaking English?", + "How can I improve my English?", + "How to earn money online?", + "How do I earn money online?", + "How to work and earn money through internet?", +] + +encoder = roberta_triplet_siamese.get_encoder() +embeddings = encoder(tf.constant(questions)) +kmeans = cluster.KMeans(n_clusters=2, random_state=0, n_init="auto").fit(embeddings) + +for i, label in enumerate(kmeans.labels_): + print(f"sentence ({questions[i]}) belongs to cluster {label}") diff --git a/knowledge_base/nlp/text_classification_from_scratch.py b/knowledge_base/nlp/text_classification_from_scratch.py new file mode 100644 index 0000000000000000000000000000000000000000..048e076b3a67bf1c7ef9f4d2702b57d48b201897 --- /dev/null +++ b/knowledge_base/nlp/text_classification_from_scratch.py @@ -0,0 +1,302 @@ +""" +Title: Text classification from scratch +Authors: Mark Omernick, Francois Chollet +Date created: 2019/11/06 +Last modified: 2020/05/17 +Description: Text sentiment classification starting from raw text files. +Accelerator: GPU +""" + +""" +## Introduction + +This example shows how to do text classification starting from raw text (as +a set of text files on disk). We demonstrate the workflow on the IMDB sentiment +classification dataset (unprocessed version). We use the `TextVectorization` layer for + word splitting & indexing. +""" + +""" +## Setup +""" + +import os + +os.environ["KERAS_BACKEND"] = "tensorflow" + +import keras +import tensorflow as tf +import numpy as np +from keras import layers + +""" +## Load the data: IMDB movie review sentiment classification + +Let's download the data and inspect its structure. +""" + +"""shell +curl -O https://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz +tar -xf aclImdb_v1.tar.gz +""" + +""" +The `aclImdb` folder contains a `train` and `test` subfolder: +""" + +"""shell +ls aclImdb +""" + +"""shell +ls aclImdb/test +""" + +"""shell +ls aclImdb/train +""" + +""" +The `aclImdb/train/pos` and `aclImdb/train/neg` folders contain text files, each of + which represents one review (either positive or negative): +""" + +"""shell +cat aclImdb/train/pos/6248_7.txt +""" + +""" +We are only interested in the `pos` and `neg` subfolders, so let's delete the other subfolder that has text files in it: +""" + +"""shell +rm -r aclImdb/train/unsup +""" + +""" +You can use the utility `keras.utils.text_dataset_from_directory` to +generate a labeled `tf.data.Dataset` object from a set of text files on disk filed + into class-specific folders. + +Let's use it to generate the training, validation, and test datasets. The validation +and training datasets are generated from two subsets of the `train` directory, with 20% +of samples going to the validation dataset and 80% going to the training dataset. + +Having a validation dataset in addition to the test dataset is useful for tuning +hyperparameters, such as the model architecture, for which the test dataset should not +be used. + +Before putting the model out into the real world however, it should be retrained using all +available training data (without creating a validation dataset), so its performance is maximized. + +When using the `validation_split` & `subset` arguments, make sure to either specify a +random seed, or to pass `shuffle=False`, so that the validation & training splits you +get have no overlap. 
+ +""" + +batch_size = 32 +raw_train_ds = keras.utils.text_dataset_from_directory( + "aclImdb/train", + batch_size=batch_size, + validation_split=0.2, + subset="training", + seed=1337, +) +raw_val_ds = keras.utils.text_dataset_from_directory( + "aclImdb/train", + batch_size=batch_size, + validation_split=0.2, + subset="validation", + seed=1337, +) +raw_test_ds = keras.utils.text_dataset_from_directory( + "aclImdb/test", batch_size=batch_size +) + +print(f"Number of batches in raw_train_ds: {raw_train_ds.cardinality()}") +print(f"Number of batches in raw_val_ds: {raw_val_ds.cardinality()}") +print(f"Number of batches in raw_test_ds: {raw_test_ds.cardinality()}") + +""" +Let's preview a few samples: +""" + +# It's important to take a look at your raw data to ensure your normalization +# and tokenization will work as expected. We can do that by taking a few +# examples from the training set and looking at them. +# This is one of the places where eager execution shines: +# we can just evaluate these tensors using .numpy() +# instead of needing to evaluate them in a Session/Graph context. +for text_batch, label_batch in raw_train_ds.take(1): + for i in range(5): + print(text_batch.numpy()[i]) + print(label_batch.numpy()[i]) + +""" +## Prepare the data + +In particular, we remove `
<br />` tags.
+"""
+
+import string
+import re
+
+
+# Having looked at our data above, we see that the raw text contains HTML break
+# tags of the form '<br />
'. These tags will not be removed by the default
+# standardizer (which doesn't strip HTML). Because of this, we will need to
+# create a custom standardization function.
+def custom_standardization(input_data):
+    lowercase = tf.strings.lower(input_data)
+    stripped_html = tf.strings.regex_replace(lowercase, "<br />
", " ") + return tf.strings.regex_replace( + stripped_html, f"[{re.escape(string.punctuation)}]", "" + ) + + +# Model constants. +max_features = 20000 +embedding_dim = 128 +sequence_length = 500 + +# Now that we have our custom standardization, we can instantiate our text +# vectorization layer. We are using this layer to normalize, split, and map +# strings to integers, so we set our 'output_mode' to 'int'. +# Note that we're using the default split function, +# and the custom standardization defined above. +# We also set an explicit maximum sequence length, since the CNNs later in our +# model won't support ragged sequences. +vectorize_layer = keras.layers.TextVectorization( + standardize=custom_standardization, + max_tokens=max_features, + output_mode="int", + output_sequence_length=sequence_length, +) + +# Now that the vectorize_layer has been created, call `adapt` on a text-only +# dataset to create the vocabulary. You don't have to batch, but for very large +# datasets this means you're not keeping spare copies of the dataset in memory. + +# Let's make a text-only dataset (no labels): +text_ds = raw_train_ds.map(lambda x, y: x) +# Let's call `adapt`: +vectorize_layer.adapt(text_ds) + +""" +## Two options to vectorize the data + +There are 2 ways we can use our text vectorization layer: + +**Option 1: Make it part of the model**, so as to obtain a model that processes raw + strings, like this: +""" + +""" + +```python +text_input = keras.Input(shape=(1,), dtype=tf.string, name='text') +x = vectorize_layer(text_input) +x = layers.Embedding(max_features + 1, embedding_dim)(x) +... +``` + +**Option 2: Apply it to the text dataset** to obtain a dataset of word indices, then + feed it into a model that expects integer sequences as inputs. + +An important difference between the two is that option 2 enables you to do +**asynchronous CPU processing and buffering** of your data when training on GPU. +So if you're training the model on GPU, you probably want to go with this option to get + the best performance. This is what we will do below. + +If we were to export our model to production, we'd ship a model that accepts raw +strings as input, like in the code snippet for option 1 above. This can be done after + training. We do this in the last section. + + +""" + + +def vectorize_text(text, label): + text = tf.expand_dims(text, -1) + return vectorize_layer(text), label + + +# Vectorize the data. +train_ds = raw_train_ds.map(vectorize_text) +val_ds = raw_val_ds.map(vectorize_text) +test_ds = raw_test_ds.map(vectorize_text) + +# Do async prefetching / buffering of the data for best performance on GPU. +train_ds = train_ds.cache().prefetch(buffer_size=10) +val_ds = val_ds.cache().prefetch(buffer_size=10) +test_ds = test_ds.cache().prefetch(buffer_size=10) + +""" +## Build a model + +We choose a simple 1D convnet starting with an `Embedding` layer. +""" + +# A integer input for vocab indices. +inputs = keras.Input(shape=(None,), dtype="int64") + +# Next, we add a layer to map those vocab indices into a space of dimensionality +# 'embedding_dim'. 
+x = layers.Embedding(max_features, embedding_dim)(inputs) +x = layers.Dropout(0.5)(x) + +# Conv1D + global max pooling +x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x) +x = layers.Conv1D(128, 7, padding="valid", activation="relu", strides=3)(x) +x = layers.GlobalMaxPooling1D()(x) + +# We add a vanilla hidden layer: +x = layers.Dense(128, activation="relu")(x) +x = layers.Dropout(0.5)(x) + +# We project onto a single unit output layer, and squash it with a sigmoid: +predictions = layers.Dense(1, activation="sigmoid", name="predictions")(x) + +model = keras.Model(inputs, predictions) + +# Compile the model with binary crossentropy loss and an adam optimizer. +model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"]) + +""" +## Train the model +""" + +epochs = 3 + +# Fit the model using the train and test datasets. +model.fit(train_ds, validation_data=val_ds, epochs=epochs) + +""" +## Evaluate the model on the test set +""" + +model.evaluate(test_ds) + +""" +## Make an end-to-end model + +If you want to obtain a model capable of processing raw strings, you can simply +create a new model (using the weights we just trained): +""" + +# A string input +inputs = keras.Input(shape=(1,), dtype="string") +# Turn strings into vocab indices +indices = vectorize_layer(inputs) +# Turn vocab indices into predictions +outputs = model(indices) + +# Our end to end model +end_to_end_model = keras.Model(inputs, outputs) +end_to_end_model.compile( + loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"] +) + +# Test it with `raw_test_ds`, which yields raw strings +end_to_end_model.evaluate(raw_test_ds) diff --git a/knowledge_base/nlp/text_classification_with_switch_transformer.py b/knowledge_base/nlp/text_classification_with_switch_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..6f19a2e187d7cdf489fb8f671eeb3359f37fec30 --- /dev/null +++ b/knowledge_base/nlp/text_classification_with_switch_transformer.py @@ -0,0 +1,328 @@ +""" +Title: Text classification with Switch Transformer +Author: [Khalid Salama](https://www.linkedin.com/in/khalid-salama-24403144/) +Date created: 2020/05/10 +Last modified: 2021/02/15 +Description: Implement a Switch Transformer for text classification. +Accelerator: GPU +""" + +""" +## Introduction + +This example demonstrates the implementation of the +[Switch Transformer](https://arxiv.org/abs/2101.03961) model for text +classification. + +The Switch Transformer replaces the feedforward network (FFN) layer in the standard +Transformer with a Mixture of Expert (MoE) routing layer, where each expert operates +independently on the tokens in the sequence. This allows increasing the model size without +increasing the computation needed to process each example. + +Note that, for training the Switch Transformer efficiently, data and model parallelism +need to be applied, so that expert modules can run simultaneously, each on its own accelerator. +While the implementation described in the paper uses the +[TensorFlow Mesh](https://github.com/tensorflow/mesh) framework for distributed training, +this example presents a simple, non-distributed implementation of the Switch Transformer +model for demonstration purposes. 
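+
+As a rough intuition for "switch" (top-1) routing, here is a toy NumPy sketch, separate
+from the Keras implementation below: a learned router scores every token against every
+expert, and each token is dispatched to the single highest-scoring expert.
+
+```python
+import numpy as np
+
+rng = np.random.default_rng(0)
+tokens = rng.normal(size=(8, 16))           # (tokens_per_batch, embed_dim)
+router_weights = rng.normal(size=(16, 4))   # (embed_dim, num_experts)
+
+router_logits = tokens @ router_weights     # (tokens_per_batch, num_experts)
+expert_index = router_logits.argmax(axis=-1)
+print(expert_index)  # one expert id per token, in the range [0, num_experts)
+```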
+""" + +""" +## Setup +""" + +import keras +from keras import ops +from keras import layers + +""" +## Download and prepare dataset +""" + +vocab_size = 20000 # Only consider the top 20k words +num_tokens_per_example = 200 # Only consider the first 200 words of each movie review +(x_train, y_train), (x_val, y_val) = keras.datasets.imdb.load_data(num_words=vocab_size) +print(len(x_train), "Training sequences") +print(len(x_val), "Validation sequences") +x_train = keras.utils.pad_sequences(x_train, maxlen=num_tokens_per_example) +x_val = keras.utils.pad_sequences(x_val, maxlen=num_tokens_per_example) + +""" +## Define hyperparameters +""" + +embed_dim = 32 # Embedding size for each token. +num_heads = 2 # Number of attention heads +ff_dim = 32 # Hidden layer size in feedforward network. +num_experts = 10 # Number of experts used in the Switch Transformer. +batch_size = 50 # Batch size. +learning_rate = 0.001 # Learning rate. +dropout_rate = 0.25 # Dropout rate. +num_epochs = 3 # Number of epochs. +num_tokens_per_batch = ( + batch_size * num_tokens_per_example +) # Total number of tokens per batch. +print(f"Number of tokens per batch: {num_tokens_per_batch}") + +""" +## Implement token & position embedding layer + +It consists of two separate embedding layers, one for tokens, one for token index (positions). +""" + + +class TokenAndPositionEmbedding(layers.Layer): + def __init__(self, maxlen, vocab_size, embed_dim): + super().__init__() + self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim) + self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim) + + def call(self, x): + maxlen = ops.shape(x)[-1] + positions = ops.arange(start=0, stop=maxlen, step=1) + positions = self.pos_emb(positions) + x = self.token_emb(x) + return x + positions + + +""" +## Implement the feedforward network + +This is used as the Mixture of Experts in the Switch Transformer. +""" + + +def create_feedforward_network(ff_dim, embed_dim, name=None): + return keras.Sequential( + [layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim)], name=name + ) + + +""" +## Implement the load-balanced loss + +This is an auxiliary loss to encourage a balanced load across experts. +""" + + +def load_balanced_loss(router_probs, expert_mask): + # router_probs [tokens_per_batch, num_experts] is the probability assigned for + # each expert per token. expert_mask [tokens_per_batch, num_experts] contains + # the expert with the highest router probability in oneโˆ’hot format. + + num_experts = ops.shape(expert_mask)[-1] + # Get the fraction of tokens routed to each expert. + # density is a vector of length num experts that sums to 1. + density = ops.mean(expert_mask, axis=0) + # Get fraction of probability mass assigned to each expert from the router + # across all tokens. density_proxy is a vector of length num experts that sums to 1. + density_proxy = ops.mean(router_probs, axis=0) + # Want both vectors to have uniform allocation (1/num experts) across all + # num_expert elements. The two vectors will be pushed towards uniform allocation + # when the dot product is minimized. 
+ loss = ops.mean(density_proxy * density) * ops.cast((num_experts**2), "float32") + return loss + + +""" +### Implement the router as a layer +""" + + +class Router(layers.Layer): + def __init__(self, num_experts, expert_capacity): + self.num_experts = num_experts + self.route = layers.Dense(units=num_experts) + self.expert_capacity = expert_capacity + super().__init__() + + def call(self, inputs, training=False): + # inputs shape: [tokens_per_batch, embed_dim] + # router_logits shape: [tokens_per_batch, num_experts] + router_logits = self.route(inputs) + + if training: + # Add noise for exploration across experts. + router_logits += keras.random.uniform( + shape=router_logits.shape, minval=0.9, maxval=1.1 + ) + # Probabilities for each token of what expert it should be sent to. + router_probs = keras.activations.softmax(router_logits, axis=-1) + # Get the topโˆ’1 expert for each token. expert_gate is the topโˆ’1 probability + # from the router for each token. expert_index is what expert each token + # is going to be routed to. + expert_gate, expert_index = ops.top_k(router_probs, k=1) + # expert_mask shape: [tokens_per_batch, num_experts] + expert_mask = ops.one_hot(expert_index, self.num_experts) + # Compute load balancing loss. + aux_loss = load_balanced_loss(router_probs, expert_mask) + self.add_loss(aux_loss) + # Experts have a fixed capacity, ensure we do not exceed it. Construct + # the batch indices, to each expert, with position in expert make sure that + # not more that expert capacity examples can be routed to each expert. + position_in_expert = ops.cast( + ops.cumsum(expert_mask, axis=0) * expert_mask, "int32" + ) + # Keep only tokens that fit within expert capacity. + expert_mask *= ops.cast( + ops.less(ops.cast(position_in_expert, "int32"), self.expert_capacity), + "float32", + ) + expert_mask_flat = ops.sum(expert_mask, axis=-1) + # Mask out the experts that have overflowed the expert capacity. + expert_gate *= expert_mask_flat + # Combine expert outputs and scaling with router probability. + # combine_tensor shape: [tokens_per_batch, num_experts, expert_capacity] + combined_tensor = ops.expand_dims( + expert_gate + * expert_mask_flat + * ops.squeeze(ops.one_hot(expert_index, self.num_experts), 1), + -1, + ) * ops.squeeze(ops.one_hot(position_in_expert, self.expert_capacity), 1) + # Create binary dispatch_tensor [tokens_per_batch, num_experts, expert_capacity] + # that is 1 if the token gets routed to the corresponding expert. 
+ dispatch_tensor = ops.cast(combined_tensor, "float32") + + return dispatch_tensor, combined_tensor + + +""" +### Implement a Switch layer +""" + + +class Switch(layers.Layer): + def __init__( + self, num_experts, embed_dim, ff_dim, num_tokens_per_batch, capacity_factor=1 + ): + self.num_experts = num_experts + self.embed_dim = embed_dim + self.experts = [ + create_feedforward_network(ff_dim, embed_dim) for _ in range(num_experts) + ] + + self.expert_capacity = num_tokens_per_batch // self.num_experts + self.router = Router(self.num_experts, self.expert_capacity) + super().__init__() + + def call(self, inputs): + batch_size = ops.shape(inputs)[0] + num_tokens_per_example = ops.shape(inputs)[1] + + # inputs shape: [num_tokens_per_batch, embed_dim] + inputs = ops.reshape(inputs, [num_tokens_per_batch, self.embed_dim]) + # dispatch_tensor shape: [expert_capacity, num_experts, tokens_per_batch] + # combine_tensor shape: [tokens_per_batch, num_experts, expert_capacity] + dispatch_tensor, combine_tensor = self.router(inputs) + # expert_inputs shape: [num_experts, expert_capacity, embed_dim] + expert_inputs = ops.einsum("ab,acd->cdb", inputs, dispatch_tensor) + expert_inputs = ops.reshape( + expert_inputs, [self.num_experts, self.expert_capacity, self.embed_dim] + ) + # Dispatch to experts + expert_input_list = ops.unstack(expert_inputs, axis=0) + expert_output_list = [ + self.experts[idx](expert_input) + for idx, expert_input in enumerate(expert_input_list) + ] + # expert_outputs shape: [expert_capacity, num_experts, embed_dim] + expert_outputs = ops.stack(expert_output_list, axis=1) + # expert_outputs_combined shape: [tokens_per_batch, embed_dim] + expert_outputs_combined = ops.einsum( + "abc,xba->xc", expert_outputs, combine_tensor + ) + # output shape: [batch_size, num_tokens_per_example, embed_dim] + outputs = ops.reshape( + expert_outputs_combined, + [batch_size, num_tokens_per_example, self.embed_dim], + ) + return outputs + + +""" +## Implement a Transformer block layer +""" + + +class TransformerBlock(layers.Layer): + def __init__(self, embed_dim, num_heads, ffn, dropout_rate=0.1): + super().__init__() + self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim) + # The ffn can be either a standard feedforward network or a switch + # layer with a Mixture of Experts. + self.ffn = ffn + self.layernorm1 = layers.LayerNormalization(epsilon=1e-6) + self.layernorm2 = layers.LayerNormalization(epsilon=1e-6) + self.dropout1 = layers.Dropout(dropout_rate) + self.dropout2 = layers.Dropout(dropout_rate) + + def call(self, inputs, training=False): + attn_output = self.att(inputs, inputs) + attn_output = self.dropout1(attn_output, training=training) + out1 = self.layernorm1(inputs + attn_output) + ffn_output = self.ffn(out1) + ffn_output = self.dropout2(ffn_output, training=training) + return self.layernorm2(out1 + ffn_output) + + +""" +## Implement the classifier + +The `TransformerBlock` layer outputs one vector for each time step of our input sequence. +Here, we take the mean across all time steps and use a feedforward network on top +of it to classify text. 
+""" + + +def create_classifier(): + switch = Switch(num_experts, embed_dim, ff_dim, num_tokens_per_batch) + transformer_block = TransformerBlock(embed_dim // num_heads, num_heads, switch) + + inputs = layers.Input(shape=(num_tokens_per_example,)) + embedding_layer = TokenAndPositionEmbedding( + num_tokens_per_example, vocab_size, embed_dim + ) + x = embedding_layer(inputs) + x = transformer_block(x) + x = layers.GlobalAveragePooling1D()(x) + x = layers.Dropout(dropout_rate)(x) + x = layers.Dense(ff_dim, activation="relu")(x) + x = layers.Dropout(dropout_rate)(x) + outputs = layers.Dense(2, activation="softmax")(x) + + classifier = keras.Model(inputs=inputs, outputs=outputs) + return classifier + + +""" +## Train and evaluate the model +""" + + +def run_experiment(classifier): + classifier.compile( + optimizer=keras.optimizers.Adam(learning_rate), + loss="sparse_categorical_crossentropy", + metrics=["accuracy"], + ) + history = classifier.fit( + x_train, + y_train, + batch_size=batch_size, + epochs=num_epochs, + validation_data=(x_val, y_val), + ) + return history + + +classifier = create_classifier() +run_experiment(classifier) + + +""" +## Conclusion + +Compared to the standard Transformer architecture, the Switch Transformer can have a much +larger number of parameters, leading to increased model +capacity, while maintaining a reasonable computational cost. +""" diff --git a/knowledge_base/nlp/text_classification_with_transformer.py b/knowledge_base/nlp/text_classification_with_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..9a049bb9f8f6a72b1f9bba102fadcc6f0a700fd6 --- /dev/null +++ b/knowledge_base/nlp/text_classification_with_transformer.py @@ -0,0 +1,118 @@ +""" +Title: Text classification with Transformer +Author: [Apoorv Nandan](https://twitter.com/NandanApoorv) +Date created: 2020/05/10 +Last modified: 2024/01/18 +Description: Implement a Transformer block as a Keras layer and use it for text classification. +Accelerator: GPU +Converted to Keras 3 by: [Sitam Meur](https://github.com/sitamgithub-MSIT) +""" + +""" +## Setup +""" + +import keras +from keras import ops +from keras import layers + + +""" +## Implement a Transformer block as a layer +""" + + +class TransformerBlock(layers.Layer): + def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1): + super().__init__() + self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim) + self.ffn = keras.Sequential( + [ + layers.Dense(ff_dim, activation="relu"), + layers.Dense(embed_dim), + ] + ) + self.layernorm1 = layers.LayerNormalization(epsilon=1e-6) + self.layernorm2 = layers.LayerNormalization(epsilon=1e-6) + self.dropout1 = layers.Dropout(rate) + self.dropout2 = layers.Dropout(rate) + + def call(self, inputs): + attn_output = self.att(inputs, inputs) + attn_output = self.dropout1(attn_output) + out1 = self.layernorm1(inputs + attn_output) + ffn_output = self.ffn(out1) + ffn_output = self.dropout2(ffn_output) + return self.layernorm2(out1 + ffn_output) + + +""" +## Implement embedding layer + +Two separate embedding layers, one for tokens, one for token index (positions). 
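+
+The positional embeddings are added to the token embeddings and broadcast over the
+batch axis. A minimal shape sketch with toy sizes (illustrative only, not the
+configuration used below):
+
+```python
+from keras import layers, ops
+
+tokens = ops.ones((2, 5), dtype="int32")  # (batch_size, maxlen)
+tok_emb = layers.Embedding(input_dim=100, output_dim=8)(tokens)  # (2, 5, 8)
+pos_emb = layers.Embedding(input_dim=5, output_dim=8)(ops.arange(0, 5))  # (5, 8)
+print((tok_emb + pos_emb).shape)  # broadcasts to (2, 5, 8)
+```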
+""" + + +class TokenAndPositionEmbedding(layers.Layer): + def __init__(self, maxlen, vocab_size, embed_dim): + super().__init__() + self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim) + self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim) + + def call(self, x): + maxlen = ops.shape(x)[-1] + positions = ops.arange(start=0, stop=maxlen, step=1) + positions = self.pos_emb(positions) + x = self.token_emb(x) + return x + positions + + +""" +## Download and prepare dataset +""" + +vocab_size = 20000 # Only consider the top 20k words +maxlen = 200 # Only consider the first 200 words of each movie review +(x_train, y_train), (x_val, y_val) = keras.datasets.imdb.load_data(num_words=vocab_size) +print(len(x_train), "Training sequences") +print(len(x_val), "Validation sequences") +x_train = keras.utils.pad_sequences(x_train, maxlen=maxlen) +x_val = keras.utils.pad_sequences(x_val, maxlen=maxlen) + +""" +## Create classifier model using transformer layer + +Transformer layer outputs one vector for each time step of our input sequence. +Here, we take the mean across all time steps and +use a feed forward network on top of it to classify text. +""" + + +embed_dim = 32 # Embedding size for each token +num_heads = 2 # Number of attention heads +ff_dim = 32 # Hidden layer size in feed forward network inside transformer + +inputs = layers.Input(shape=(maxlen,)) +embedding_layer = TokenAndPositionEmbedding(maxlen, vocab_size, embed_dim) +x = embedding_layer(inputs) +transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim) +x = transformer_block(x) +x = layers.GlobalAveragePooling1D()(x) +x = layers.Dropout(0.1)(x) +x = layers.Dense(20, activation="relu")(x) +x = layers.Dropout(0.1)(x) +outputs = layers.Dense(2, activation="softmax")(x) + +model = keras.Model(inputs=inputs, outputs=outputs) + + +""" +## Train and Evaluate +""" + +model.compile( + optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"] +) +history = model.fit( + x_train, y_train, batch_size=32, epochs=2, validation_data=(x_val, y_val) +) diff --git a/knowledge_base/nlp/text_extraction_with_bert.py b/knowledge_base/nlp/text_extraction_with_bert.py new file mode 100644 index 0000000000000000000000000000000000000000..7964c776bb8495609cb8d5cfaef359bf851bbacc --- /dev/null +++ b/knowledge_base/nlp/text_extraction_with_bert.py @@ -0,0 +1,347 @@ +""" +Title: Text Extraction with BERT +Author: [Apoorv Nandan](https://twitter.com/NandanApoorv) +Date created: 2020/05/23 +Last modified: 2020/05/23 +Description: Fine tune pretrained BERT from HuggingFace Transformers on SQuAD. +Accelerator: TPU +""" + +""" +## Introduction + +This demonstration uses SQuAD (Stanford Question-Answering Dataset). +In SQuAD, an input consists of a question, and a paragraph for context. +The goal is to find the span of text in the paragraph that answers the question. +We evaluate our performance on this data with the "Exact Match" metric, +which measures the percentage of predictions that exactly match any one of the +ground-truth answers. + +We fine-tune a BERT model to perform this task as follows: + +1. Feed the context and the question as inputs to BERT. +2. Take two vectors S and T with dimensions equal to that of + hidden states in BERT. +3. Compute the probability of each token being the start and end of + the answer span. 
The probability of a token being the start of + the answer is given by a dot product between S and the representation + of the token in the last layer of BERT, followed by a softmax over all tokens. + The probability of a token being the end of the answer is computed + similarly with the vector T. +4. Fine-tune BERT and learn S and T along the way. + +**References:** + +- [BERT](https://arxiv.org/abs/1810.04805) +- [SQuAD](https://arxiv.org/abs/1606.05250) +""" +""" +## Setup +""" +import os +import re +import json +import string +import numpy as np +import tensorflow as tf +from tensorflow import keras +from tensorflow.keras import layers +from tokenizers import BertWordPieceTokenizer +from transformers import BertTokenizer, TFBertModel, BertConfig + +max_len = 384 +configuration = BertConfig() # default parameters and configuration for BERT + +""" +## Set-up BERT tokenizer +""" +# Save the slow pretrained tokenizer +slow_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") +save_path = "bert_base_uncased/" +if not os.path.exists(save_path): + os.makedirs(save_path) +slow_tokenizer.save_pretrained(save_path) + +# Load the fast tokenizer from saved file +tokenizer = BertWordPieceTokenizer("bert_base_uncased/vocab.txt", lowercase=True) + +""" +## Load the data +""" +train_data_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/train-v1.1.json" +train_path = keras.utils.get_file("train.json", train_data_url) +eval_data_url = "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json" +eval_path = keras.utils.get_file("eval.json", eval_data_url) + +""" +## Preprocess the data + +1. Go through the JSON file and store every record as a `SquadExample` object. +2. Go through each `SquadExample` and create `x_train, y_train, x_eval, y_eval`. +""" + + +class SquadExample: + def __init__(self, question, context, start_char_idx, answer_text, all_answers): + self.question = question + self.context = context + self.start_char_idx = start_char_idx + self.answer_text = answer_text + self.all_answers = all_answers + self.skip = False + + def preprocess(self): + context = self.context + question = self.question + answer_text = self.answer_text + start_char_idx = self.start_char_idx + + # Clean context, answer and question + context = " ".join(str(context).split()) + question = " ".join(str(question).split()) + answer = " ".join(str(answer_text).split()) + + # Find end character index of answer in context + end_char_idx = start_char_idx + len(answer) + if end_char_idx >= len(context): + self.skip = True + return + + # Mark the character indexes in context that are in answer + is_char_in_ans = [0] * len(context) + for idx in range(start_char_idx, end_char_idx): + is_char_in_ans[idx] = 1 + + # Tokenize context + tokenized_context = tokenizer.encode(context) + + # Find tokens that were created from answer characters + ans_token_idx = [] + for idx, (start, end) in enumerate(tokenized_context.offsets): + if sum(is_char_in_ans[start:end]) > 0: + ans_token_idx.append(idx) + + if len(ans_token_idx) == 0: + self.skip = True + return + + # Find start and end token index for tokens from answer + start_token_idx = ans_token_idx[0] + end_token_idx = ans_token_idx[-1] + + # Tokenize question + tokenized_question = tokenizer.encode(question) + + # Create inputs + input_ids = tokenized_context.ids + tokenized_question.ids[1:] + token_type_ids = [0] * len(tokenized_context.ids) + [1] * len( + tokenized_question.ids[1:] + ) + attention_mask = [1] * len(input_ids) + + # Pad and create attention masks. 
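+        # All three inputs are zero-padded up to `max_len`; examples longer than
+        # `max_len` are skipped entirely (presumably so the answer span is never
+        # cut off by truncation).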
+ # Skip if truncation is needed + padding_length = max_len - len(input_ids) + if padding_length > 0: # pad + input_ids = input_ids + ([0] * padding_length) + attention_mask = attention_mask + ([0] * padding_length) + token_type_ids = token_type_ids + ([0] * padding_length) + elif padding_length < 0: # skip + self.skip = True + return + + self.input_ids = input_ids + self.token_type_ids = token_type_ids + self.attention_mask = attention_mask + self.start_token_idx = start_token_idx + self.end_token_idx = end_token_idx + self.context_token_to_char = tokenized_context.offsets + + +with open(train_path) as f: + raw_train_data = json.load(f) + +with open(eval_path) as f: + raw_eval_data = json.load(f) + + +def create_squad_examples(raw_data): + squad_examples = [] + for item in raw_data["data"]: + for para in item["paragraphs"]: + context = para["context"] + for qa in para["qas"]: + question = qa["question"] + answer_text = qa["answers"][0]["text"] + all_answers = [_["text"] for _ in qa["answers"]] + start_char_idx = qa["answers"][0]["answer_start"] + squad_eg = SquadExample( + question, context, start_char_idx, answer_text, all_answers + ) + squad_eg.preprocess() + squad_examples.append(squad_eg) + return squad_examples + + +def create_inputs_targets(squad_examples): + dataset_dict = { + "input_ids": [], + "token_type_ids": [], + "attention_mask": [], + "start_token_idx": [], + "end_token_idx": [], + } + for item in squad_examples: + if item.skip == False: + for key in dataset_dict: + dataset_dict[key].append(getattr(item, key)) + for key in dataset_dict: + dataset_dict[key] = np.array(dataset_dict[key]) + + x = [ + dataset_dict["input_ids"], + dataset_dict["token_type_ids"], + dataset_dict["attention_mask"], + ] + y = [dataset_dict["start_token_idx"], dataset_dict["end_token_idx"]] + return x, y + + +train_squad_examples = create_squad_examples(raw_train_data) +x_train, y_train = create_inputs_targets(train_squad_examples) +print(f"{len(train_squad_examples)} training points created.") + +eval_squad_examples = create_squad_examples(raw_eval_data) +x_eval, y_eval = create_inputs_targets(eval_squad_examples) +print(f"{len(eval_squad_examples)} evaluation points created.") + +""" +Create the Question-Answering Model using BERT and Functional API +""" + + +def create_model(): + ## BERT encoder + encoder = TFBertModel.from_pretrained("bert-base-uncased") + + ## QA Model + input_ids = layers.Input(shape=(max_len,), dtype=tf.int32) + token_type_ids = layers.Input(shape=(max_len,), dtype=tf.int32) + attention_mask = layers.Input(shape=(max_len,), dtype=tf.int32) + embedding = encoder( + input_ids, token_type_ids=token_type_ids, attention_mask=attention_mask + )[0] + + start_logits = layers.Dense(1, name="start_logit", use_bias=False)(embedding) + start_logits = layers.Flatten()(start_logits) + + end_logits = layers.Dense(1, name="end_logit", use_bias=False)(embedding) + end_logits = layers.Flatten()(end_logits) + + start_probs = layers.Activation(keras.activations.softmax)(start_logits) + end_probs = layers.Activation(keras.activations.softmax)(end_logits) + + model = keras.Model( + inputs=[input_ids, token_type_ids, attention_mask], + outputs=[start_probs, end_probs], + ) + loss = keras.losses.SparseCategoricalCrossentropy(from_logits=False) + optimizer = keras.optimizers.Adam(lr=5e-5) + model.compile(optimizer=optimizer, loss=[loss, loss]) + return model + + +""" +This code should preferably be run on Google Colab TPU runtime. +With Colab TPUs, each epoch will take 5-6 minutes. 
+""" +use_tpu = True +if use_tpu: + # Create distribution strategy + tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect() + strategy = tf.distribute.TPUStrategy(tpu) + + # Create model + with strategy.scope(): + model = create_model() +else: + model = create_model() + +model.summary() + +""" +## Create evaluation Callback + +This callback will compute the exact match score using the validation data +after every epoch. +""" + + +def normalize_text(text): + text = text.lower() + + # Remove punctuations + exclude = set(string.punctuation) + text = "".join(ch for ch in text if ch not in exclude) + + # Remove articles + regex = re.compile(r"\b(a|an|the)\b", re.UNICODE) + text = re.sub(regex, " ", text) + + # Remove extra white space + text = " ".join(text.split()) + return text + + +class ExactMatch(keras.callbacks.Callback): + """ + Each `SquadExample` object contains the character level offsets for each token + in its input paragraph. We use them to get back the span of text corresponding + to the tokens between our predicted start and end tokens. + All the ground-truth answers are also present in each `SquadExample` object. + We calculate the percentage of data points where the span of text obtained + from model predictions matches one of the ground-truth answers. + """ + + def __init__(self, x_eval, y_eval): + self.x_eval = x_eval + self.y_eval = y_eval + + def on_epoch_end(self, epoch, logs=None): + pred_start, pred_end = self.model.predict(self.x_eval) + count = 0 + eval_examples_no_skip = [_ for _ in eval_squad_examples if _.skip == False] + for idx, (start, end) in enumerate(zip(pred_start, pred_end)): + squad_eg = eval_examples_no_skip[idx] + offsets = squad_eg.context_token_to_char + start = np.argmax(start) + end = np.argmax(end) + if start >= len(offsets): + continue + pred_char_start = offsets[start][0] + if end < len(offsets): + pred_char_end = offsets[end][1] + pred_ans = squad_eg.context[pred_char_start:pred_char_end] + else: + pred_ans = squad_eg.context[pred_char_start:] + + normalized_pred_ans = normalize_text(pred_ans) + normalized_true_ans = [normalize_text(_) for _ in squad_eg.all_answers] + if normalized_pred_ans in normalized_true_ans: + count += 1 + acc = count / len(self.y_eval[0]) + print(f"\nepoch={epoch+1}, exact match score={acc:.2f}") + + +""" +## Train and Evaluate +""" +exact_match_callback = ExactMatch(x_eval, y_eval) +model.fit( + x_train, + y_train, + epochs=1, # For demonstration, 3 epochs are recommended + verbose=2, + batch_size=64, + callbacks=[exact_match_callback], +) diff --git a/knowledge_base/nlp/tweet-classification-using-tfdf.py b/knowledge_base/nlp/tweet-classification-using-tfdf.py new file mode 100644 index 0000000000000000000000000000000000000000..8320a0bc6eaf1a5cb72676085e58ef90242fbcf6 --- /dev/null +++ b/knowledge_base/nlp/tweet-classification-using-tfdf.py @@ -0,0 +1,297 @@ +""" +Title: Text classification using Decision Forests and pretrained embeddings +Author: Gitesh Chawda +Date created: 09/05/2022 +Last modified: 09/05/2022 +Description: Using Tensorflow Decision Forests for text classification. +Accelerator: GPU +""" + +""" +## Introduction + +[TensorFlow Decision Forests](https://www.tensorflow.org/decision_forests) (TF-DF) +is a collection of state-of-the-art algorithms for Decision Forest models that are +compatible with Keras APIs. The module includes Random Forests, Gradient Boosted Trees, +and CART, and can be used for regression, classification, and ranking tasks. 
+ +In this example we will use Gradient Boosted Trees with pretrained embeddings to +classify disaster-related tweets. + +### See also: + +- [TF-DF beginner tutorial](https://www.tensorflow.org/decision_forests/tutorials/beginner_colab) +- [TF-DF intermediate tutorial](https://www.tensorflow.org/decision_forests/tutorials/intermediate_colab). +""" + +""" +Install Tensorflow Decision Forest using following command : +`pip install tensorflow_decision_forests` +""" + + +""" +## Imports +""" + +import pandas as pd +import numpy as np +import tensorflow as tf +from tensorflow import keras +import tensorflow_hub as hub +from tensorflow.keras import layers +import tensorflow_decision_forests as tfdf +import matplotlib.pyplot as plt + +""" +## Get the data + +The Dataset is available on [Kaggle](https://www.kaggle.com/c/nlp-getting-started) + +Dataset description: + +**Files:** + +- train.csv: the training set + +**Columns:** + +- id: a unique identifier for each tweet +- text: the text of the tweet +- location: the location the tweet was sent from (may be blank) +- keyword: a particular keyword from the tweet (may be blank) +- target: in train.csv only, this denotes whether a tweet is about a real disaster (1) or not (0) +""" + +# Turn .csv files into pandas DataFrame's +df = pd.read_csv( + "https://raw.githubusercontent.com/IMvision12/Tweets-Classification-NLP/main/train.csv" +) +print(df.head()) + +""" +The dataset includes 7613 samples with 5 columns: +""" + +print(f"Training dataset shape: {df.shape}") + +""" +Shuffling and dropping unnecessary columns: +""" + +df_shuffled = df.sample(frac=1, random_state=42) +# Dropping id, keyword and location columns as these columns consists of mostly nan values +# we will be using only text and target columns +df_shuffled.drop(["id", "keyword", "location"], axis=1, inplace=True) +df_shuffled.reset_index(inplace=True, drop=True) +print(df_shuffled.head()) + +""" +Printing information about the shuffled dataframe: +""" + +print(df_shuffled.info()) + +""" +Total number of "disaster" and "non-disaster" tweets: +""" + +print( + "Total Number of disaster and non-disaster tweets: " + f"{df_shuffled.target.value_counts()}" +) + +""" +Let's preview a few samples: +""" + +for index, example in df_shuffled[:5].iterrows(): + print(f"Example #{index}") + print(f"\tTarget : {example['target']}") + print(f"\tText : {example['text']}") + +""" +Splitting dataset into training and test sets: +""" +test_df = df_shuffled.sample(frac=0.1, random_state=42) +train_df = df_shuffled.drop(test_df.index) +print(f"Using {len(train_df)} samples for training and {len(test_df)} for validation") + +""" +Total number of "disaster" and "non-disaster" tweets in the training data: +""" +print(train_df["target"].value_counts()) + +""" +Total number of "disaster" and "non-disaster" tweets in the test data: +""" + +print(test_df["target"].value_counts()) + +""" +## Convert data to a `tf.data.Dataset` +""" + + +def create_dataset(dataframe): + dataset = tf.data.Dataset.from_tensor_slices( + (dataframe["text"].to_numpy(), dataframe["target"].to_numpy()) + ) + dataset = dataset.batch(100) + dataset = dataset.prefetch(tf.data.AUTOTUNE) + return dataset + + +train_ds = create_dataset(train_df) +test_ds = create_dataset(test_df) + +""" +## Downloading pretrained embeddings + +The Universal Sentence Encoder embeddings encode text into high-dimensional vectors that can be +used for text classification, semantic similarity, clustering and other natural language +tasks. 
They're trained on a variety of data sources and a variety of tasks. Their input is +variable-length English text and their output is a 512 dimensional vector. + +To learn more about these pretrained embeddings, see +[Universal Sentence Encoder](https://tfhub.dev/google/universal-sentence-encoder/4). + +""" + +sentence_encoder_layer = hub.KerasLayer( + "https://tfhub.dev/google/universal-sentence-encoder/4" +) + +""" +## Creating our models + +We create two models. In the first model (model_1) raw text will be first encoded via +pretrained embeddings and then passed to a Gradient Boosted Tree model for +classification. In the second model (model_2) raw text will be directly passed to +the Gradient Boosted Trees model. +""" + +""" +Building model_1 +""" + +inputs = layers.Input(shape=(), dtype=tf.string) +outputs = sentence_encoder_layer(inputs) +preprocessor = keras.Model(inputs=inputs, outputs=outputs) +model_1 = tfdf.keras.GradientBoostedTreesModel(preprocessing=preprocessor) + +""" +Building model_2 +""" + +model_2 = tfdf.keras.GradientBoostedTreesModel() + +""" +## Train the models + +We compile our model by passing the metrics `Accuracy`, `Recall`, `Precision` and +`AUC`. When it comes to the loss, TF-DF automatically detects the best loss for the task +(Classification or regression). It is printed in the model summary. + +Also, because they're batch-training models rather than mini-batch gradient descent models, +TF-DF models do not need a validation dataset to monitor overfitting, or to stop +training early. Some algorithms do not use a validation dataset (e.g. Random Forest) +while some others do (e.g. Gradient Boosted Trees). If a validation dataset is +needed, it will be extracted automatically from the training dataset. +""" + +# Compiling model_1 +model_1.compile(metrics=["Accuracy", "Recall", "Precision", "AUC"]) +# Here we do not specify epochs as, TF-DF trains exactly one epoch of the dataset +model_1.fit(train_ds) + +# Compiling model_2 +model_2.compile(metrics=["Accuracy", "Recall", "Precision", "AUC"]) +# Here we do not specify epochs as, TF-DF trains exactly one epoch of the dataset +model_2.fit(train_ds) + +""" +Prints training logs of model_1 +""" + +logs_1 = model_1.make_inspector().training_logs() +print(logs_1) + +""" +Prints training logs of model_2 +""" + +logs_2 = model_2.make_inspector().training_logs() +print(logs_2) + +""" +The model.summary() method prints a variety of information about your decision tree model, including model type, task, input features, and feature importance. 
+""" + +print("model_1 summary: ") +print(model_1.summary()) +print() +print("model_2 summary: ") +print(model_2.summary()) + +""" +## Plotting training metrics +""" + + +def plot_curve(logs): + plt.figure(figsize=(12, 4)) + + plt.subplot(1, 2, 1) + plt.plot([log.num_trees for log in logs], [log.evaluation.accuracy for log in logs]) + plt.xlabel("Number of trees") + plt.ylabel("Accuracy") + + plt.subplot(1, 2, 2) + plt.plot([log.num_trees for log in logs], [log.evaluation.loss for log in logs]) + plt.xlabel("Number of trees") + plt.ylabel("Loss") + + plt.show() + + +plot_curve(logs_1) +plot_curve(logs_2) + +""" +## Evaluating on test data +""" + +results = model_1.evaluate(test_ds, return_dict=True, verbose=0) +print("model_1 Evaluation: \n") +for name, value in results.items(): + print(f"{name}: {value:.4f}") + +results = model_2.evaluate(test_ds, return_dict=True, verbose=0) +print("model_2 Evaluation: \n") +for name, value in results.items(): + print(f"{name}: {value:.4f}") + +""" +## Predicting on validation data +""" + +test_df.reset_index(inplace=True, drop=True) +for index, row in test_df.iterrows(): + text = tf.expand_dims(row["text"], axis=0) + preds = model_1.predict_step(text) + preds = tf.squeeze(tf.round(preds)) + print(f"Text: {row['text']}") + print(f"Prediction: {int(preds)}") + print(f"Ground Truth : {row['target']}") + if index == 10: + break + +""" +## Concluding remarks + +The TensorFlow Decision Forests package provides powerful models +that work especially well with structured data. In our experiments, +the Gradient Boosted Tree model with pretrained embeddings achieved 81.6% +test accuracy while the plain Gradient Boosted Tree model had 54.4% accuracy. +""" diff --git a/knowledge_base/rl/actor_critic_cartpole.py b/knowledge_base/rl/actor_critic_cartpole.py new file mode 100644 index 0000000000000000000000000000000000000000..c44552446c7d810465296fd886c53761886b9f21 --- /dev/null +++ b/knowledge_base/rl/actor_critic_cartpole.py @@ -0,0 +1,189 @@ +""" +Title: Actor Critic Method +Author: [Apoorv Nandan](https://twitter.com/NandanApoorv) +Date created: 2020/05/13 +Last modified: 2024/02/22 +Description: Implement Actor Critic Method in CartPole environment. +Accelerator: NONE +Converted to Keras 3 by: [Sitam Meur](https://github.com/sitamgithub-MSIT) +""" + +""" +## Introduction + +This script shows an implementation of Actor Critic method on CartPole-V0 environment. + +### Actor Critic Method + +As an agent takes actions and moves through an environment, it learns to map +the observed state of the environment to two possible outputs: + +1. Recommended action: A probability value for each action in the action space. + The part of the agent responsible for this output is called the **actor**. +2. Estimated rewards in the future: Sum of all rewards it expects to receive in the + future. The part of the agent responsible for this output is the **critic**. + +Agent and Critic learn to perform their tasks, such that the recommended actions +from the actor maximize the rewards. + +### CartPole-V0 + +A pole is attached to a cart placed on a frictionless track. The agent has to apply +force to move the cart. It is rewarded for every time step the pole +remains upright. The agent, therefore, must learn to keep the pole from falling over. 
+ +### References + +- [Environment documentation](https://gymnasium.farama.org/environments/classic_control/cart_pole/) +- [CartPole paper](http://www.derongliu.org/adp/adp-cdrom/Barto1983.pdf) +- [Actor Critic Method](https://hal.inria.fr/hal-00840470/document) +""" +""" +## Setup +""" + +import os + +os.environ["KERAS_BACKEND"] = "tensorflow" +import gym +import numpy as np +import keras +from keras import ops +from keras import layers +import tensorflow as tf + +# Configuration parameters for the whole setup +seed = 42 +gamma = 0.99 # Discount factor for past rewards +max_steps_per_episode = 10000 +# Adding `render_mode='human'` will show the attempts of the agent +env = gym.make("CartPole-v0") # Create the environment +env.reset(seed=seed) +eps = np.finfo(np.float32).eps.item() # Smallest number such that 1.0 + eps != 1.0 + +""" +## Implement Actor Critic network + +This network learns two functions: + +1. Actor: This takes as input the state of our environment and returns a +probability value for each action in its action space. +2. Critic: This takes as input the state of our environment and returns +an estimate of total rewards in the future. + +In our implementation, they share the initial layer. +""" + +num_inputs = 4 +num_actions = 2 +num_hidden = 128 + +inputs = layers.Input(shape=(num_inputs,)) +common = layers.Dense(num_hidden, activation="relu")(inputs) +action = layers.Dense(num_actions, activation="softmax")(common) +critic = layers.Dense(1)(common) + +model = keras.Model(inputs=inputs, outputs=[action, critic]) + +""" +## Train +""" + +optimizer = keras.optimizers.Adam(learning_rate=0.01) +huber_loss = keras.losses.Huber() +action_probs_history = [] +critic_value_history = [] +rewards_history = [] +running_reward = 0 +episode_count = 0 + +while True: # Run until solved + state = env.reset()[0] + episode_reward = 0 + with tf.GradientTape() as tape: + for timestep in range(1, max_steps_per_episode): + + state = ops.convert_to_tensor(state) + state = ops.expand_dims(state, 0) + + # Predict action probabilities and estimated future rewards + # from environment state + action_probs, critic_value = model(state) + critic_value_history.append(critic_value[0, 0]) + + # Sample action from action probability distribution + action = np.random.choice(num_actions, p=np.squeeze(action_probs)) + action_probs_history.append(ops.log(action_probs[0, action])) + + # Apply the sampled action in our environment + state, reward, done, *_ = env.step(action) + rewards_history.append(reward) + episode_reward += reward + + if done: + break + + # Update running reward to check condition for solving + running_reward = 0.05 * episode_reward + (1 - 0.05) * running_reward + + # Calculate expected value from rewards + # - At each timestep what was the total reward received after that timestep + # - Rewards in the past are discounted by multiplying them with gamma + # - These are the labels for our critic + returns = [] + discounted_sum = 0 + for r in rewards_history[::-1]: + discounted_sum = r + gamma * discounted_sum + returns.insert(0, discounted_sum) + + # Normalize + returns = np.array(returns) + returns = (returns - np.mean(returns)) / (np.std(returns) + eps) + returns = returns.tolist() + + # Calculating loss values to update our network + history = zip(action_probs_history, critic_value_history, returns) + actor_losses = [] + critic_losses = [] + for log_prob, value, ret in history: + # At this point in history, the critic estimated that we would get a + # total reward = `value` in the future. 
We took an action with log probability
+            # of `log_prob` and ended up receiving a total reward = `ret`.
+            # The actor must be updated so that it predicts an action that leads to
+            # high rewards (compared to the critic's estimate) with high probability.
+            diff = ret - value
+            actor_losses.append(-log_prob * diff)  # actor loss
+
+            # The critic must be updated so that it predicts a better estimate of
+            # the future rewards.
+            critic_losses.append(
+                huber_loss(ops.expand_dims(value, 0), ops.expand_dims(ret, 0))
+            )
+
+        # Backpropagation
+        loss_value = sum(actor_losses) + sum(critic_losses)
+        grads = tape.gradient(loss_value, model.trainable_variables)
+        optimizer.apply_gradients(zip(grads, model.trainable_variables))
+
+        # Clear the loss and reward history
+        action_probs_history.clear()
+        critic_value_history.clear()
+        rewards_history.clear()
+
+    # Log details
+    episode_count += 1
+    if episode_count % 10 == 0:
+        template = "running reward: {:.2f} at episode {}"
+        print(template.format(running_reward, episode_count))
+
+    if running_reward > 195:  # Condition to consider the task solved
+        print("Solved at episode {}!".format(episode_count))
+        break
+"""
+## Visualizations
+In early stages of training:
+![Imgur](https://i.imgur.com/5gCs5kH.gif)
+
+In later stages of training:
+![Imgur](https://i.imgur.com/5ziiZUD.gif)
+"""
diff --git a/knowledge_base/rl/ddpg_pendulum.py b/knowledge_base/rl/ddpg_pendulum.py
new file mode 100644
index 0000000000000000000000000000000000000000..2d5b192fea12e321136a80bfc4a576549f4461f5
--- /dev/null
+++ b/knowledge_base/rl/ddpg_pendulum.py
@@ -0,0 +1,433 @@
+"""
+Title: Deep Deterministic Policy Gradient (DDPG)
+Author: [amifunny](https://github.com/amifunny)
+Date created: 2020/06/04
+Last modified: 2024/03/23
+Description: Implementing DDPG algorithm on the Inverted Pendulum Problem.
+Accelerator: None
+"""
+
+"""
+## Introduction
+
+**Deep Deterministic Policy Gradient (DDPG)** is a model-free off-policy algorithm for
+learning continuous actions.
+
+It combines ideas from DPG (Deterministic Policy Gradient) and DQN (Deep Q-Network).
+It uses Experience Replay and slow-learning target networks from DQN, and it is based on
+DPG, which can operate over continuous action spaces.
+
+This tutorial closely follows this paper:
+[Continuous control with deep reinforcement learning](https://arxiv.org/abs/1509.02971)
+
+## Problem
+
+We are trying to solve the classic **Inverted Pendulum** control problem.
+In this setting, the only control we have is the torque we apply to swing the
+pendulum left or right.
+
+What makes this problem challenging for Q-Learning algorithms is that the actions
+are **continuous** instead of being **discrete**. That is, instead of using two
+discrete actions like `-1` or `+1`, we have to select from an infinite range of
+actions between `-2` and `+2`.
+
+## Quick theory
+
+Just like the Actor-Critic method, we have two networks:
+
+1. Actor - It proposes an action given a state.
+2. Critic - It predicts if the action is good (positive value) or bad (negative value)
+given a state and an action.
+
+DDPG uses two more techniques not present in the original DQN:
+
+**First, it uses two Target networks.**
+
+**Why?** Because it adds stability to training. In short, we are learning from estimated
+targets and Target networks are updated slowly, hence keeping our estimated targets
+stable.
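+
+As a sketch (assuming `online_model`, `target_model`, and a small `tau`, e.g. `0.005`,
+are defined), this slow "soft" update is just an exponential moving average of the
+weights, matching the `update_target()` function defined later in this script:
+
+```python
+new_weights = [
+    tau * w + (1.0 - tau) * tw
+    for w, tw in zip(online_model.get_weights(), target_model.get_weights())
+]
+target_model.set_weights(new_weights)
+```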
+ +Conceptually, this is like saying, "I have an idea of how to play this well, +I'm going to try it out for a bit until I find something better", +as opposed to saying "I'm going to re-learn how to play this entire game after every +move". +See this [StackOverflow answer](https://stackoverflow.com/a/54238556/13475679). + +**Second, it uses Experience Replay.** + +We store list of tuples `(state, action, reward, next_state)`, and instead of +learning only from recent experience, we learn from sampling all of our experience +accumulated so far. + +Now, let's see how is it implemented. +""" +import os + +os.environ["KERAS_BACKEND"] = "tensorflow" + +import keras +from keras import layers + +import tensorflow as tf +import gymnasium as gym +import numpy as np +import matplotlib.pyplot as plt + +""" +We use [Gymnasium](https://gymnasium.farama.org/) to create the environment. +We will use the `upper_bound` parameter to scale our actions later. +""" + +# Specify the `render_mode` parameter to show the attempts of the agent in a pop up window. +env = gym.make("Pendulum-v1") # , render_mode="human") + +num_states = env.observation_space.shape[0] +print("Size of State Space -> {}".format(num_states)) +num_actions = env.action_space.shape[0] +print("Size of Action Space -> {}".format(num_actions)) + +upper_bound = env.action_space.high[0] +lower_bound = env.action_space.low[0] + +print("Max Value of Action -> {}".format(upper_bound)) +print("Min Value of Action -> {}".format(lower_bound)) + +""" +To implement better exploration by the Actor network, we use noisy perturbations, +specifically +an **Ornstein-Uhlenbeck process** for generating noise, as described in the paper. +It samples noise from a correlated normal distribution. +""" + + +class OUActionNoise: + def __init__(self, mean, std_deviation, theta=0.15, dt=1e-2, x_initial=None): + self.theta = theta + self.mean = mean + self.std_dev = std_deviation + self.dt = dt + self.x_initial = x_initial + self.reset() + + def __call__(self): + # Formula taken from https://www.wikipedia.org/wiki/Ornstein-Uhlenbeck_process + x = ( + self.x_prev + + self.theta * (self.mean - self.x_prev) * self.dt + + self.std_dev * np.sqrt(self.dt) * np.random.normal(size=self.mean.shape) + ) + # Store x into x_prev + # Makes next noise dependent on current one + self.x_prev = x + return x + + def reset(self): + if self.x_initial is not None: + self.x_prev = self.x_initial + else: + self.x_prev = np.zeros_like(self.mean) + + +""" +The `Buffer` class implements Experience Replay. + +--- +![Algorithm](https://i.imgur.com/mS6iGyJ.jpg) +--- + + +**Critic loss** - Mean Squared Error of `y - Q(s, a)` +where `y` is the expected return as seen by the Target network, +and `Q(s, a)` is action value predicted by the Critic network. `y` is a moving target +that the critic model tries to achieve; we make this target +stable by updating the Target model slowly. + +**Actor loss** - This is computed using the mean of the value given by the Critic network +for the actions taken by the Actor network. We seek to maximize this quantity. + +Hence we update the Actor network so that it produces actions that get +the maximum predicted value as seen by the Critic, for a given state. +""" + + +class Buffer: + def __init__(self, buffer_capacity=100000, batch_size=64): + # Number of "experiences" to store at max + self.buffer_capacity = buffer_capacity + # Num of tuples to train on. + self.batch_size = batch_size + + # Its tells us num of times record() was called. 
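+        # record() below uses buffer_counter % buffer_capacity as the write index,
+        # so the oldest samples are overwritten once the buffer is full.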
+ self.buffer_counter = 0 + + # Instead of list of tuples as the exp.replay concept go + # We use different np.arrays for each tuple element + self.state_buffer = np.zeros((self.buffer_capacity, num_states)) + self.action_buffer = np.zeros((self.buffer_capacity, num_actions)) + self.reward_buffer = np.zeros((self.buffer_capacity, 1)) + self.next_state_buffer = np.zeros((self.buffer_capacity, num_states)) + + # Takes (s,a,r,s') observation tuple as input + def record(self, obs_tuple): + # Set index to zero if buffer_capacity is exceeded, + # replacing old records + index = self.buffer_counter % self.buffer_capacity + + self.state_buffer[index] = obs_tuple[0] + self.action_buffer[index] = obs_tuple[1] + self.reward_buffer[index] = obs_tuple[2] + self.next_state_buffer[index] = obs_tuple[3] + + self.buffer_counter += 1 + + # Eager execution is turned on by default in TensorFlow 2. Decorating with tf.function allows + # TensorFlow to build a static graph out of the logic and computations in our function. + # This provides a large speed up for blocks of code that contain many small TensorFlow operations such as this one. + @tf.function + def update( + self, + state_batch, + action_batch, + reward_batch, + next_state_batch, + ): + # Training and updating Actor & Critic networks. + # See Pseudo Code. + with tf.GradientTape() as tape: + target_actions = target_actor(next_state_batch, training=True) + y = reward_batch + gamma * target_critic( + [next_state_batch, target_actions], training=True + ) + critic_value = critic_model([state_batch, action_batch], training=True) + critic_loss = keras.ops.mean(keras.ops.square(y - critic_value)) + + critic_grad = tape.gradient(critic_loss, critic_model.trainable_variables) + critic_optimizer.apply_gradients( + zip(critic_grad, critic_model.trainable_variables) + ) + + with tf.GradientTape() as tape: + actions = actor_model(state_batch, training=True) + critic_value = critic_model([state_batch, actions], training=True) + # Used `-value` as we want to maximize the value given + # by the critic for our actions + actor_loss = -keras.ops.mean(critic_value) + + actor_grad = tape.gradient(actor_loss, actor_model.trainable_variables) + actor_optimizer.apply_gradients( + zip(actor_grad, actor_model.trainable_variables) + ) + + # We compute the loss and update parameters + def learn(self): + # Get sampling range + record_range = min(self.buffer_counter, self.buffer_capacity) + # Randomly sample indices + batch_indices = np.random.choice(record_range, self.batch_size) + + # Convert to tensors + state_batch = keras.ops.convert_to_tensor(self.state_buffer[batch_indices]) + action_batch = keras.ops.convert_to_tensor(self.action_buffer[batch_indices]) + reward_batch = keras.ops.convert_to_tensor(self.reward_buffer[batch_indices]) + reward_batch = keras.ops.cast(reward_batch, dtype="float32") + next_state_batch = keras.ops.convert_to_tensor( + self.next_state_buffer[batch_indices] + ) + + self.update(state_batch, action_batch, reward_batch, next_state_batch) + + +# This update target parameters slowly +# Based on rate `tau`, which is much less than one. +def update_target(target, original, tau): + target_weights = target.get_weights() + original_weights = original.get_weights() + + for i in range(len(target_weights)): + target_weights[i] = original_weights[i] * tau + target_weights[i] * (1 - tau) + + target.set_weights(target_weights) + + +""" +Here we define the Actor and Critic networks. These are basic Dense models +with `ReLU` activation. 
+ +Note: We need the initialization for last layer of the Actor to be between +`-0.003` and `0.003` as this prevents us from getting `1` or `-1` output values in +the initial stages, which would squash our gradients to zero, +as we use the `tanh` activation. +""" + + +def get_actor(): + # Initialize weights between -3e-3 and 3-e3 + last_init = keras.initializers.RandomUniform(minval=-0.003, maxval=0.003) + + inputs = layers.Input(shape=(num_states,)) + out = layers.Dense(256, activation="relu")(inputs) + out = layers.Dense(256, activation="relu")(out) + outputs = layers.Dense(1, activation="tanh", kernel_initializer=last_init)(out) + + # Our upper bound is 2.0 for Pendulum. + outputs = outputs * upper_bound + model = keras.Model(inputs, outputs) + return model + + +def get_critic(): + # State as input + state_input = layers.Input(shape=(num_states,)) + state_out = layers.Dense(16, activation="relu")(state_input) + state_out = layers.Dense(32, activation="relu")(state_out) + + # Action as input + action_input = layers.Input(shape=(num_actions,)) + action_out = layers.Dense(32, activation="relu")(action_input) + + # Both are passed through separate layer before concatenating + concat = layers.Concatenate()([state_out, action_out]) + + out = layers.Dense(256, activation="relu")(concat) + out = layers.Dense(256, activation="relu")(out) + outputs = layers.Dense(1)(out) + + # Outputs single value for give state-action + model = keras.Model([state_input, action_input], outputs) + + return model + + +""" +`policy()` returns an action sampled from our Actor network plus some noise for +exploration. +""" + + +def policy(state, noise_object): + sampled_actions = keras.ops.squeeze(actor_model(state)) + noise = noise_object() + # Adding noise to action + sampled_actions = sampled_actions.numpy() + noise + + # We make sure action is within bounds + legal_action = np.clip(sampled_actions, lower_bound, upper_bound) + + return [np.squeeze(legal_action)] + + +""" +## Training hyperparameters +""" + +std_dev = 0.2 +ou_noise = OUActionNoise(mean=np.zeros(1), std_deviation=float(std_dev) * np.ones(1)) + +actor_model = get_actor() +critic_model = get_critic() + +target_actor = get_actor() +target_critic = get_critic() + +# Making the weights equal initially +target_actor.set_weights(actor_model.get_weights()) +target_critic.set_weights(critic_model.get_weights()) + +# Learning rate for actor-critic models +critic_lr = 0.002 +actor_lr = 0.001 + +critic_optimizer = keras.optimizers.Adam(critic_lr) +actor_optimizer = keras.optimizers.Adam(actor_lr) + +total_episodes = 100 +# Discount factor for future rewards +gamma = 0.99 +# Used to update target networks +tau = 0.005 + +buffer = Buffer(50000, 64) + +""" +Now we implement our main training loop, and iterate over episodes. +We sample actions using `policy()` and train with `learn()` at each time step, +along with updating the Target networks at a rate `tau`. +""" + +# To store reward history of each episode +ep_reward_list = [] +# To store average reward history of last few episodes +avg_reward_list = [] + +# Takes about 4 min to train +for ep in range(total_episodes): + prev_state, _ = env.reset() + episodic_reward = 0 + + while True: + tf_prev_state = keras.ops.expand_dims( + keras.ops.convert_to_tensor(prev_state), 0 + ) + + action = policy(tf_prev_state, ou_noise) + # Receive state and reward from environment. 
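+        # Gymnasium's `env.step()` returns (obs, reward, terminated, truncated, info);
+        # the `done` variable below corresponds to the `terminated` flag.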
+ state, reward, done, truncated, _ = env.step(action) + + buffer.record((prev_state, action, reward, state)) + episodic_reward += reward + + buffer.learn() + + update_target(target_actor, actor_model, tau) + update_target(target_critic, critic_model, tau) + + # End this episode when `done` or `truncated` is True + if done or truncated: + break + + prev_state = state + + ep_reward_list.append(episodic_reward) + + # Mean of last 40 episodes + avg_reward = np.mean(ep_reward_list[-40:]) + print("Episode * {} * Avg Reward is ==> {}".format(ep, avg_reward)) + avg_reward_list.append(avg_reward) + +# Plotting graph +# Episodes versus Avg. Rewards +plt.plot(avg_reward_list) +plt.xlabel("Episode") +plt.ylabel("Avg. Episodic Reward") +plt.show() + +""" +If training proceeds correctly, the average episodic reward will increase with time. + +Feel free to try different learning rates, `tau` values, and architectures for the +Actor and Critic networks. + +The Inverted Pendulum problem has low complexity, but DDPG work great on many other +problems. + +Another great environment to try this on is `LunarLander-v2` continuous, but it will take +more episodes to obtain good results. +""" + +# Save the weights +actor_model.save_weights("pendulum_actor.weights.h5") +critic_model.save_weights("pendulum_critic.weights.h5") + +target_actor.save_weights("pendulum_target_actor.weights.h5") +target_critic.save_weights("pendulum_target_critic.weights.h5") + +""" +Before Training: + +![before_img](https://i.imgur.com/ox6b9rC.gif) +""" + +""" +After 100 episodes: + +![after_img](https://i.imgur.com/eEH8Cz6.gif) +""" diff --git a/knowledge_base/rl/deep_q_network_breakout.py b/knowledge_base/rl/deep_q_network_breakout.py new file mode 100644 index 0000000000000000000000000000000000000000..d2d70afe4280af0887df868dd14ead9871d96af4 --- /dev/null +++ b/knowledge_base/rl/deep_q_network_breakout.py @@ -0,0 +1,285 @@ +""" +Title: Deep Q-Learning for Atari Breakout +Author: [Jacob Chapman](https://twitter.com/jacoblchapman) and [Mathias Lechner](https://twitter.com/MLech20) +Date created: 2020/05/23 +Last modified: 2024/03/17 +Description: Play Atari Breakout with a Deep Q-Network. +Accelerator: None +""" + +""" +## Introduction + +This script shows an implementation of Deep Q-Learning on the +`BreakoutNoFrameskip-v4` environment. + +### Deep Q-Learning + +As an agent takes actions and moves through an environment, it learns to map +the observed state of the environment to an action. An agent will choose an action +in a given state based on a "Q-value", which is a weighted reward based on the +expected highest long-term reward. A Q-Learning Agent learns to perform its +task such that the recommended action maximizes the potential future rewards. +This method is considered an "Off-Policy" method, +meaning its Q values are updated assuming that the best action was chosen, even +if the best action was not chosen. + +### Atari Breakout + +In this environment, a board moves along the bottom of the screen returning a ball that +will destroy blocks at the top of the screen. +The aim of the game is to remove all blocks and breakout of the +level. The agent must learn to control the board by moving left and right, returning the +ball and removing all the blocks without the ball passing the board. + +### Note + +The Deepmind paper trained for "a total of 50 million frames (that is, around 38 days of +game experience in total)". 
However this script will give good results at around 10 +million frames which are processed in less than 24 hours on a modern machine. + +You can control the number of episodes by setting the `max_episodes` variable +to a value greater than 0. + +### References + +- [Q-Learning](https://link.springer.com/content/pdf/10.1007/BF00992698.pdf) +- [Deep Q-Learning](https://www.semanticscholar.org/paper/Human-level-control-through-deep-reinforcement-Mnih-Kavukcuoglu/340f48901f72278f6bf78a04ee5b01df208cc508) +""" +""" +## Setup +""" + +import os + +os.environ["KERAS_BACKEND"] = "tensorflow" + +import keras +from keras import layers + +import gymnasium as gym +from gymnasium.wrappers import AtariPreprocessing, FrameStack +import numpy as np +import tensorflow as tf + +# Configuration parameters for the whole setup +seed = 42 +gamma = 0.99 # Discount factor for past rewards +epsilon = 1.0 # Epsilon greedy parameter +epsilon_min = 0.1 # Minimum epsilon greedy parameter +epsilon_max = 1.0 # Maximum epsilon greedy parameter +epsilon_interval = ( + epsilon_max - epsilon_min +) # Rate at which to reduce chance of random action being taken +batch_size = 32 # Size of batch taken from replay buffer +max_steps_per_episode = 10000 +max_episodes = 10 # Limit training episodes, will run until solved if smaller than 1 + +# Use the Atari environment +# Specify the `render_mode` parameter to show the attempts of the agent in a pop up window. +env = gym.make("BreakoutNoFrameskip-v4") # , render_mode="human") +# Environment preprocessing +env = AtariPreprocessing(env) +# Stack four frames +env = FrameStack(env, 4) +env.seed(seed) +""" +## Implement the Deep Q-Network + +This network learns an approximation of the Q-table, which is a mapping between +the states and actions that an agent will take. For every state we'll have four +actions, that can be taken. The environment provides the state, and the action +is chosen by selecting the larger of the four Q-values predicted in the output layer. + +""" + +num_actions = 4 + + +def create_q_model(): + # Network defined by the Deepmind paper + return keras.Sequential( + [ + layers.Lambda( + lambda tensor: keras.ops.transpose(tensor, [0, 2, 3, 1]), + output_shape=(84, 84, 4), + input_shape=(4, 84, 84), + ), + # Convolutions on the frames on the screen + layers.Conv2D(32, 8, strides=4, activation="relu", input_shape=(4, 84, 84)), + layers.Conv2D(64, 4, strides=2, activation="relu"), + layers.Conv2D(64, 3, strides=1, activation="relu"), + layers.Flatten(), + layers.Dense(512, activation="relu"), + layers.Dense(num_actions, activation="linear"), + ] + ) + + +# The first model makes the predictions for Q-values which are used to +# make a action. +model = create_q_model() +# Build a target model for the prediction of future rewards. +# The weights of a target model get updated every 10000 steps thus when the +# loss between the Q-values is calculated the target Q-value is stable. 
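+# (This sync happens in the training loop below, every `update_target_network` frames.)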
+model_target = create_q_model() + + +""" +## Train +""" +# In the Deepmind paper they use RMSProp however then Adam optimizer +# improves training time +optimizer = keras.optimizers.Adam(learning_rate=0.00025, clipnorm=1.0) + +# Experience replay buffers +action_history = [] +state_history = [] +state_next_history = [] +rewards_history = [] +done_history = [] +episode_reward_history = [] +running_reward = 0 +episode_count = 0 +frame_count = 0 +# Number of frames to take random action and observe output +epsilon_random_frames = 50000 +# Number of frames for exploration +epsilon_greedy_frames = 1000000.0 +# Maximum replay length +# Note: The Deepmind paper suggests 1000000 however this causes memory issues +max_memory_length = 100000 +# Train the model after 4 actions +update_after_actions = 4 +# How often to update the target network +update_target_network = 10000 +# Using huber loss for stability +loss_function = keras.losses.Huber() + +while True: + observation, _ = env.reset() + state = np.array(observation) + episode_reward = 0 + + for timestep in range(1, max_steps_per_episode): + frame_count += 1 + + # Use epsilon-greedy for exploration + if frame_count < epsilon_random_frames or epsilon > np.random.rand(1)[0]: + # Take random action + action = np.random.choice(num_actions) + else: + # Predict action Q-values + # From environment state + state_tensor = keras.ops.convert_to_tensor(state) + state_tensor = keras.ops.expand_dims(state_tensor, 0) + action_probs = model(state_tensor, training=False) + # Take best action + action = keras.ops.argmax(action_probs[0]).numpy() + + # Decay probability of taking random action + epsilon -= epsilon_interval / epsilon_greedy_frames + epsilon = max(epsilon, epsilon_min) + + # Apply the sampled action in our environment + state_next, reward, done, _, _ = env.step(action) + state_next = np.array(state_next) + + episode_reward += reward + + # Save actions and states in replay buffer + action_history.append(action) + state_history.append(state) + state_next_history.append(state_next) + done_history.append(done) + rewards_history.append(reward) + state = state_next + + # Update every fourth frame and once batch size is over 32 + if frame_count % update_after_actions == 0 and len(done_history) > batch_size: + # Get indices of samples for replay buffers + indices = np.random.choice(range(len(done_history)), size=batch_size) + + # Using list comprehension to sample from replay buffer + state_sample = np.array([state_history[i] for i in indices]) + state_next_sample = np.array([state_next_history[i] for i in indices]) + rewards_sample = [rewards_history[i] for i in indices] + action_sample = [action_history[i] for i in indices] + done_sample = keras.ops.convert_to_tensor( + [float(done_history[i]) for i in indices] + ) + + # Build the updated Q-values for the sampled future states + # Use the target model for stability + future_rewards = model_target.predict(state_next_sample) + # Q value = reward + discount factor * expected future reward + updated_q_values = rewards_sample + gamma * keras.ops.amax( + future_rewards, axis=1 + ) + + # If final frame set the last value to -1 + updated_q_values = updated_q_values * (1 - done_sample) - done_sample + + # Create a mask so we only calculate loss on the updated Q-values + masks = keras.ops.one_hot(action_sample, num_actions) + + with tf.GradientTape() as tape: + # Train the model on the states and updated Q-values + q_values = model(state_sample) + + # Apply the masks to the Q-values to get the Q-value for action 
taken + q_action = keras.ops.sum(keras.ops.multiply(q_values, masks), axis=1) + # Calculate loss between new Q-value and old Q-value + loss = loss_function(updated_q_values, q_action) + + # Backpropagation + grads = tape.gradient(loss, model.trainable_variables) + optimizer.apply_gradients(zip(grads, model.trainable_variables)) + + if frame_count % update_target_network == 0: + # update the the target network with new weights + model_target.set_weights(model.get_weights()) + # Log details + template = "running reward: {:.2f} at episode {}, frame count {}" + print(template.format(running_reward, episode_count, frame_count)) + + # Limit the state and reward history + if len(rewards_history) > max_memory_length: + del rewards_history[:1] + del state_history[:1] + del state_next_history[:1] + del action_history[:1] + del done_history[:1] + + if done: + break + + # Update running reward to check condition for solving + episode_reward_history.append(episode_reward) + if len(episode_reward_history) > 100: + del episode_reward_history[:1] + running_reward = np.mean(episode_reward_history) + + episode_count += 1 + + if running_reward > 40: # Condition to consider the task solved + print("Solved at episode {}!".format(episode_count)) + break + + if ( + max_episodes > 0 and episode_count >= max_episodes + ): # Maximum number of episodes reached + print("Stopped at episode {}!".format(episode_count)) + break + +""" +## Visualizations +Before any training: +![Imgur](https://i.imgur.com/rRxXF4H.gif) + +In early stages of training: +![Imgur](https://i.imgur.com/X8ghdpL.gif) + +In later stages of training: +![Imgur](https://i.imgur.com/Z1K6qBQ.gif) +""" diff --git a/knowledge_base/rl/ppo_cartpole.py b/knowledge_base/rl/ppo_cartpole.py new file mode 100644 index 0000000000000000000000000000000000000000..815127c2b917d3c4aa7d4f9aa54330e262f1c184 --- /dev/null +++ b/knowledge_base/rl/ppo_cartpole.py @@ -0,0 +1,340 @@ +""" +Title: Proximal Policy Optimization +Author: [Ilias Chrysovergis](https://twitter.com/iliachry) +Date created: 2021/06/24 +Last modified: 2024/03/12 +Description: Implementation of a Proximal Policy Optimization agent for the CartPole-v1 environment. +Accelerator: None +""" + +""" +## Introduction + +This code example solves the CartPole-v1 environment using a Proximal Policy Optimization (PPO) agent. + +### CartPole-v1 + +A pole is attached by an un-actuated joint to a cart, which moves along a frictionless track. +The system is controlled by applying a force of +1 or -1 to the cart. +The pendulum starts upright, and the goal is to prevent it from falling over. +A reward of +1 is provided for every timestep that the pole remains upright. +The episode ends when the pole is more than 15 degrees from vertical, or the cart moves more than 2.4 units from the center. +After 200 steps the episode ends. Thus, the highest return we can get is equal to 200. + +[CartPole-v1](https://gymnasium.farama.org/environments/classic_control/cart_pole/) + +### Proximal Policy Optimization + +PPO is a policy gradient method and can be used for environments with either discrete or continuous action spaces. +It trains a stochastic policy in an on-policy way. Also, it utilizes the actor critic method. The actor maps the +observation to an action and the critic gives an expectation of the rewards of the agent for the observation given. +Firstly, it collects a set of trajectories for each epoch by sampling from the latest version of the stochastic policy. 
+Then, the rewards-to-go and the advantage estimates are computed in order to update the policy and fit the value function. +The policy is updated via a stochastic gradient ascent optimizer, while the value function is fitted via some gradient descent algorithm. +This procedure is applied for many epochs until the environment is solved. + +![Algorithm](https://i.imgur.com/rd5tda1.png) + +- [Proximal Policy Optimization Algorithms](https://arxiv.org/abs/1707.06347) +- [OpenAI Spinning Up docs - PPO](https://spinningup.openai.com/en/latest/algorithms/ppo.html) + +### Note + +This code example uses Keras and Tensorflow v2. It is based on the PPO Original Paper, +the OpenAI's Spinning Up docs for PPO, and the OpenAI's Spinning Up implementation of PPO using Tensorflow v1. + +[OpenAI Spinning Up Github - PPO](https://github.com/openai/spinningup/blob/master/spinup/algos/tf1/ppo/ppo.py) +""" + +""" +## Libraries + +For this example the following libraries are used: + +1. `numpy` for n-dimensional arrays +2. `tensorflow` and `keras` for building the deep RL PPO agent +3. `gymnasium` for getting everything we need about the environment +4. `scipy.signal` for calculating the discounted cumulative sums of vectors +""" +import os + +os.environ["KERAS_BACKEND"] = "tensorflow" + +import keras +from keras import layers + +import numpy as np +import tensorflow as tf +import gymnasium as gym +import scipy.signal + +""" +## Functions and class +""" + + +def discounted_cumulative_sums(x, discount): + # Discounted cumulative sums of vectors for computing rewards-to-go and advantage estimates + return scipy.signal.lfilter([1], [1, float(-discount)], x[::-1], axis=0)[::-1] + + +class Buffer: + # Buffer for storing trajectories + def __init__(self, observation_dimensions, size, gamma=0.99, lam=0.95): + # Buffer initialization + self.observation_buffer = np.zeros( + (size, observation_dimensions), dtype=np.float32 + ) + self.action_buffer = np.zeros(size, dtype=np.int32) + self.advantage_buffer = np.zeros(size, dtype=np.float32) + self.reward_buffer = np.zeros(size, dtype=np.float32) + self.return_buffer = np.zeros(size, dtype=np.float32) + self.value_buffer = np.zeros(size, dtype=np.float32) + self.logprobability_buffer = np.zeros(size, dtype=np.float32) + self.gamma, self.lam = gamma, lam + self.pointer, self.trajectory_start_index = 0, 0 + + def store(self, observation, action, reward, value, logprobability): + # Append one step of agent-environment interaction + self.observation_buffer[self.pointer] = observation + self.action_buffer[self.pointer] = action + self.reward_buffer[self.pointer] = reward + self.value_buffer[self.pointer] = value + self.logprobability_buffer[self.pointer] = logprobability + self.pointer += 1 + + def finish_trajectory(self, last_value=0): + # Finish the trajectory by computing advantage estimates and rewards-to-go + path_slice = slice(self.trajectory_start_index, self.pointer) + rewards = np.append(self.reward_buffer[path_slice], last_value) + values = np.append(self.value_buffer[path_slice], last_value) + + deltas = rewards[:-1] + self.gamma * values[1:] - values[:-1] + + self.advantage_buffer[path_slice] = discounted_cumulative_sums( + deltas, self.gamma * self.lam + ) + self.return_buffer[path_slice] = discounted_cumulative_sums( + rewards, self.gamma + )[:-1] + + self.trajectory_start_index = self.pointer + + def get(self): + # Get all data of the buffer and normalize the advantages + self.pointer, self.trajectory_start_index = 0, 0 + advantage_mean, advantage_std = ( + 
np.mean(self.advantage_buffer), + np.std(self.advantage_buffer), + ) + self.advantage_buffer = (self.advantage_buffer - advantage_mean) / advantage_std + return ( + self.observation_buffer, + self.action_buffer, + self.advantage_buffer, + self.return_buffer, + self.logprobability_buffer, + ) + + +def mlp(x, sizes, activation=keras.activations.tanh, output_activation=None): + # Build a feedforward neural network + for size in sizes[:-1]: + x = layers.Dense(units=size, activation=activation)(x) + return layers.Dense(units=sizes[-1], activation=output_activation)(x) + + +def logprobabilities(logits, a): + # Compute the log-probabilities of taking actions a by using the logits (i.e. the output of the actor) + logprobabilities_all = keras.ops.log_softmax(logits) + logprobability = keras.ops.sum( + keras.ops.one_hot(a, num_actions) * logprobabilities_all, axis=1 + ) + return logprobability + + +seed_generator = keras.random.SeedGenerator(1337) + + +# Sample action from actor +@tf.function +def sample_action(observation): + logits = actor(observation) + action = keras.ops.squeeze( + keras.random.categorical(logits, 1, seed=seed_generator), axis=1 + ) + return logits, action + + +# Train the policy by maxizing the PPO-Clip objective +@tf.function +def train_policy( + observation_buffer, action_buffer, logprobability_buffer, advantage_buffer +): + with tf.GradientTape() as tape: # Record operations for automatic differentiation. + ratio = keras.ops.exp( + logprobabilities(actor(observation_buffer), action_buffer) + - logprobability_buffer + ) + min_advantage = keras.ops.where( + advantage_buffer > 0, + (1 + clip_ratio) * advantage_buffer, + (1 - clip_ratio) * advantage_buffer, + ) + + policy_loss = -keras.ops.mean( + keras.ops.minimum(ratio * advantage_buffer, min_advantage) + ) + policy_grads = tape.gradient(policy_loss, actor.trainable_variables) + policy_optimizer.apply_gradients(zip(policy_grads, actor.trainable_variables)) + + kl = keras.ops.mean( + logprobability_buffer + - logprobabilities(actor(observation_buffer), action_buffer) + ) + kl = keras.ops.sum(kl) + return kl + + +# Train the value function by regression on mean-squared error +@tf.function +def train_value_function(observation_buffer, return_buffer): + with tf.GradientTape() as tape: # Record operations for automatic differentiation. 
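+        # Mean-squared error between the predicted values and the rewards-to-go.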
+ value_loss = keras.ops.mean((return_buffer - critic(observation_buffer)) ** 2) + value_grads = tape.gradient(value_loss, critic.trainable_variables) + value_optimizer.apply_gradients(zip(value_grads, critic.trainable_variables)) + + +""" +## Hyperparameters +""" + +# Hyperparameters of the PPO algorithm +steps_per_epoch = 4000 +epochs = 30 +gamma = 0.99 +clip_ratio = 0.2 +policy_learning_rate = 3e-4 +value_function_learning_rate = 1e-3 +train_policy_iterations = 80 +train_value_iterations = 80 +lam = 0.97 +target_kl = 0.01 +hidden_sizes = (64, 64) + +# True if you want to render the environment +render = False + +""" +## Initializations +""" + +# Initialize the environment and get the dimensionality of the +# observation space and the number of possible actions +env = gym.make("CartPole-v1") +observation_dimensions = env.observation_space.shape[0] +num_actions = env.action_space.n + +# Initialize the buffer +buffer = Buffer(observation_dimensions, steps_per_epoch) + +# Initialize the actor and the critic as keras models +observation_input = keras.Input(shape=(observation_dimensions,), dtype="float32") +logits = mlp(observation_input, list(hidden_sizes) + [num_actions]) +actor = keras.Model(inputs=observation_input, outputs=logits) +value = keras.ops.squeeze(mlp(observation_input, list(hidden_sizes) + [1]), axis=1) +critic = keras.Model(inputs=observation_input, outputs=value) + +# Initialize the policy and the value function optimizers +policy_optimizer = keras.optimizers.Adam(learning_rate=policy_learning_rate) +value_optimizer = keras.optimizers.Adam(learning_rate=value_function_learning_rate) + +# Initialize the observation, episode return and episode length +observation, _ = env.reset() +episode_return, episode_length = 0, 0 + +""" +## Train +""" +# Iterate over the number of epochs +for epoch in range(epochs): + # Initialize the sum of the returns, lengths and number of episodes for each epoch + sum_return = 0 + sum_length = 0 + num_episodes = 0 + + # Iterate over the steps of each epoch + for t in range(steps_per_epoch): + if render: + env.render() + + # Get the logits, action, and take one step in the environment + observation = observation.reshape(1, -1) + logits, action = sample_action(observation) + observation_new, reward, done, _, _ = env.step(action[0].numpy()) + episode_return += reward + episode_length += 1 + + # Get the value and log-probability of the action + value_t = critic(observation) + logprobability_t = logprobabilities(logits, action) + + # Store obs, act, rew, v_t, logp_pi_t + buffer.store(observation, action, reward, value_t, logprobability_t) + + # Update the observation + observation = observation_new + + # Finish trajectory if reached to a terminal state + terminal = done + if terminal or (t == steps_per_epoch - 1): + last_value = 0 if done else critic(observation.reshape(1, -1)) + buffer.finish_trajectory(last_value) + sum_return += episode_return + sum_length += episode_length + num_episodes += 1 + observation, _ = env.reset() + episode_return, episode_length = 0, 0 + + # Get values from the buffer + ( + observation_buffer, + action_buffer, + advantage_buffer, + return_buffer, + logprobability_buffer, + ) = buffer.get() + + # Update the policy and implement early stopping using KL divergence + for _ in range(train_policy_iterations): + kl = train_policy( + observation_buffer, action_buffer, logprobability_buffer, advantage_buffer + ) + if kl > 1.5 * target_kl: + # Early Stopping + break + + # Update the value function + for _ in 
range(train_value_iterations): + train_value_function(observation_buffer, return_buffer) + + # Print mean return and length for each epoch + print( + f" Epoch: {epoch + 1}. Mean Return: {sum_return / num_episodes}. Mean Length: {sum_length / num_episodes}" + ) + +""" +## Visualizations + +Before training: + +![Imgur](https://i.imgur.com/rKXDoMC.gif) + +After 8 epochs of training: + +![Imgur](https://i.imgur.com/M0FbhF0.gif) + +After 20 epochs of training: + +![Imgur](https://i.imgur.com/tKhTEaF.gif) +""" diff --git a/knowledge_base/structured_data/class_with_grn_and_vsn_with_hyperparameters_tuning.py b/knowledge_base/structured_data/class_with_grn_and_vsn_with_hyperparameters_tuning.py new file mode 100644 index 0000000000000000000000000000000000000000..961dd8b2ad467f8b8b804d8649b11aaa2d5bf724 --- /dev/null +++ b/knowledge_base/structured_data/class_with_grn_and_vsn_with_hyperparameters_tuning.py @@ -0,0 +1,708 @@ +""" +Title: Classification with Gated Residual and Variable Selection Networks with HyperParameters tuning +Author: [Humbulani Ndou](https://github.com/Humbulani1234) +Date created: 2025/03/17 +Last modified: 2025/03/17 +Description: Gated Residual and Variable Selection Networks prediction with HyperParameters tuning. +Accelerator: GPU +""" + +""" +## Introduction + +The following example extends the script `structured_data/classification_with_grn_and_vsn.py` by incorporating hyperparameters tuning +using [Autokeras](https://github.com/keras-team/autokeras) and [KerasTuner](https://github.com/keras-team/keras-tuner). Specifics regarding +which APIs are used from the these two packages will be described in detail in the relevant code sections. + +This example demonstrates the use of Gated +Residual Networks (GRN) and Variable Selection Networks (VSN), proposed by +Bryan Lim et al. in +[Temporal Fusion Transformers (TFT) for Interpretable Multi-horizon Time Series Forecasting](https://arxiv.org/abs/1912.09363), +for structured data classification. GRNs give the flexibility to the model to apply +non-linear processing only where needed. VSNs allow the model to softly remove any +unnecessary noisy inputs which could negatively impact performance. +Together, those techniques help improving the learning capacity of deep neural +network models. + +Note that this example implements only the GRN and VSN components described in +in the paper, rather than the whole TFT model, as GRN and VSN can be useful on +their own for structured data learning tasks. + + +To run the code you need to use TensorFlow 2.3 or higher. +""" + +""" +## The dataset + +[Our dataset](https://archive.ics.uci.edu/ml/datasets/heart+Disease) is provided by the +Cleveland Clinic Foundation for Heart Disease. +It's a CSV file with 303 rows. Each row contains information about a patient (a +**sample**), and each column describes an attribute of the patient (a **feature**). We +use the features to predict whether a patient has a heart disease (**binary +classification**). 
+ +Here's the description of each feature: + +Column| Description| Feature Type +------------|--------------------|---------------------- +Age | Age in years | Numerical +Sex | (1 = male; 0 = female) | Categorical +CP | Chest pain type (0, 1, 2, 3, 4) | Categorical +Trestbpd | Resting blood pressure (in mm Hg on admission) | Numerical +Chol | Serum cholesterol in mg/dl | Numerical +FBS | fasting blood sugar in 120 mg/dl (1 = true; 0 = false) | Categorical +RestECG | Resting electrocardiogram results (0, 1, 2) | Categorical +Thalach | Maximum heart rate achieved | Numerical +Exang | Exercise induced angina (1 = yes; 0 = no) | Categorical +Oldpeak | ST depression induced by exercise relative to rest | Numerical +Slope | Slope of the peak exercise ST segment | Numerical +CA | Number of major vessels (0-3) colored by fluoroscopy | Both numerical & categorical +Thal | 3 = normal; 6 = fixed defect; 7 = reversible defect | Categorical +Target | Diagnosis of heart disease (1 = true; 0 = false) | Target +""" + +""" +## Setup +""" + + +import os +import subprocess +import tarfile +import numpy as np +import pandas as pd +import tree +from typing import Optional, Union + +os.environ["KERAS_BACKEND"] = "tensorflow" # or jax, or torch + +# Keras imports +import keras +from keras import layers + +# KerasTuner imports +import keras_tuner +from keras_tuner import HyperParameters + +# AutoKeras imports +import autokeras as ak +from autokeras.utils import utils, types + + +""" +## Preparing the data + +Let's download the data and load it into a Pandas dataframe: +""" + +file_url = "http://storage.googleapis.com/download.tensorflow.org/data/heart.csv" +dataframe = pd.read_csv(file_url) + +""" +The dataset includes 303 samples with 14 columns per sample (13 features, plus the target +label): +""" + +dataframe.shape + +""" +Here's a preview of a few samples: +""" + +dataframe.head() + +""" +The last column, "target", indicates whether the patient has a heart disease (1) or not +(0). + +Let's split the data into a training and validation set: +""" + +val_dataframe = dataframe.sample(frac=0.2, random_state=1337) +train_dataframe = dataframe.drop(val_dataframe.index) + +print( + f"Using {len(train_dataframe)} samples for training " + f"and {len(val_dataframe)} for validation" +) + + +""" +## Define dataset metadata + +Here, we define the metadata of the dataset that will be useful for reading and +parsing the data into input features, and encoding the input features with respect +to their types. +""" + +COLUMN_NAMES = [ + "age", + "sex", + "cp", + "trestbps", + "chol", + "fbs", + "restecg", + "thalach", + "exang", + "oldpeak", + "slope", + "ca", + "thal", + "target", +] +# Target feature name. +TARGET_FEATURE_NAME = "target" +# Numeric feature names. +NUMERIC_FEATURE_NAMES = ["age", "trestbps", "thalach", "oldpeak", "slope", "chol"] +# Categorical features and their vocabulary lists. +# Note that we add 'v=' as a prefix to all categorical feature values to make +# sure that they are treated as strings. + +CATEGORICAL_FEATURES_WITH_VOCABULARY = { + feature_name: sorted( + [ + # Integer categorcal must be int and string must be str + value if dataframe[feature_name].dtype == "int64" else str(value) + for value in list(dataframe[feature_name].unique()) + ] + ) + for feature_name in COLUMN_NAMES + if feature_name not in list(NUMERIC_FEATURE_NAMES + [TARGET_FEATURE_NAME]) +} +# All features names. 
+FEATURE_NAMES = NUMERIC_FEATURE_NAMES + list( + CATEGORICAL_FEATURES_WITH_VOCABULARY.keys() +) + + +""" +## Feature preprocessing with Keras layers + + +The following features are categorical features encoded as integers: + +- `sex` +- `cp` +- `fbs` +- `restecg` +- `exang` +- `ca` + +We will encode these features using **one-hot encoding**. We have two options +here: + + - Use `CategoryEncoding()`, which requires knowing the range of input values + and will error on input outside the range. + - Use `IntegerLookup()` which will build a lookup table for inputs and reserve + an output index for unkown input values. + +For this example, we want a simple solution that will handle out of range inputs +at inference, so we will use `IntegerLookup()`. + +We also have a categorical feature encoded as a string: `thal`. We will create an +index of all possible features and encode output using the `StringLookup()` layer. + +Finally, the following feature are continuous numerical features: + +- `age` +- `trestbps` +- `chol` +- `thalach` +- `oldpeak` +- `slope` + +For each of these features, we will use a `Normalization()` layer to make sure the mean +of each feature is 0 and its standard deviation is 1. + +Below, we define a utility function to do the operations: + +- `process` to one-hot encode string or integer categorical features. +""" + +# Tensorflow required for tf.data.Dataset +import tensorflow as tf + + +# We process our datasets elements here (categorical) and convert them to indices to avoid this step +# during model training since only tensorflow support strings. +def encode_categorical(features, target): + for f in features: + if f in CATEGORICAL_FEATURES_WITH_VOCABULARY: + # Create a lookup to convert a string values to an integer indices. + # Since we are not using a mask token nor expecting any out of vocabulary + # (oov) token, we set mask_token to None and num_oov_indices to 0. + cls = ( + layers.StringLookup + if features[f].dtype == "string" + else layers.IntegerLookup + ) + features[f] = cls( + vocabulary=CATEGORICAL_FEATURES_WITH_VOCABULARY[f], + mask_token=None, + num_oov_indices=0, + output_mode="binary", + )(features[f]) + + # Change features from OrderedDict to Dict to match Inputs as they are Dict. + return dict(features), target + + +""" +Let's generate `tf.data.Dataset` objects for each dataframe: +""" + + +def dataframe_to_dataset(dataframe): + dataframe = dataframe.copy() + labels = dataframe.pop("target") + ds = ( + tf.data.Dataset.from_tensor_slices((dict(dataframe), labels)) + .map(encode_categorical) + .shuffle(buffer_size=len(dataframe)) + ) + return ds + + +train_ds = dataframe_to_dataset(train_dataframe) +val_ds = dataframe_to_dataset(val_dataframe) + + +""" +Each `Dataset` yields a tuple `(input, target)` where `input` is a dictionary of features +and `target` is the value `0` or `1`: +""" + +for x, y in train_ds.take(1): + print("Input:", x) + print("Target:", y) + +""" +Let's batch the datasets: +""" + +train_ds = train_ds.batch(32) +val_ds = val_ds.batch(32) + +""" +## Subclassing Autokeras Graph + +Here we subclass the Autokeras `Graph` + +- `build`: we override this method to be able to handle model `Inputs` passed +as dictionaries. 
In structured data analysis Inputs are normally passed as +dictionaries for each feature of interest + +""" + + +class Graph(ak.graph.Graph): + + def build(self, hp): + """Build the HyperModel into a Keras Model.""" + keras_nodes = {} + keras_input_nodes = [] + for node in self.inputs: + node_id = self._node_to_id[node] + input_node = node.build_node(hp) + output_node = node.build(hp, input_node) + keras_input_nodes.append(input_node) + keras_nodes[node_id] = output_node + for block in self.blocks: + temp_inputs = ( + { + n.name: keras_nodes[self._node_to_id[n]] + for n in block.inputs + if isinstance(n, ak.Input) + } + if isinstance(block.inputs[0], ak.Input) + else [keras_nodes[self._node_to_id[n]] for n in block.inputs] + ) + outputs = tree.flatten(block.build(hp, inputs=temp_inputs)) + for n, o in zip(block.outputs, outputs): + keras_nodes[self._node_to_id[n]] = o + model = keras.models.Model( + keras_input_nodes, + [ + keras_nodes[self._node_to_id[output_node]] + for output_node in self.outputs + ], + ) + return self._compile_keras_model(hp, model) + + def _compile_keras_model(self, hp, model): + # Specify hyperparameters from compile(...) + optimizer_name = hp.Choice( + "optimizer", + ["adam", "sgd"], + default="adam", + ) + learning_rate = hp.Choice( + "learning_rate", [1e-1, 1e-2, 1e-3, 1e-4, 2e-5, 1e-5], default=1e-3 + ) + if optimizer_name == "adam": + optimizer = keras.optimizers.Adam(learning_rate=learning_rate) + elif optimizer_name == "sgd": + optimizer = keras.optimizers.SGD(learning_rate=learning_rate) + model.compile( + optimizer=optimizer, + metrics=self._get_metrics(), + loss=self._get_loss(), + ) + return model + + +""" + +## Subclassing Autokeras `Input` + +Here we subclass the Autokeras Input node object and override the dtype attribute +from None to a user supplied value. We also override the `build_node` method to +use user supplied name for Inputs layers. + +""" + + +class Input(ak.Input): + def __init__(self, dtype, name=None, **kwargs): + super().__init__(name=name, **kwargs) + # Override dtype to a user dtype value + self.dtype = dtype + self.name = name + + def build_node(self, hp): + return keras.Input(name=self.name, shape=self.shape, dtype=self.dtype) + + +""" + +## Subclassing ClassificationHead + +Here we subclass Autokeras ClassificationHead and override the __init__ method, and +we add the method `get_expected_shape` to infer the labels shape. +We remove the preprocessing fuctionality as we prefer to conduct such manually. +""" + + +class ClassifierHead(ak.ClassificationHead): + + def __init__( + self, + num_classes: Optional[int] = None, + multi_label: bool = False, + loss: Optional[types.LossType] = None, + metrics: Optional[types.MetricsType] = None, + dropout: Optional[float] = None, + **kwargs, + ): + self.num_classes = num_classes + self.multi_label = multi_label + self.dropout = dropout + if metrics is None: + metrics = ["accuracy"] + if loss is None: + loss = self.infer_loss() + ak.Head.__init__(self, loss=loss, metrics=metrics, **kwargs) + self.shape = self.get_expected_shape() + + def get_expected_shape(self): + # Compute expected shape from num_classes. 
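+        # Binary single-label classification is modeled with one output unit;
+        # otherwise, one output unit per class is expected.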
+        if self.num_classes == 2 and not self.multi_label:
+            return [1]
+        return [self.num_classes]
+
+
+"""
+## GatedLinearUnit Layer
+
+This is a Keras layer defined in the script `structured_data/classification_with_grn_and_vsn.py`.
+More details about this layer may be found in that script.
+
+"""
+
+
+class GatedLinearUnit(layers.Layer):
+    def __init__(self, num_units, activation, **kwargs):
+        super().__init__(**kwargs)
+        self.linear = layers.Dense(num_units)
+        self.sigmoid = layers.Dense(num_units, activation=activation)
+
+    def call(self, inputs):
+        return self.linear(inputs) * self.sigmoid(inputs)
+
+    def build(self):
+        self.built = True
+
+
+"""
+## GatedResidualNetwork Layer
+
+This is a Keras layer defined in the script `structured_data/classification_with_grn_and_vsn.py`.
+More details about this layer may be found in that script.
+
+"""
+
+
+class GatedResidualNetwork(layers.Layer):
+
+    def __init__(
+        self, num_units, dropout_rate, activation, use_layernorm=None, **kwargs
+    ):
+        super().__init__(**kwargs)
+        self.num_units = num_units
+        self.use_layernorm = use_layernorm
+        self.elu_dense = layers.Dense(num_units, activation=activation)
+        self.linear_dense = layers.Dense(num_units)
+        self.dropout = layers.Dropout(dropout_rate)
+        self.gated_linear_unit = GatedLinearUnit(num_units, activation)
+        self.layer_norm = layers.LayerNormalization()
+        self.project = layers.Dense(num_units)
+
+    def call(self, inputs, hp):
+        x = self.elu_dense(inputs)
+        x = self.linear_dense(x)
+        x = self.dropout(x)
+        if inputs.shape[-1] != self.num_units:
+            inputs = self.project(inputs)
+        x = inputs + self.gated_linear_unit(x)
+        use_layernorm = self.use_layernorm
+        if use_layernorm is None:
+            use_layernorm = hp.Boolean("use_layernorm", default=True)
+        if use_layernorm:
+            x = self.layer_norm(x)
+        return x
+
+    def build(self):
+        self.built = True
+
+
+"""
+## Building the Autokeras `VariableSelection` Block
+
+We have converted the following Keras layer into an Autokeras Block in order to include
+hyperparameters to tune. Refer to the Autokeras blocks API for writing custom Blocks.
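+
+As a reference, a custom Autokeras block is typically a small subclass of `ak.Block`
+whose `build(hp, inputs)` method registers hyperparameters on `hp` and returns the
+output tensor(s). The sketch below is illustrative only: the `SketchBlock` name and the
+single-tensor input handling are assumptions, not part of this example, and the real
+block used here is the `VariableSelection` block defined next.
+
+```python
+import autokeras as ak
+from keras import layers
+
+
+class SketchBlock(ak.Block):
+    def build(self, hp, inputs=None):
+        # Assume a single tensor input for this illustration.
+        x = inputs[0] if isinstance(inputs, (list, tuple)) else inputs
+        # Expose a tunable hyperparameter to the tuner.
+        units = hp.Choice("sketch_units", [32, 64, 128], default=32)
+        return layers.Dense(units, activation="relu")(x)
+```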
+ +""" + + +class VariableSelection(ak.Block): + def __init__( + self, + num_units: Optional[Union[int, HyperParameters.Choice]] = None, + dropout_rate: Optional[Union[float, HyperParameters.Choice]] = None, + activation: Optional[Union[str, HyperParameters.Choice]] = None, + **kwargs, + ): + super().__init__(**kwargs) + self.dropout = utils.get_hyperparameter( + dropout_rate, + HyperParameters().Choice("dropout", [0.0, 0.25, 0.5], default=0.0), + float, + ) + self.num_units = utils.get_hyperparameter( + num_units, + HyperParameters().Choice( + "num_units", [16, 32, 64, 128, 256, 512, 1024], default=16 + ), + int, + ) + self.activation = utils.get_hyperparameter( + activation, + HyperParameters().Choice( + "vsn_activation", ["sigmoid", "elu"], default="sigmoid" + ), + str, + ) + + def build(self, hp, inputs): + num_units = utils.add_to_hp(self.num_units, hp, "num_units") + dropout_rate = utils.add_to_hp(self.dropout, hp, "dropout_rate") + activation = utils.add_to_hp(self.activation, hp, "activation") + concat_inputs = [] + # Project the features to 'num_units' dimension + for input_ in inputs: + if input_ in CATEGORICAL_FEATURES_WITH_VOCABULARY: + concat_inputs.append( + keras.layers.Dense(units=num_units)(inputs[input_]) + ) + else: + # Create a Normalization layer for our feature + normalizer = layers.Normalization() + # Prepare a Dataset that only yields our feature + feature_ds = train_ds.map(lambda x, y: x[input_]).map( + lambda x: keras.ops.expand_dims(x, -1) + ) + # Learn the statistics of the data + normalizer.adapt(feature_ds) + # Normalize the input feature + normal_feature = normalizer(inputs[input_]) + concat_inputs.append( + keras.layers.Dense(units=num_units)(normal_feature) + ) + v = layers.concatenate(concat_inputs) + v = GatedResidualNetwork( + num_units=num_units, dropout_rate=dropout_rate, activation=activation + )(v, hp=hp) + v = keras.ops.expand_dims( + layers.Dense(units=len(inputs), activation=activation)(v), axis=-1 + ) + x = [] + x += [ + GatedResidualNetwork(num_units, dropout_rate, activation)(i, hp=hp) + for i in concat_inputs + ] + x = keras.ops.stack(x, axis=1) + return keras.ops.squeeze( + keras.ops.matmul(keras.ops.transpose(v, axes=[0, 2, 1]), x), axis=1 + ) + + +""" + +# We create the HyperModel (from KerasTuner) Inputs which will be built into Keras Input objects + +""" + + +# Categorical features have different shapes after the encoding, dependent on the +# vocabulary or unique values of each feature. We create them accordinly to match the +# input data elements generated by tf.data.Dataset after pre-processing them +def create_model_inputs(): + inputs = { + f: ( + Input( + name=f, + shape=(len(CATEGORICAL_FEATURES_WITH_VOCABULARY[f]),), + dtype="int64", + ) + if f in CATEGORICAL_FEATURES_WITH_VOCABULARY + else Input(name=f, shape=(1,), dtype="float32") + ) + for f in FEATURE_NAMES + } + return inputs + + +""" + +## KerasTuner `HyperModel` + +Here we use the Autokeras `Functional` API to construct a network of BlocksSSS which will +be built into a KerasTuner HyperModel and finally to a Keras Model. + +""" + + +class MyHyperModel(keras_tuner.HyperModel): + + def build(self, hp): + inputs = create_model_inputs() + features = VariableSelection()(inputs) + outputs = ClassifierHead(num_classes=2, multi_label=False)(features) + model = Graph(inputs=inputs, outputs=outputs) + model = model.build(hp) + return model + + def fit(self, hp, model, *args, **kwargs): + return model.fit( + *args, + # Tune whether to shuffle the data in each epoch. 
+ shuffle=hp.Boolean("shuffle"), + **kwargs, + ) + + +""" + +## Using `RandomSearch` Tuner to find best HyperParameters + +We use the RandomSearch tuner to serach for hyparameters in the search space +We also display the search space + +""" + +print("Start training and searching for the best model...") + +tuner = keras_tuner.RandomSearch( + MyHyperModel(), + objective="val_accuracy", + max_trials=3, + overwrite=True, + directory="my_dir", + project_name="tune_hypermodel", +) + +# Show the search space summary +print("Tuner search space summary:\n") +tuner.search_space_summary() +# Search for best model +tuner.search(train_ds, epochs=2, validation_data=val_ds) + +""" +## Extracting the best model +""" + +# Get the top model. +models = tuner.get_best_models(num_models=1) +best_model = models[0] +best_model.summary() + + +""" +## Inference on new data + +To get a prediction for a new sample, you can simply call `model.predict()`. There are +just two things you need to do: + +1. wrap scalars into a list so as to have a batch dimension (models only process batches +of data, not single samples) +2. Call `convert_to_tensor` on each feature +""" + +sample = { + "age": 60, + "sex": 1, + "cp": 1, + "trestbps": 145, + "chol": 233, + "fbs": 1, + "restecg": 2, + "thalach": 150, + "exang": 0, + "oldpeak": 2.3, + "slope": 3, + "ca": 0, + "thal": "fixed", +} + + +# Given the category (in the sample above - key) and the category value (in the sample above - value), +# we return its one-hot encoding +def get_cat_encoding(cat, cat_value): + # Create a list of zeros with the same length as categories + encoding = [0] * len(cat) + # Find the index of category_value in categories and set the corresponding position to 1 + if cat_value in cat: + encoding[cat.index(cat_value)] = 1 + return encoding + + +for name, value in sample.items(): + if name in CATEGORICAL_FEATURES_WITH_VOCABULARY: + sample.update( + { + name: get_cat_encoding( + CATEGORICAL_FEATURES_WITH_VOCABULARY[name], sample[name] + ) + } + ) +# Convert inputs to tensors +input_dict = {name: tf.convert_to_tensor([value]) for name, value in sample.items()} +predictions = best_model.predict(input_dict) + +print( + f"This particular patient had a {100 * predictions[0][0]:.1f} " + "percent probability of having a heart disease, " + "as evaluated by our model." +) diff --git a/knowledge_base/structured_data/classification_with_grn_and_vsn.py b/knowledge_base/structured_data/classification_with_grn_and_vsn.py new file mode 100644 index 0000000000000000000000000000000000000000..1209cb137b1cef0b9f5dcf13707db6e483091d5d --- /dev/null +++ b/knowledge_base/structured_data/classification_with_grn_and_vsn.py @@ -0,0 +1,529 @@ +""" +Title: Classification with Gated Residual and Variable Selection Networks +Author: [Khalid Salama](https://www.linkedin.com/in/khalid-salama-24403144/) +Date created: 2021/02/10 +Last modified: 2025/01/08 +Description: Using Gated Residual and Variable Selection Networks for income level prediction. +Accelerator: GPU +Converted to Keras 3 by: [Sitam Meur](https://github.com/sitamgithub-MSIT) and made backend-agnostic by: [Humbulani Ndou](https://github.com/Humbulani1234) +""" + +""" +## Introduction + +This example demonstrates the use of Gated +Residual Networks (GRN) and Variable Selection Networks (VSN), proposed by +Bryan Lim et al. in +[Temporal Fusion Transformers (TFT) for Interpretable Multi-horizon Time Series Forecasting](https://arxiv.org/abs/1912.09363), +for structured data classification. 
GRNs give the flexibility to the model to apply +non-linear processing only where needed. VSNs allow the model to softly remove any +unnecessary noisy inputs which could negatively impact performance. +Together, those techniques help improving the learning capacity of deep neural +network models. + +Note that this example implements only the GRN and VSN components described in +in the paper, rather than the whole TFT model, as GRN and VSN can be useful on +their own for structured data learning tasks. + + +To run the code you need to use TensorFlow 2.3 or higher. +""" + +""" +## The dataset + +This example uses the +[United States Census Income Dataset](https://archive.ics.uci.edu/ml/datasets/Census-Income+%28KDD%29) +provided by the +[UC Irvine Machine Learning Repository](https://archive.ics.uci.edu/ml/index.php). +The task is binary classification to determine whether a person makes over 50K a year. + +The dataset includes ~300K instances with 41 input features: 7 numerical features +and 34 categorical features. +""" + +""" +## Setup +""" + +import os +import subprocess +import tarfile + +os.environ["KERAS_BACKEND"] = "torch" # or jax, or tensorflow + +import numpy as np +import pandas as pd +import keras +from keras import layers + +""" +## Prepare the data + +First we load the data from the UCI Machine Learning Repository into a Pandas DataFrame. +""" + +# Column names. +CSV_HEADER = [ + "age", + "class_of_worker", + "detailed_industry_recode", + "detailed_occupation_recode", + "education", + "wage_per_hour", + "enroll_in_edu_inst_last_wk", + "marital_stat", + "major_industry_code", + "major_occupation_code", + "race", + "hispanic_origin", + "sex", + "member_of_a_labor_union", + "reason_for_unemployment", + "full_or_part_time_employment_stat", + "capital_gains", + "capital_losses", + "dividends_from_stocks", + "tax_filer_stat", + "region_of_previous_residence", + "state_of_previous_residence", + "detailed_household_and_family_stat", + "detailed_household_summary_in_household", + "instance_weight", + "migration_code-change_in_msa", + "migration_code-change_in_reg", + "migration_code-move_within_reg", + "live_in_this_house_1_year_ago", + "migration_prev_res_in_sunbelt", + "num_persons_worked_for_employer", + "family_members_under_18", + "country_of_birth_father", + "country_of_birth_mother", + "country_of_birth_self", + "citizenship", + "own_business_or_self_employed", + "fill_inc_questionnaire_for_veterans_admin", + "veterans_benefits", + "weeks_worked_in_year", + "year", + "income_level", +] + +data_url = "https://archive.ics.uci.edu/static/public/117/census+income+kdd.zip" +keras.utils.get_file(origin=data_url, extract=True) + +""" +Determine the downloaded .tar.gz file path and +extract the files from the downloaded .tar.gz file +""" + +extracted_path = os.path.join( + os.path.expanduser("~"), ".keras", "datasets", "census+income+kdd.zip" +) +for root, dirs, files in os.walk(extracted_path): + for file in files: + if file.endswith(".tar.gz"): + tar_gz_path = os.path.join(root, file) + with tarfile.open(tar_gz_path, "r:gz") as tar: + tar.extractall(path=root) + +train_data_path = os.path.join( + os.path.expanduser("~"), + ".keras", + "datasets", + "census+income+kdd.zip", + "census-income.data", +) +test_data_path = os.path.join( + os.path.expanduser("~"), + ".keras", + "datasets", + "census+income+kdd.zip", + "census-income.test", +) + +data = pd.read_csv(train_data_path, header=None, names=CSV_HEADER) +test_data = pd.read_csv(test_data_path, header=None, names=CSV_HEADER) + 
+print(f"Data shape: {data.shape}") +print(f"Test data shape: {test_data.shape}") + + +""" +We convert the target column from string to integer. +""" + +data["income_level"] = data["income_level"].apply( + lambda x: 0 if x == " - 50000." else 1 +) +test_data["income_level"] = test_data["income_level"].apply( + lambda x: 0 if x == " - 50000." else 1 +) + + +""" +Then, We split the dataset into train and validation sets. +""" + +random_selection = np.random.rand(len(data.index)) <= 0.85 +train_data = data[random_selection] +valid_data = data[~random_selection] + + +""" +Finally we store the train and test data splits locally to CSV files. +""" + +train_data_file = "train_data.csv" +valid_data_file = "valid_data.csv" +test_data_file = "test_data.csv" + +train_data.to_csv(train_data_file, index=False, header=False) +valid_data.to_csv(valid_data_file, index=False, header=False) +test_data.to_csv(test_data_file, index=False, header=False) + +""" +## Define dataset metadata + +Here, we define the metadata of the dataset that will be useful for reading and +parsing the data into input features, and encoding the input features with respect +to their types. +""" + +# Target feature name. +TARGET_FEATURE_NAME = "income_level" +# Weight column name. +WEIGHT_COLUMN_NAME = "instance_weight" +# Numeric feature names. +NUMERIC_FEATURE_NAMES = [ + "age", + "wage_per_hour", + "capital_gains", + "capital_losses", + "dividends_from_stocks", + "num_persons_worked_for_employer", + "weeks_worked_in_year", +] +# Categorical features and their vocabulary lists. +# Note that we add 'v=' as a prefix to all categorical feature values to make +# sure that they are treated as strings. +CATEGORICAL_FEATURES_WITH_VOCABULARY = { + feature_name: sorted([str(value) for value in list(data[feature_name].unique())]) + for feature_name in CSV_HEADER + if feature_name + not in list(NUMERIC_FEATURE_NAMES + [WEIGHT_COLUMN_NAME, TARGET_FEATURE_NAME]) +} +# All features names. +FEATURE_NAMES = NUMERIC_FEATURE_NAMES + list( + CATEGORICAL_FEATURES_WITH_VOCABULARY.keys() +) +# Feature default values. +COLUMN_DEFAULTS = [ + ( + [0.0] + if feature_name + in NUMERIC_FEATURE_NAMES + [TARGET_FEATURE_NAME, WEIGHT_COLUMN_NAME] + else ["NA"] + ) + for feature_name in CSV_HEADER +] + +""" +## Create a `tf.data.Dataset` for training and evaluation + +We create an input function to read and parse the file, and convert features and +labels into a [`tf.data.Dataset`](https://www.tensorflow.org/guide/datasets) for +training and evaluation. +""" + +# Tensorflow required for tf.data.Datasets +import tensorflow as tf + + +# We process our datasets elements here (categorical) and convert them to indices to avoid this step +# during model training since only tensorflow support strings. +def process(features, target): + for feature_name in features: + if feature_name in CATEGORICAL_FEATURES_WITH_VOCABULARY: + # Cast categorical feature values to string. + features[feature_name] = tf.cast(features[feature_name], "string") + vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name] + # Create a lookup to convert a string values to an integer indices. + # Since we are not using a mask token nor expecting any out of vocabulary + # (oov) token, we set mask_token to None and num_oov_indices to 0. + index = layers.StringLookup( + vocabulary=vocabulary, + mask_token=None, + num_oov_indices=0, + output_mode="int", + ) + # Convert the string input values into integer indices. 
+ value_index = index(features[feature_name]) + features[feature_name] = value_index + else: + # Do nothing for numerical features + pass + + # Get the instance weight. + weight = features.pop(WEIGHT_COLUMN_NAME) + # Change features from OrderedDict to Dict to match Inputs as they are Dict. + return dict(features), target, weight + + +def get_dataset_from_csv(csv_file_path, shuffle=False, batch_size=128): + dataset = tf.data.experimental.make_csv_dataset( + csv_file_path, + batch_size=batch_size, + column_names=CSV_HEADER, + column_defaults=COLUMN_DEFAULTS, + label_name=TARGET_FEATURE_NAME, + num_epochs=1, + header=False, + shuffle=shuffle, + ).map(process) + + return dataset + + +""" +## Create model inputs +""" + + +def create_model_inputs(): + inputs = {} + for feature_name in FEATURE_NAMES: + if feature_name in CATEGORICAL_FEATURES_WITH_VOCABULARY: + # Make them int64, they are Categorical (whole units) + inputs[feature_name] = layers.Input( + name=feature_name, shape=(), dtype="int64" + ) + else: + # Make them float32, they are Real numbers + inputs[feature_name] = layers.Input( + name=feature_name, shape=(), dtype="float32" + ) + return inputs + + +""" +## Implement the Gated Linear Unit + +[Gated Linear Units (GLUs)](https://arxiv.org/abs/1612.08083) provide the +flexibility to suppress input that are not relevant for a given task. +""" + + +class GatedLinearUnit(layers.Layer): + def __init__(self, units): + super().__init__() + self.linear = layers.Dense(units) + self.sigmoid = layers.Dense(units, activation="sigmoid") + + def call(self, inputs): + return self.linear(inputs) * self.sigmoid(inputs) + + # Remove build warnings + def build(self): + self.built = True + + +""" +## Implement the Gated Residual Network + +The Gated Residual Network (GRN) works as follows: + +1. Applies the nonlinear ELU transformation to the inputs. +2. Applies linear transformation followed by dropout. +4. Applies GLU and adds the original inputs to the output of the GLU to perform skip +(residual) connection. +6. Applies layer normalization and produces the output. +""" + + +class GatedResidualNetwork(layers.Layer): + def __init__(self, units, dropout_rate): + super().__init__() + self.units = units + self.elu_dense = layers.Dense(units, activation="elu") + self.linear_dense = layers.Dense(units) + self.dropout = layers.Dropout(dropout_rate) + self.gated_linear_unit = GatedLinearUnit(units) + self.layer_norm = layers.LayerNormalization() + self.project = layers.Dense(units) + + def call(self, inputs): + x = self.elu_dense(inputs) + x = self.linear_dense(x) + x = self.dropout(x) + if inputs.shape[-1] != self.units: + inputs = self.project(inputs) + x = inputs + self.gated_linear_unit(x) + x = self.layer_norm(x) + return x + + # Remove build warnings + def build(self): + self.built = True + + +""" +## Implement the Variable Selection Network + +The Variable Selection Network (VSN) works as follows: + +1. Applies a GRN to each feature individually. +2. Applies a GRN on the concatenation of all the features, followed by a softmax to +produce feature weights. +3. Produces a weighted sum of the output of the individual GRN. + +Note that the output of the VSN is [batch_size, encoding_size], regardless of the +number of the input features. + +For categorical features, we encode them using `layers.Embedding` using the +`encoding_size` as the embedding dimensions. For the numerical features, +we apply linear transformation using `layers.Dense` to project each feature into +`encoding_size`-dimensional vector. 
Thus, all the encoded features will have the +same dimensionality. + +""" + + +class VariableSelection(layers.Layer): + def __init__(self, num_features, units, dropout_rate): + super().__init__() + self.units = units + # Create an embedding layers with the specified dimensions + self.embeddings = dict() + for input_ in CATEGORICAL_FEATURES_WITH_VOCABULARY: + vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[input_] + embedding_encoder = layers.Embedding( + input_dim=len(vocabulary), output_dim=self.units, name=input_ + ) + self.embeddings[input_] = embedding_encoder + + # Projection layers for numeric features + self.proj_layer = dict() + for input_ in NUMERIC_FEATURE_NAMES: + proj_layer = layers.Dense(units=self.units) + self.proj_layer[input_] = proj_layer + + self.grns = list() + # Create a GRN for each feature independently + for idx in range(num_features): + grn = GatedResidualNetwork(units, dropout_rate) + self.grns.append(grn) + # Create a GRN for the concatenation of all the features + self.grn_concat = GatedResidualNetwork(units, dropout_rate) + self.softmax = layers.Dense(units=num_features, activation="softmax") + + def call(self, inputs): + concat_inputs = [] + for input_ in inputs: + if input_ in CATEGORICAL_FEATURES_WITH_VOCABULARY: + max_index = self.embeddings[input_].input_dim - 1 # Clamp the indices + # torch had some index errors during embedding hence the clip function + embedded_feature = self.embeddings[input_]( + keras.ops.clip(inputs[input_], 0, max_index) + ) + concat_inputs.append(embedded_feature) + else: + # Project the numeric feature to encoding_size using linear transformation. + proj_feature = keras.ops.expand_dims(inputs[input_], -1) + proj_feature = self.proj_layer[input_](proj_feature) + concat_inputs.append(proj_feature) + + v = layers.concatenate(concat_inputs) + v = self.grn_concat(v) + v = keras.ops.expand_dims(self.softmax(v), axis=-1) + x = [] + for idx, input in enumerate(concat_inputs): + x.append(self.grns[idx](input)) + x = keras.ops.stack(x, axis=1) + return keras.ops.squeeze( + keras.ops.matmul(keras.ops.transpose(v, axes=[0, 2, 1]), x), axis=1 + ) + + # to remove the build warnings + def build(self): + self.built = True + + +""" +## Create Gated Residual and Variable Selection Networks model +""" + + +def create_model(encoding_size): + inputs = create_model_inputs() + num_features = len(inputs) + features = VariableSelection(num_features, encoding_size, dropout_rate)(inputs) + outputs = layers.Dense(units=1, activation="sigmoid")(features) + # Functional model + model = keras.Model(inputs=inputs, outputs=outputs) + return model + + +""" +## Compile, train, and evaluate the model +""" + +learning_rate = 0.001 +dropout_rate = 0.15 +batch_size = 265 +num_epochs = 20 # may be adjusted to a desired value +encoding_size = 16 + +model = create_model(encoding_size) +model.compile( + optimizer=keras.optimizers.Adam(learning_rate=learning_rate), + loss=keras.losses.BinaryCrossentropy(), + metrics=[keras.metrics.BinaryAccuracy(name="accuracy")], +) + +""" +Let's visualize our connectivity graph: +""" + +# `rankdir='LR'` is to make the graph horizontal. +keras.utils.plot_model(model, show_shapes=True, show_layer_names=True, rankdir="LR") + + +# Create an early stopping callback. 
+early_stopping = keras.callbacks.EarlyStopping( + monitor="val_loss", patience=5, restore_best_weights=True +) + +print("Start training the model...") +train_dataset = get_dataset_from_csv( + train_data_file, shuffle=True, batch_size=batch_size +) +valid_dataset = get_dataset_from_csv(valid_data_file, batch_size=batch_size) +model.fit( + train_dataset, + epochs=num_epochs, + validation_data=valid_dataset, + callbacks=[early_stopping], +) +print("Model training finished.") + +print("Evaluating model performance...") +test_dataset = get_dataset_from_csv(test_data_file, batch_size=batch_size) +_, accuracy = model.evaluate(test_dataset) +print(f"Test accuracy: {round(accuracy * 100, 2)}%") + +""" +You should achieve more than 95% accuracy on the test set. + +To increase the learning capacity of the model, you can try increasing the +`encoding_size` value, or stacking multiple GRN layers on top of the VSN layer. +This may require to also increase the `dropout_rate` value to avoid overfitting. +""" + +""" +**Example available on HuggingFace** + +| Trained Model | Demo | +| :--: | :--: | +| [![Generic badge](https://img.shields.io/badge/%F0%9F%A4%97%20Model-Classification%20With%20GRN%20%26%20VSN-red)](https://huggingface.co/keras-io/structured-data-classification-grn-vsn) | [![Generic badge](https://img.shields.io/badge/%F0%9F%A4%97%20Space-Classification%20With%20GRN%20%26%20VSN-red)](https://huggingface.co/spaces/keras-io/structured-data-classification-grn-vsn) | +""" diff --git a/knowledge_base/structured_data/classification_with_tfdf.py b/knowledge_base/structured_data/classification_with_tfdf.py new file mode 100644 index 0000000000000000000000000000000000000000..e61285809f46ec4ad1a635bb8f9d643d4bfc2f36 --- /dev/null +++ b/knowledge_base/structured_data/classification_with_tfdf.py @@ -0,0 +1,670 @@ +""" +Title: Classification with TensorFlow Decision Forests +Author: [Khalid Salama](https://www.linkedin.com/in/khalid-salama-24403144/) +Date created: 2022/01/25 +Last modified: 2022/01/25 +Description: Using TensorFlow Decision Forests for structured data classification. +Accelerator: GPU +""" + +""" +## Introduction + +[TensorFlow Decision Forests](https://www.tensorflow.org/decision_forests) +is a collection of state-of-the-art algorithms of Decision Forest models +that are compatible with Keras APIs. +The models include [Random Forests](https://www.tensorflow.org/decision_forests/api_docs/python/tfdf/keras/RandomForestModel), +[Gradient Boosted Trees](https://www.tensorflow.org/decision_forests/api_docs/python/tfdf/keras/GradientBoostedTreesModel), +and [CART](https://www.tensorflow.org/decision_forests/api_docs/python/tfdf/keras/CartModel), +and can be used for regression, classification, and ranking task. +For a beginner's guide to TensorFlow Decision Forests, +please refer to this [tutorial](https://www.tensorflow.org/decision_forests/tutorials/beginner_colab). + + +This example uses Gradient Boosted Trees model in binary classification of +structured data, and covers the following scenarios: + +1. Build a decision forests model by specifying the input feature usage. +2. Implement a custom *Binary Target encoder* as a [Keras Preprocessing layer](https://keras.io/api/layers/preprocessing_layers/) +to encode the categorical features with respect to their target value co-occurrences, +and then use the encoded features to build a decision forests model. +3. 
Encode the categorical features as [embeddings](https://keras.io/api/layers/core_layers/embedding), +train these embeddings in a simple NN model, and then use the +trained embeddings as inputs to build decision forests model. + +This example uses TensorFlow 2.7 or higher, +as well as [TensorFlow Decision Forests](https://www.tensorflow.org/decision_forests), +which you can install using the following command: + +```python +pip install -U tensorflow_decision_forests +``` +""" + +""" +## Setup +""" + +import math +import urllib +import numpy as np +import pandas as pd +import tensorflow as tf +from tensorflow import keras +from tensorflow.keras import layers +import tensorflow_decision_forests as tfdf + +""" +## Prepare the data + +This example uses the +[United States Census Income Dataset](https://archive.ics.uci.edu/ml/datasets/Census-Income+%28KDD%29) +provided by the [UC Irvine Machine Learning Repository](https://archive.ics.uci.edu/ml/index.php). +The task is binary classification to determine whether a person makes over 50K a year. + +The dataset includes ~300K instances with 41 input features: 7 numerical features +and 34 categorical features. + +First we load the data from the UCI Machine Learning Repository into a Pandas DataFrame. +""" + +BASE_PATH = "https://kdd.ics.uci.edu/databases/census-income/census-income" +CSV_HEADER = [ + l.decode("utf-8").split(":")[0].replace(" ", "_") + for l in urllib.request.urlopen(f"{BASE_PATH}.names") + if not l.startswith(b"|") +][2:] +CSV_HEADER.append("income_level") + +train_data = pd.read_csv( + f"{BASE_PATH}.data.gz", + header=None, + names=CSV_HEADER, +) +test_data = pd.read_csv( + f"{BASE_PATH}.test.gz", + header=None, + names=CSV_HEADER, +) + +""" +## Define dataset metadata + +Here, we define the metadata of the dataset that will be useful for encoding +the input features with respect to their types. +""" + +# Target column name. +TARGET_COLUMN_NAME = "income_level" +# The labels of the target columns. +TARGET_LABELS = [" - 50000.", " 50000+."] +# Weight column name. +WEIGHT_COLUMN_NAME = "instance_weight" +# Numeric feature names. +NUMERIC_FEATURE_NAMES = [ + "age", + "wage_per_hour", + "capital_gains", + "capital_losses", + "dividends_from_stocks", + "num_persons_worked_for_employer", + "weeks_worked_in_year", +] +# Categorical features and their vocabulary lists. +CATEGORICAL_FEATURE_NAMES = [ + "class_of_worker", + "detailed_industry_recode", + "detailed_occupation_recode", + "education", + "enroll_in_edu_inst_last_wk", + "marital_stat", + "major_industry_code", + "major_occupation_code", + "race", + "hispanic_origin", + "sex", + "member_of_a_labor_union", + "reason_for_unemployment", + "full_or_part_time_employment_stat", + "tax_filer_stat", + "region_of_previous_residence", + "state_of_previous_residence", + "detailed_household_and_family_stat", + "detailed_household_summary_in_household", + "migration_code-change_in_msa", + "migration_code-change_in_reg", + "migration_code-move_within_reg", + "live_in_this_house_1_year_ago", + "migration_prev_res_in_sunbelt", + "family_members_under_18", + "country_of_birth_father", + "country_of_birth_mother", + "country_of_birth_self", + "citizenship", + "own_business_or_self_employed", + "fill_inc_questionnaire_for_veteran's_admin", + "veterans_benefits", + "year", +] + + +""" +Now we perform basic data preparation. +""" + + +def prepare_dataframe(dataframe): + # Convert the target labels from string to integer. 
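+    # `TARGET_LABELS.index` maps " - 50000." to 0 and " 50000+." to 1.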
+ dataframe[TARGET_COLUMN_NAME] = dataframe[TARGET_COLUMN_NAME].map( + TARGET_LABELS.index + ) + # Cast the categorical features to string. + for feature_name in CATEGORICAL_FEATURE_NAMES: + dataframe[feature_name] = dataframe[feature_name].astype(str) + + +prepare_dataframe(train_data) +prepare_dataframe(test_data) + +""" +Now let's show the shapes of the training and test dataframes, and display some instances. +""" + +print(f"Train data shape: {train_data.shape}") +print(f"Test data shape: {test_data.shape}") +print(train_data.head().T) + +""" +## Configure hyperparameters + +You can find all the parameters of the Gradient Boosted Tree model in the +[documentation](https://www.tensorflow.org/decision_forests/api_docs/python/tfdf/keras/GradientBoostedTreesModel) +""" + +# Maximum number of decision trees. The effective number of trained trees can be smaller if early stopping is enabled. +NUM_TREES = 250 +# Minimum number of examples in a node. +MIN_EXAMPLES = 6 +# Maximum depth of the tree. max_depth=1 means that all trees will be roots. +MAX_DEPTH = 5 +# Ratio of the dataset (sampling without replacement) used to train individual trees for the random sampling method. +SUBSAMPLE = 0.65 +# Control the sampling of the datasets used to train individual trees. +SAMPLING_METHOD = "RANDOM" +# Ratio of the training dataset used to monitor the training. Require to be >0 if early stopping is enabled. +VALIDATION_RATIO = 0.1 + +""" +## Implement a training and evaluation procedure + +The `run_experiment()` method is responsible loading the train and test datasets, +training a given model, and evaluating the trained model. + +Note that when training a Decision Forests model, only one epoch is needed to +read the full dataset. Any extra steps will result in unnecessary slower training. +Therefore, the default `num_epochs=1` is used in the `run_experiment()` method. +""" + + +def run_experiment(model, train_data, test_data, num_epochs=1, batch_size=None): + train_dataset = tfdf.keras.pd_dataframe_to_tf_dataset( + train_data, label=TARGET_COLUMN_NAME, weight=WEIGHT_COLUMN_NAME + ) + test_dataset = tfdf.keras.pd_dataframe_to_tf_dataset( + test_data, label=TARGET_COLUMN_NAME, weight=WEIGHT_COLUMN_NAME + ) + + model.fit(train_dataset, epochs=num_epochs, batch_size=batch_size) + _, accuracy = model.evaluate(test_dataset, verbose=0) + print(f"Test accuracy: {round(accuracy * 100, 2)}%") + + +""" +## Experiment 1: Decision Forests with raw features +""" + +""" +### Specify model input feature usages + +You can attach semantics to each feature to control how it is used by the model. +If not specified, the semantics are inferred from the representation type. +It is recommended to specify the [feature usages](https://www.tensorflow.org/decision_forests/api_docs/python/tfdf/keras/FeatureUsage) +explicitly to avoid incorrect inferred semantics is incorrect. +For example, a categorical value identifier (integer) will be be inferred as numerical, +while it is semantically categorical. + +For numerical features, you can set the `discretized` parameters to the number +of buckets by which the numerical feature should be discretized. +This makes the training faster but may lead to worse models. 
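+
+A hedged sketch of these two knobs (the `discretized` value below is an arbitrary choice,
+and the feature names are simply taken from this dataset):
+
+```python
+# Treat "age" as numerical, but bucketize it into 255 bins to speed up training.
+age_usage = tfdf.keras.FeatureUsage(name="age", discretized=255)
+# Force an integer-coded identifier to be treated as categorical.
+occupation_usage = tfdf.keras.FeatureUsage(
+    name="detailed_occupation_recode",
+    semantic=tfdf.keras.FeatureSemantic.CATEGORICAL,
+)
+```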
+""" + + +def specify_feature_usages(): + feature_usages = [] + + for feature_name in NUMERIC_FEATURE_NAMES: + feature_usage = tfdf.keras.FeatureUsage( + name=feature_name, semantic=tfdf.keras.FeatureSemantic.NUMERICAL + ) + feature_usages.append(feature_usage) + + for feature_name in CATEGORICAL_FEATURE_NAMES: + feature_usage = tfdf.keras.FeatureUsage( + name=feature_name, semantic=tfdf.keras.FeatureSemantic.CATEGORICAL + ) + feature_usages.append(feature_usage) + + return feature_usages + + +""" +### Create a Gradient Boosted Trees model + +When compiling a decision forests model, you may only provide extra evaluation metrics. +The loss is specified in the model construction, +and the optimizer is irrelevant to decision forests models. +""" + + +def create_gbt_model(): + # See all the model parameters in https://www.tensorflow.org/decision_forests/api_docs/python/tfdf/keras/GradientBoostedTreesModel + gbt_model = tfdf.keras.GradientBoostedTreesModel( + features=specify_feature_usages(), + exclude_non_specified_features=True, + num_trees=NUM_TREES, + max_depth=MAX_DEPTH, + min_examples=MIN_EXAMPLES, + subsample=SUBSAMPLE, + validation_ratio=VALIDATION_RATIO, + task=tfdf.keras.Task.CLASSIFICATION, + ) + + gbt_model.compile(metrics=[keras.metrics.BinaryAccuracy(name="accuracy")]) + return gbt_model + + +""" +### Train and evaluate the model +""" + +gbt_model = create_gbt_model() +run_experiment(gbt_model, train_data, test_data) + +""" +### Inspect the model + +The `model.summary()` method will display several types of information about +your decision trees model, model type, task, input features, and feature importance. +""" + +print(gbt_model.summary()) + +""" +## Experiment 2: Decision Forests with target encoding + +[Target encoding](https://dl.acm.org/doi/10.1145/507533.507538) is a common preprocessing +technique for categorical features that convert them into numerical features. +Using categorical features with high cardinality as-is may lead to overfitting. +Target encoding aims to replace each categorical feature value with one or more +numerical values that represent its co-occurrence with the target labels. + +More precisely, given a categorical feature, the binary target encoder in this example +will produce three new numerical features: + +1. `positive_frequency`: How many times each feature value occurred with a positive target label. +2. `negative_frequency`: How many times each feature value occurred with a negative target label. +3. `positive_probability`: The probability that the target label is positive, +given the feature value, which is computed as +`positive_frequency / (positive_frequency + negative_frequency + correction)`. +The `correction` term is added in to make the division more stable for rare categorical values. +The default value for `correction` is 1.0. + + + +Note that target encoding is effective with models that cannot automatically +learn dense representations to categorical features, such as decision forests +or kernel methods. If neural network models are used, its recommended to +encode categorical features as embeddings. +""" + +""" +### Implement Binary Target Encoder + +For simplicity, we assume that the inputs for the `adapt` and `call` methods +are in the expected data types and shapes, so no validation logic is added. + +It is recommended to pass the `vocabulary_size` of the categorical feature to the +`BinaryTargetEncoding` constructor. If not specified, it will be computed during +the `adapt()` method execution. 
+""" + + +class BinaryTargetEncoding(layers.Layer): + def __init__(self, vocabulary_size=None, correction=1.0, **kwargs): + super().__init__(**kwargs) + self.vocabulary_size = vocabulary_size + self.correction = correction + + def adapt(self, data): + # data is expected to be an integer numpy array to a Tensor shape [num_exmples, 2]. + # This contains feature values for a given feature in the dataset, and target values. + + # Convert the data to a tensor. + data = tf.convert_to_tensor(data) + # Separate the feature values and target values + feature_values = tf.cast(data[:, 0], tf.dtypes.int32) + target_values = tf.cast(data[:, 1], tf.dtypes.bool) + + # Compute the vocabulary_size of not specified. + if self.vocabulary_size is None: + self.vocabulary_size = tf.unique(feature_values).y.shape[0] + + # Filter the data where the target label is positive. + positive_indices = tf.where(condition=target_values) + positive_feature_values = tf.gather_nd( + params=feature_values, indices=positive_indices + ) + # Compute how many times each feature value occurred with a positive target label. + positive_frequency = tf.math.unsorted_segment_sum( + data=tf.ones( + shape=(positive_feature_values.shape[0], 1), dtype=tf.dtypes.float64 + ), + segment_ids=positive_feature_values, + num_segments=self.vocabulary_size, + ) + + # Filter the data where the target label is negative. + negative_indices = tf.where(condition=tf.math.logical_not(target_values)) + negative_feature_values = tf.gather_nd( + params=feature_values, indices=negative_indices + ) + # Compute how many times each feature value occurred with a negative target label. + negative_frequency = tf.math.unsorted_segment_sum( + data=tf.ones( + shape=(negative_feature_values.shape[0], 1), dtype=tf.dtypes.float64 + ), + segment_ids=negative_feature_values, + num_segments=self.vocabulary_size, + ) + # Compute positive probability for the input feature values. + positive_probability = positive_frequency / ( + positive_frequency + negative_frequency + self.correction + ) + # Concatenate the computed statistics for traget_encoding. + target_encoding_statistics = tf.cast( + tf.concat( + [positive_frequency, negative_frequency, positive_probability], axis=1 + ), + dtype=tf.dtypes.float32, + ) + self.target_encoding_statistics = tf.constant(target_encoding_statistics) + + def call(self, inputs): + # inputs is expected to be an integer numpy array to a Tensor shape [num_exmples, 1]. + # This includes the feature values for a given feature in the dataset. + + # Raise an error if the target encoding statistics are not computed. + if self.target_encoding_statistics == None: + raise ValueError( + f"You need to call the adapt method to compute target encoding statistics." + ) + + # Convert the inputs to a tensor. + inputs = tf.convert_to_tensor(inputs) + # Cast the inputs int64 a tensor. + inputs = tf.cast(inputs, tf.dtypes.int64) + # Lookup target encoding statistics for the input feature values. 
+ target_encoding_statistics = tf.cast( + tf.gather_nd(self.target_encoding_statistics, inputs), + dtype=tf.dtypes.float32, + ) + return target_encoding_statistics + + +""" +Let's test the binary target encoder +""" + +data = tf.constant( + [ + [0, 1], + [2, 0], + [0, 1], + [1, 1], + [1, 1], + [2, 0], + [1, 0], + [0, 1], + [2, 1], + [1, 0], + [0, 1], + [2, 0], + [0, 1], + [1, 1], + [1, 1], + [2, 0], + [1, 0], + [0, 1], + [2, 0], + ] +) + +binary_target_encoder = BinaryTargetEncoding() +binary_target_encoder.adapt(data) +print(binary_target_encoder([[0], [1], [2]])) + +""" +### Create model inputs +""" + + +def create_model_inputs(): + inputs = {} + + for feature_name in NUMERIC_FEATURE_NAMES: + inputs[feature_name] = layers.Input( + name=feature_name, shape=(), dtype=tf.float32 + ) + + for feature_name in CATEGORICAL_FEATURE_NAMES: + inputs[feature_name] = layers.Input( + name=feature_name, shape=(), dtype=tf.string + ) + + return inputs + + +""" +### Implement a feature encoding with target encoding +""" + + +def create_target_encoder(): + inputs = create_model_inputs() + target_values = train_data[[TARGET_COLUMN_NAME]].to_numpy() + encoded_features = [] + for feature_name in inputs: + if feature_name in CATEGORICAL_FEATURE_NAMES: + # Get the vocabulary of the categorical feature. + vocabulary = sorted( + [str(value) for value in list(train_data[feature_name].unique())] + ) + # Create a lookup to convert string values to an integer indices. + # Since we are not using a mask token nor expecting any out of vocabulary + # (oov) token, we set mask_token to None and num_oov_indices to 0. + lookup = layers.StringLookup( + vocabulary=vocabulary, mask_token=None, num_oov_indices=0 + ) + # Convert the string input values into integer indices. + value_indices = lookup(inputs[feature_name]) + # Prepare the data to adapt the target encoding. + print("### Adapting target encoding for:", feature_name) + feature_values = train_data[[feature_name]].to_numpy().astype(str) + feature_value_indices = lookup(feature_values) + data = tf.concat([feature_value_indices, target_values], axis=1) + feature_encoder = BinaryTargetEncoding() + feature_encoder.adapt(data) + # Convert the feature value indices to target encoding representations. + encoded_feature = feature_encoder(tf.expand_dims(value_indices, -1)) + else: + # Expand the dimensions of the numerical input feature and use it as-is. + encoded_feature = tf.expand_dims(inputs[feature_name], -1) + # Add the encoded feature to the list. + encoded_features.append(encoded_feature) + # Concatenate all the encoded features. + encoded_features = tf.concat(encoded_features, axis=1) + # Create and return a Keras model with encoded features as outputs. + return keras.Model(inputs=inputs, outputs=encoded_features) + + +""" +### Create a Gradient Boosted Trees model with a preprocessor + +In this scenario, we use the target encoding as a preprocessor for the Gradient Boosted Tree model, +and let the model infer semantics of the input features. 
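+
+If you want to sanity-check what the preprocessor produces before handing it to the
+model, a hedged sketch (not required for training) is to call the encoder on a few
+training rows and inspect the encoded feature width:
+
+```python
+# Hedged sketch: run the target encoder on a small batch of raw feature values.
+# Assumes the dataframes and feature name lists defined earlier in this example.
+example_encoder = create_target_encoder()
+example_batch = {
+    name: train_data[name].head(4).to_numpy().astype("float32")
+    for name in NUMERIC_FEATURE_NAMES
+}
+example_batch.update(
+    {name: train_data[name].head(4).to_numpy() for name in CATEGORICAL_FEATURE_NAMES}
+)
+print(example_encoder(example_batch).shape)  # (4, total_encoded_feature_size)
+```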
+""" + + +def create_gbt_with_preprocessor(preprocessor): + gbt_model = tfdf.keras.GradientBoostedTreesModel( + preprocessing=preprocessor, + num_trees=NUM_TREES, + max_depth=MAX_DEPTH, + min_examples=MIN_EXAMPLES, + subsample=SUBSAMPLE, + validation_ratio=VALIDATION_RATIO, + task=tfdf.keras.Task.CLASSIFICATION, + ) + + gbt_model.compile(metrics=[keras.metrics.BinaryAccuracy(name="accuracy")]) + + return gbt_model + + +""" +### Train and evaluate the model +""" + +gbt_model = create_gbt_with_preprocessor(create_target_encoder()) +run_experiment(gbt_model, train_data, test_data) + +""" +## Experiment 3: Decision Forests with trained embeddings + +In this scenario, we build an encoder model that codes the categorical +features to embeddings, where the size of the embedding for a given categorical +feature is the square root to the size of its vocabulary. + +We train these embeddings in a simple NN model through backpropagation. +After the embedding encoder is trained, we used it as a preprocessor to the +input features of a Gradient Boosted Tree model. + +Note that the embeddings and a decision forest model cannot be trained +synergically in one phase, since decision forest models do not train with backpropagation. +Rather, embeddings has to be trained in an initial phase, +and then used as static inputs to the decision forest model. +""" + +""" +### Implement feature encoding with embeddings +""" + + +def create_embedding_encoder(size=None): + inputs = create_model_inputs() + encoded_features = [] + for feature_name in inputs: + if feature_name in CATEGORICAL_FEATURE_NAMES: + # Get the vocabulary of the categorical feature. + vocabulary = sorted( + [str(value) for value in list(train_data[feature_name].unique())] + ) + # Create a lookup to convert string values to an integer indices. + # Since we are not using a mask token nor expecting any out of vocabulary + # (oov) token, we set mask_token to None and num_oov_indices to 0. + lookup = layers.StringLookup( + vocabulary=vocabulary, mask_token=None, num_oov_indices=0 + ) + # Convert the string input values into integer indices. + value_index = lookup(inputs[feature_name]) + # Create an embedding layer with the specified dimensions + vocabulary_size = len(vocabulary) + embedding_size = int(math.sqrt(vocabulary_size)) + feature_encoder = layers.Embedding( + input_dim=len(vocabulary), output_dim=embedding_size + ) + # Convert the index values to embedding representations. + encoded_feature = feature_encoder(value_index) + else: + # Expand the dimensions of the numerical input feature and use it as-is. + encoded_feature = tf.expand_dims(inputs[feature_name], -1) + # Add the encoded feature to the list. + encoded_features.append(encoded_feature) + # Concatenate all the encoded features. + encoded_features = layers.concatenate(encoded_features, axis=1) + # Apply dropout. + encoded_features = layers.Dropout(rate=0.25)(encoded_features) + # Perform non-linearity projection. + encoded_features = layers.Dense( + units=size if size else encoded_features.shape[-1], activation="gelu" + )(encoded_features) + # Create and return a Keras model with encoded features as outputs. 
+ return keras.Model(inputs=inputs, outputs=encoded_features) + + +""" +### Build an NN model to train the embeddings +""" + + +def create_nn_model(encoder): + inputs = create_model_inputs() + embeddings = encoder(inputs) + output = layers.Dense(units=1, activation="sigmoid")(embeddings) + + nn_model = keras.Model(inputs=inputs, outputs=output) + nn_model.compile( + optimizer=keras.optimizers.Adam(), + loss=keras.losses.BinaryCrossentropy(), + metrics=[keras.metrics.BinaryAccuracy("accuracy")], + ) + return nn_model + + +embedding_encoder = create_embedding_encoder(size=64) +run_experiment( + create_nn_model(embedding_encoder), + train_data, + test_data, + num_epochs=5, + batch_size=256, +) + +""" +### Train and evaluate a Gradient Boosted Tree model with embeddings +""" + +gbt_model = create_gbt_with_preprocessor(embedding_encoder) +run_experiment(gbt_model, train_data, test_data) + +""" +## Concluding remarks + +TensorFlow Decision Forests provide powerful models, especially with structured data. +In our experiments, the Gradient Boosted Tree model achieved 95.79% test accuracy. +When using the target encoding with categorical feature, the same model achieved 95.81% test accuracy. +When pretraining embeddings to be used as inputs to the Gradient Boosted Tree model, +we achieved 95.82% test accuracy. + +Decision Forests can be used with Neural Networks, either by +1) using Neural Networks to learn useful representation of the input data, +and then using Decision Forests for the supervised learning task, or by +2) creating an ensemble of both Decision Forests and Neural Network models. + +Note that TensorFlow Decision Forests does not (yet) support hardware accelerators. +All training and inference is done on the CPU. +Besides, Decision Forests require a finite dataset that fits in memory +for their training procedures. However, there are diminishing returns +for increasing the size of the dataset, and Decision Forests algorithms +arguably need fewer examples for convergence than large Neural Network models. +""" diff --git a/knowledge_base/structured_data/collaborative_filtering_movielens.py b/knowledge_base/structured_data/collaborative_filtering_movielens.py new file mode 100644 index 0000000000000000000000000000000000000000..f5558ff9c3fa1286ae469551d803d286c3e5c88a --- /dev/null +++ b/knowledge_base/structured_data/collaborative_filtering_movielens.py @@ -0,0 +1,230 @@ +""" +Title: Collaborative Filtering for Movie Recommendations +Author: [Siddhartha Banerjee](https://twitter.com/sidd2006) +Date created: 2020/05/24 +Last modified: 2020/05/24 +Description: Recommending movies using a model trained on Movielens dataset. +Accelerator: GPU +""" + +""" +## Introduction + +This example demonstrates +[Collaborative filtering](https://en.wikipedia.org/wiki/Collaborative_filtering) +using the [Movielens dataset](https://www.kaggle.com/c/movielens-100k) +to recommend movies to users. +The MovieLens ratings dataset lists the ratings given by a set of users to a set of movies. +Our goal is to be able to predict ratings for movies a user has not yet watched. +The movies with the highest predicted ratings can then be recommended to the user. + +The steps in the model are as follows: + +1. Map user ID to a "user vector" via an embedding matrix +2. Map movie ID to a "movie vector" via an embedding matrix +3. Compute the dot product between the user vector and movie vector, to obtain +the a match score between the user and the movie (predicted rating). +4. 
Train the embeddings via gradient descent using all known user-movie pairs. + +**References:** + +- [Collaborative Filtering](https://dl.acm.org/doi/pdf/10.1145/371920.372071) +- [Neural Collaborative Filtering](https://dl.acm.org/doi/pdf/10.1145/3038912.3052569) +""" + +import pandas as pd +from pathlib import Path +import matplotlib.pyplot as plt +import numpy as np +from zipfile import ZipFile + +import keras +from keras import layers +from keras import ops + +""" +## First, load the data and apply preprocessing +""" + +# Download the actual data from http://files.grouplens.org/datasets/movielens/ml-latest-small.zip" +# Use the ratings.csv file +movielens_data_file_url = ( + "http://files.grouplens.org/datasets/movielens/ml-latest-small.zip" +) +movielens_zipped_file = keras.utils.get_file( + "ml-latest-small.zip", movielens_data_file_url, extract=False +) +keras_datasets_path = Path(movielens_zipped_file).parents[0] +movielens_dir = keras_datasets_path / "ml-latest-small" + +# Only extract the data the first time the script is run. +if not movielens_dir.exists(): + with ZipFile(movielens_zipped_file, "r") as zip: + # Extract files + print("Extracting all the files now...") + zip.extractall(path=keras_datasets_path) + print("Done!") + +ratings_file = movielens_dir / "ratings.csv" +df = pd.read_csv(ratings_file) + +""" +First, need to perform some preprocessing to encode users and movies as integer indices. +""" +user_ids = df["userId"].unique().tolist() +user2user_encoded = {x: i for i, x in enumerate(user_ids)} +userencoded2user = {i: x for i, x in enumerate(user_ids)} +movie_ids = df["movieId"].unique().tolist() +movie2movie_encoded = {x: i for i, x in enumerate(movie_ids)} +movie_encoded2movie = {i: x for i, x in enumerate(movie_ids)} +df["user"] = df["userId"].map(user2user_encoded) +df["movie"] = df["movieId"].map(movie2movie_encoded) + +num_users = len(user2user_encoded) +num_movies = len(movie_encoded2movie) +df["rating"] = df["rating"].values.astype(np.float32) +# min and max ratings will be used to normalize the ratings later +min_rating = min(df["rating"]) +max_rating = max(df["rating"]) + +print( + "Number of users: {}, Number of Movies: {}, Min rating: {}, Max rating: {}".format( + num_users, num_movies, min_rating, max_rating + ) +) + +""" +## Prepare training and validation data +""" +df = df.sample(frac=1, random_state=42) +x = df[["user", "movie"]].values +# Normalize the targets between 0 and 1. Makes it easy to train. +y = df["rating"].apply(lambda x: (x - min_rating) / (max_rating - min_rating)).values +# Assuming training on 90% of the data and validating on 10%. +train_indices = int(0.9 * df.shape[0]) +x_train, x_val, y_train, y_val = ( + x[:train_indices], + x[train_indices:], + y[:train_indices], + y[train_indices:], +) + +""" +## Create the model + +We embed both users and movies in to 50-dimensional vectors. + +The model computes a match score between user and movie embeddings via a dot product, +and adds a per-movie and per-user bias. The match score is scaled to the `[0, 1]` +interval via a sigmoid (since our ratings are normalized to this range). 
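+
+As a toy illustration of this scoring rule (purely illustrative, with made-up numbers),
+the predicted rating proxy is `sigmoid(dot(user_vector, movie_vector) + user_bias + movie_bias)`:
+
+```python
+import numpy as np
+
+# Hedged sketch with arbitrary example values, not taken from the trained model.
+user_vector = np.array([0.1, -0.2, 0.3])
+movie_vector = np.array([0.4, 0.0, -0.1])
+user_bias, movie_bias = 0.05, -0.02
+logit = user_vector @ movie_vector + user_bias + movie_bias
+score = 1.0 / (1.0 + np.exp(-logit))  # in (0, 1), like the normalized ratings
+print(score)
+```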
+""" +EMBEDDING_SIZE = 50 + + +class RecommenderNet(keras.Model): + def __init__(self, num_users, num_movies, embedding_size, **kwargs): + super().__init__(**kwargs) + self.num_users = num_users + self.num_movies = num_movies + self.embedding_size = embedding_size + self.user_embedding = layers.Embedding( + num_users, + embedding_size, + embeddings_initializer="he_normal", + embeddings_regularizer=keras.regularizers.l2(1e-6), + ) + self.user_bias = layers.Embedding(num_users, 1) + self.movie_embedding = layers.Embedding( + num_movies, + embedding_size, + embeddings_initializer="he_normal", + embeddings_regularizer=keras.regularizers.l2(1e-6), + ) + self.movie_bias = layers.Embedding(num_movies, 1) + + def call(self, inputs): + user_vector = self.user_embedding(inputs[:, 0]) + user_bias = self.user_bias(inputs[:, 0]) + movie_vector = self.movie_embedding(inputs[:, 1]) + movie_bias = self.movie_bias(inputs[:, 1]) + dot_user_movie = ops.tensordot(user_vector, movie_vector, 2) + # Add all the components (including bias) + x = dot_user_movie + user_bias + movie_bias + # The sigmoid activation forces the rating to between 0 and 1 + return ops.nn.sigmoid(x) + + +model = RecommenderNet(num_users, num_movies, EMBEDDING_SIZE) +model.compile( + loss=keras.losses.BinaryCrossentropy(), + optimizer=keras.optimizers.Adam(learning_rate=0.001), +) + +""" +## Train the model based on the data split +""" +history = model.fit( + x=x_train, + y=y_train, + batch_size=64, + epochs=5, + verbose=1, + validation_data=(x_val, y_val), +) + +""" +## Plot training and validation loss +""" +plt.plot(history.history["loss"]) +plt.plot(history.history["val_loss"]) +plt.title("model loss") +plt.ylabel("loss") +plt.xlabel("epoch") +plt.legend(["train", "test"], loc="upper left") +plt.show() + +""" +## Show top 10 movie recommendations to a user +""" + +movie_df = pd.read_csv(movielens_dir / "movies.csv") + +# Let us get a user and see the top recommendations. 
+user_id = df.userId.sample(1).iloc[0] +movies_watched_by_user = df[df.userId == user_id] +movies_not_watched = movie_df[ + ~movie_df["movieId"].isin(movies_watched_by_user.movieId.values) +]["movieId"] +movies_not_watched = list( + set(movies_not_watched).intersection(set(movie2movie_encoded.keys())) +) +movies_not_watched = [[movie2movie_encoded.get(x)] for x in movies_not_watched] +user_encoder = user2user_encoded.get(user_id) +user_movie_array = np.hstack( + ([[user_encoder]] * len(movies_not_watched), movies_not_watched) +) +ratings = model.predict(user_movie_array).flatten() +top_ratings_indices = ratings.argsort()[-10:][::-1] +recommended_movie_ids = [ + movie_encoded2movie.get(movies_not_watched[x][0]) for x in top_ratings_indices +] + +print("Showing recommendations for user: {}".format(user_id)) +print("====" * 9) +print("Movies with high ratings from user") +print("----" * 8) +top_movies_user = ( + movies_watched_by_user.sort_values(by="rating", ascending=False) + .head(5) + .movieId.values +) +movie_df_rows = movie_df[movie_df["movieId"].isin(top_movies_user)] +for row in movie_df_rows.itertuples(): + print(row.title, ":", row.genres) + +print("----" * 8) +print("Top 10 movie recommendations") +print("----" * 8) +recommended_movies = movie_df[movie_df["movieId"].isin(recommended_movie_ids)] +for row in recommended_movies.itertuples(): + print(row.title, ":", row.genres) diff --git a/knowledge_base/structured_data/customer_lifetime_value.py b/knowledge_base/structured_data/customer_lifetime_value.py new file mode 100644 index 0000000000000000000000000000000000000000..5ca97b17c402a8d221e9882debb05b7be4af683c --- /dev/null +++ b/knowledge_base/structured_data/customer_lifetime_value.py @@ -0,0 +1,561 @@ +""" +Title: Deep Learning for Customer Lifetime Value +Author: [Praveen Hosdrug](https://www.linkedin.com/in/praveenhosdrug/) +Date created: 2024/11/23 +Last modified: 2024/11/27 +Description: A hybrid deep learning architecture for predicting customer purchase patterns and lifetime value. +Accelerator: None +""" + +""" +## Introduction + +A hybrid deep learning architecture combining Transformer encoders and LSTM networks +for predicting customer purchase patterns and lifetime value using transaction history. +While many existing review articles focus on classic parametric models and traditional machine learning algorithms +,this implementation leverages recent advancements in Transformer-based models for time series prediction. +The approach handles multi-granularity prediction across different temporal scales. + +""" + +""" +## Setting up Libraries for the Deep Learning Project +""" +import subprocess + + +def install_packages(packages): + """ + Install a list of packages using pip. + + Args: + packages (list): A list of package names to install. + """ + for package in packages: + subprocess.run(["pip", "install", package], check=True) + + +""" +## List of Packages to Install + +1. uciml: For the purpose of the tutorial; we will be using + the UK Retail [Dataset](https://archive.ics.uci.edu/dataset/352/online+retail) +2. keras_hub: Access to the transformer encoder layer. 
+""" + +packages_to_install = ["ucimlrepo", "keras_hub"] + +# Install the packages +install_packages(packages_to_install) + +# Core data processing and numerical libraries +import os + +os.environ["KERAS_BACKEND"] = "jax" +import keras +import numpy as np +import pandas as pd +from typing import Dict + + +# Visualization +import matplotlib.pyplot as plt + +# Keras imports +from keras import layers +from keras import Model +from keras import ops +from keras_hub.layers import TransformerEncoder +from keras import regularizers + +# UK Retail Dataset +from ucimlrepo import fetch_ucirepo + +""" +## Preprocessing the UK Retail dataset +""" + + +def prepare_time_series_data(data): + """ + Preprocess retail transaction data for deep learning. + + Args: + data: Raw transaction data containing InvoiceDate, UnitPrice, etc. + Returns: + Processed DataFrame with calculated features + """ + processed_data = data.copy() + + # Essential datetime handling for temporal ordering + processed_data["InvoiceDate"] = pd.to_datetime(processed_data["InvoiceDate"]) + + # Basic business constraints and calculations + processed_data = processed_data[processed_data["UnitPrice"] > 0] + processed_data["Amount"] = processed_data["UnitPrice"] * processed_data["Quantity"] + processed_data["CustomerID"] = processed_data["CustomerID"].fillna(99999.0) + + # Handle outliers in Amount using statistical thresholds + q1 = processed_data["Amount"].quantile(0.25) + q3 = processed_data["Amount"].quantile(0.75) + + # Define bounds - using 1.5 IQR rule + lower_bound = q1 - 1.5 * (q3 - q1) + upper_bound = q3 + 1.5 * (q3 - q1) + + # Filter outliers + processed_data = processed_data[ + (processed_data["Amount"] >= lower_bound) + & (processed_data["Amount"] <= upper_bound) + ] + + return processed_data + + +# Load Data + +online_retail = fetch_ucirepo(id=352) +raw_data = online_retail.data.features +transformed_data = prepare_time_series_data(raw_data) + + +def prepare_data_for_modeling( + df: pd.DataFrame, + input_sequence_length: int = 6, + output_sequence_length: int = 6, +) -> Dict: + """ + Transform retail data into sequence-to-sequence format with separate + temporal and trend components. 
+ """ + df = df.copy() + + # Daily aggregation + daily_purchases = ( + df.groupby(["CustomerID", pd.Grouper(key="InvoiceDate", freq="D")]) + .agg({"Amount": "sum", "Quantity": "sum", "Country": "first"}) + .reset_index() + ) + + daily_purchases["frequency"] = np.where(daily_purchases["Amount"] > 0, 1, 0) + + # Monthly resampling + monthly_purchases = ( + daily_purchases.set_index("InvoiceDate") + .groupby("CustomerID") + .resample("M") + .agg( + {"Amount": "sum", "Quantity": "sum", "frequency": "sum", "Country": "first"} + ) + .reset_index() + ) + + # Add cyclical temporal features + def prepare_temporal_features(input_window: pd.DataFrame) -> np.ndarray: + + month = input_window["InvoiceDate"].dt.month + month_sin = np.sin(2 * np.pi * month / 12) + month_cos = np.cos(2 * np.pi * month / 12) + is_quarter_start = (month % 3 == 1).astype(int) + + temporal_features = np.column_stack( + [ + month, + input_window["InvoiceDate"].dt.year, + month_sin, + month_cos, + is_quarter_start, + ] + ) + return temporal_features + + # Prepare trend features with lagged values + def prepare_trend_features(input_window: pd.DataFrame, lag: int = 3) -> np.ndarray: + + lagged_data = pd.DataFrame() + for i in range(1, lag + 1): + lagged_data[f"Amount_lag_{i}"] = input_window["Amount"].shift(i) + lagged_data[f"Quantity_lag_{i}"] = input_window["Quantity"].shift(i) + lagged_data[f"frequency_lag_{i}"] = input_window["frequency"].shift(i) + + lagged_data = lagged_data.fillna(0) + + trend_features = np.column_stack( + [ + input_window["Amount"].values, + input_window["Quantity"].values, + input_window["frequency"].values, + lagged_data.values, + ] + ) + return trend_features + + sequence_containers = { + "temporal_sequences": [], + "trend_sequences": [], + "static_features": [], + "output_sequences": [], + } + + # Process sequences for each customer + for customer_id, customer_data in monthly_purchases.groupby("CustomerID"): + customer_data = customer_data.sort_values("InvoiceDate") + sequence_ranges = ( + len(customer_data) - input_sequence_length - output_sequence_length + 1 + ) + + country = customer_data["Country"].iloc[0] + + for i in range(sequence_ranges): + input_window = customer_data.iloc[i : i + input_sequence_length] + output_window = customer_data.iloc[ + i + + input_sequence_length : i + + input_sequence_length + + output_sequence_length + ] + + if ( + len(input_window) == input_sequence_length + and len(output_window) == output_sequence_length + ): + temporal_features = prepare_temporal_features(input_window) + trend_features = prepare_trend_features(input_window) + + sequence_containers["temporal_sequences"].append(temporal_features) + sequence_containers["trend_sequences"].append(trend_features) + sequence_containers["static_features"].append(country) + sequence_containers["output_sequences"].append( + output_window["Amount"].values + ) + + return { + "temporal_sequences": ( + np.array(sequence_containers["temporal_sequences"], dtype=np.float32) + ), + "trend_sequences": ( + np.array(sequence_containers["trend_sequences"], dtype=np.float32) + ), + "static_features": np.array(sequence_containers["static_features"]), + "output_sequences": ( + np.array(sequence_containers["output_sequences"], dtype=np.float32) + ), + } + + +# Transform data with input and output sequences into a Output dictionary +output = prepare_data_for_modeling( + df=transformed_data, input_sequence_length=6, output_sequence_length=6 +) + +""" +## Scaling and Splitting +""" + + +def robust_scale(data): + """ + Min-Max scaling 
function since standard deviation is high. + """ + data = np.array(data) + data_min = np.min(data) + data_max = np.max(data) + scaled = (data - data_min) / (data_max - data_min) + return scaled + + +def create_temporal_splits_with_scaling( + prepared_data: Dict[str, np.ndarray], + test_ratio: float = 0.2, + val_ratio: float = 0.2, +): + total_sequences = len(prepared_data["trend_sequences"]) + # Calculate split points + test_size = int(total_sequences * test_ratio) + val_size = int(total_sequences * val_ratio) + train_size = total_sequences - (test_size + val_size) + + # Scale trend sequences + trend_shape = prepared_data["trend_sequences"].shape + scaled_trends = np.zeros_like(prepared_data["trend_sequences"]) + + # Scale each feature independently + for i in range(trend_shape[-1]): + scaled_trends[..., i] = robust_scale(prepared_data["trend_sequences"][..., i]) + # Scale output sequences + scaled_outputs = robust_scale(prepared_data["output_sequences"]) + + # Create splits + train_data = { + "trend_sequences": scaled_trends[:train_size], + "temporal_sequences": prepared_data["temporal_sequences"][:train_size], + "static_features": prepared_data["static_features"][:train_size], + "output_sequences": scaled_outputs[:train_size], + } + + val_data = { + "trend_sequences": scaled_trends[train_size : train_size + val_size], + "temporal_sequences": prepared_data["temporal_sequences"][ + train_size : train_size + val_size + ], + "static_features": prepared_data["static_features"][ + train_size : train_size + val_size + ], + "output_sequences": scaled_outputs[train_size : train_size + val_size], + } + + test_data = { + "trend_sequences": scaled_trends[train_size + val_size :], + "temporal_sequences": prepared_data["temporal_sequences"][ + train_size + val_size : + ], + "static_features": prepared_data["static_features"][train_size + val_size :], + "output_sequences": scaled_outputs[train_size + val_size :], + } + + return train_data, val_data, test_data + + +# Usage +train_data, val_data, test_data = create_temporal_splits_with_scaling(output) + +""" +## Evaluation +""" + + +def calculate_metrics(y_true, y_pred): + """ + Calculates RMSE, MAE and Rยฒ + """ + # Convert inputs to "float32" + y_true = ops.cast(y_true, dtype="float32") + y_pred = ops.cast(y_pred, dtype="float32") + + # RMSE + rmse = np.sqrt(np.mean(np.square(y_true - y_pred))) + + # Rยฒ (coefficient of determination) + ss_res = np.sum(np.square(y_true - y_pred)) + ss_tot = np.sum(np.square(y_true - np.mean(y_true))) + r2 = 1 - (ss_res / ss_tot) + + return {"mae": np.mean(np.abs(y_true - y_pred)), "rmse": rmse, "r2": r2} + + +def plot_lorenz_analysis(y_true, y_pred): + """ + Plots Lorenz curves to show distribution of high and low value users + """ + # Convert to numpy arrays and flatten + y_true = np.array(y_true).flatten() + y_pred = np.array(y_pred).flatten() + + # Sort values in descending order (for high-value users analysis) + true_sorted = np.sort(-y_true) + pred_sorted = np.sort(-y_pred) + + # Calculate cumulative sums + true_cumsum = np.cumsum(true_sorted) + pred_cumsum = np.cumsum(pred_sorted) + + # Normalize to percentages + true_cumsum_pct = true_cumsum / true_cumsum[-1] + pred_cumsum_pct = pred_cumsum / pred_cumsum[-1] + + # Generate percentiles for x-axis + percentiles = np.linspace(0, 1, len(y_true)) + + # Calculate Mutual Gini (area between curves) + mutual_gini = np.abs( + np.trapz(true_cumsum_pct, percentiles) - np.trapz(pred_cumsum_pct, percentiles) + ) + + # Create plot + plt.figure(figsize=(10, 6)) + 
plt.plot(percentiles, true_cumsum_pct, "g-", label="True Values") + plt.plot(percentiles, pred_cumsum_pct, "r-", label="Predicted Values") + plt.xlabel("Cumulative % of Users (Descending Order)") + plt.ylabel("Cumulative % of LTV") + plt.title("Lorenz Curves: True vs Predicted Values") + plt.legend() + plt.grid(True) + print(f"\nMutual Gini: {mutual_gini:.4f} (lower is better)") + plt.show() + + return mutual_gini + + +""" +## Hybrid Transformer / LSTM model architecture + +The hybrid nature of this model is particularly significant because it combines RNN's +ability to handle sequential data with Transformer's attention mechanisms for capturing +global patterns across countries and seasonality. +""" + + +def build_hybrid_model( + input_sequence_length: int, + output_sequence_length: int, + num_countries: int, + d_model: int = 8, + num_heads: int = 4, +): + + keras.utils.set_random_seed(seed=42) + + # Inputs + temporal_inputs = layers.Input( + shape=(input_sequence_length, 5), name="temporal_inputs" + ) + trend_inputs = layers.Input(shape=(input_sequence_length, 12), name="trend_inputs") + country_inputs = layers.Input( + shape=(num_countries,), dtype="int32", name="country_inputs" + ) + + # Process country features + country_embedding = layers.Embedding( + input_dim=num_countries, + output_dim=d_model, + mask_zero=False, + name="country_embedding", + )( + country_inputs + ) # Output shape: (batch_size, 1, d_model) + + # Flatten the embedding output + country_embedding = layers.Flatten(name="flatten_country_embedding")( + country_embedding + ) + + # Repeat the country embedding across timesteps + country_embedding_repeated = layers.RepeatVector( + input_sequence_length, name="repeat_country_embedding" + )(country_embedding) + + # Projection of temporal inputs to match Transformer dimensions + temporal_projection = layers.Dense( + d_model, activation="tanh", name="temporal_projection" + )(temporal_inputs) + + # Combine all features + combined_features = layers.Concatenate()( + [temporal_projection, country_embedding_repeated] + ) + + transformer_output = combined_features + for _ in range(3): + transformer_output = TransformerEncoder( + intermediate_dim=16, num_heads=num_heads + )(transformer_output) + + lstm_output = layers.LSTM(units=64, name="lstm_trend")(trend_inputs) + + transformer_flattened = layers.GlobalAveragePooling1D(name="flatten_transformer")( + transformer_output + ) + transformer_flattened = layers.Dense(1, activation="sigmoid")(transformer_flattened) + # Concatenate flattened Transformer output with LSTM output + merged_features = layers.Concatenate(name="concatenate_transformer_lstm")( + [transformer_flattened, lstm_output] + ) + # Repeat the merged features to match the output sequence length + decoder_initial = layers.RepeatVector( + output_sequence_length, name="repeat_merged_features" + )(merged_features) + + decoder_lstm = layers.LSTM( + units=64, + return_sequences=True, + recurrent_regularizer=regularizers.L1L2(l1=1e-5, l2=1e-4), + )(decoder_initial) + + # Output Dense layer + output = layers.Dense(units=1, activation="linear", name="output_dense")( + decoder_lstm + ) + + model = Model( + inputs=[temporal_inputs, trend_inputs, country_inputs], outputs=output + ) + + model.compile( + optimizer=keras.optimizers.Adam(learning_rate=0.001), + loss="mse", + metrics=["mse"], + ) + + return model + + +# Create the hybrid model +model = build_hybrid_model( + input_sequence_length=6, + output_sequence_length=6, + num_countries=len(np.unique(train_data["static_features"])) + 
1, + d_model=8, + num_heads=4, +) + +# Configure StringLookup +label_encoder = layers.StringLookup(output_mode="one_hot", num_oov_indices=1) + +# Adapt and encode +label_encoder.adapt(train_data["static_features"]) + +train_static_encoded = label_encoder(train_data["static_features"]) +val_static_encoded = label_encoder(val_data["static_features"]) +test_static_encoded = label_encoder(test_data["static_features"]) + +# Convert sequences with proper type casting +x_train_seq = np.asarray(train_data["trend_sequences"]).astype(np.float32) +x_val_seq = np.asarray(val_data["trend_sequences"]).astype(np.float32) +x_train_temporal = np.asarray(train_data["temporal_sequences"]).astype(np.float32) +x_val_temporal = np.asarray(val_data["temporal_sequences"]).astype(np.float32) +train_outputs = np.asarray(train_data["output_sequences"]).astype(np.float32) +val_outputs = np.asarray(val_data["output_sequences"]).astype(np.float32) +test_output = np.asarray(test_data["output_sequences"]).astype(np.float32) +# Training setup +keras.utils.set_random_seed(seed=42) + +history = model.fit( + [x_train_temporal, x_train_seq, train_static_encoded], + train_outputs, + validation_data=( + [x_val_temporal, x_val_seq, val_static_encoded], + val_data["output_sequences"].astype(np.float32), + ), + epochs=20, + batch_size=30, +) + +# Make predictions +predictions = model.predict( + [ + test_data["temporal_sequences"].astype(np.float32), + test_data["trend_sequences"].astype(np.float32), + test_static_encoded, + ] +) + +# Calculate the predictions +predictions = np.squeeze(predictions) + +# Calculate basic metrics +hybrid_metrics = calculate_metrics(test_data["output_sequences"], predictions) + +# Plot Lorenz curves and get Mutual Gini +hybrid_mutual_gini = plot_lorenz_analysis(test_data["output_sequences"], predictions) + +""" +## Conclusion + +While LSTMs excel at sequence to sequence learning as demonstrated through the work of Sutskever, I., Vinyals, +O., & Le, Q. V. (2014) Sequence to sequence learning with neural networks. +The hybrid approach here enhances this foundation. The addition of attention mechanisms allows the model to adaptively +focus on relevant temporal/geographical patterns while maintaining the LSTM's inherent strengths in sequence learning. +This combination has proven especially effective for handling both periodic patterns and special events in time +series forecasting from Zhou, H., Zhang, S., Peng, J., Zhang, S., Li, J., Xiong, H., & Zhang, W. (2021). +Informer: Beyond Efficient Transformer for Long Sequence Time-Series Forecasting. +""" diff --git a/knowledge_base/structured_data/deep_neural_decision_forests.py b/knowledge_base/structured_data/deep_neural_decision_forests.py new file mode 100644 index 0000000000000000000000000000000000000000..4a3e38fc06e67b3d2f6cda0eef36cdc344e7149f --- /dev/null +++ b/knowledge_base/structured_data/deep_neural_decision_forests.py @@ -0,0 +1,460 @@ +""" +Title: Classification with Neural Decision Forests +Author: [Khalid Salama](https://www.linkedin.com/in/khalid-salama-24403144/) +Date created: 2021/01/15 +Last modified: 2021/01/15 +Description: How to train differentiable decision trees for end-to-end learning in deep neural networks. +Accelerator: GPU +""" + +""" +## Introduction + +This example provides an implementation of the +[Deep Neural Decision Forest](https://ieeexplore.ieee.org/document/7410529) +model introduced by P. Kontschieder et al. for structured data classification. 
+It demonstrates how to build a stochastic and differentiable decision tree model, +train it end-to-end, and unify decision trees with deep representation learning. + +## The dataset + +This example uses the +[United States Census Income Dataset](https://archive.ics.uci.edu/ml/datasets/census+income) +provided by the +[UC Irvine Machine Learning Repository](https://archive.ics.uci.edu/ml/index.php). +The task is binary classification +to predict whether a person is likely to be making over USD 50,000 a year. + +The dataset includes 48,842 instances with 14 input features (such as age, work class, education, occupation, and so on): 5 numerical features +and 9 categorical features. +""" + +""" +## Setup +""" + +import keras +from keras import layers +from keras.layers import StringLookup +from keras import ops + + +from tensorflow import data as tf_data +import numpy as np +import pandas as pd + +import math + + +""" +## Prepare the data +""" + +CSV_HEADER = [ + "age", + "workclass", + "fnlwgt", + "education", + "education_num", + "marital_status", + "occupation", + "relationship", + "race", + "gender", + "capital_gain", + "capital_loss", + "hours_per_week", + "native_country", + "income_bracket", +] + +train_data_url = ( + "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data" +) +train_data = pd.read_csv(train_data_url, header=None, names=CSV_HEADER) + +test_data_url = ( + "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test" +) +test_data = pd.read_csv(test_data_url, header=None, names=CSV_HEADER) + +print(f"Train dataset shape: {train_data.shape}") +print(f"Test dataset shape: {test_data.shape}") + +""" +Remove the first record (because it is not a valid data example) and a trailing +'dot' in the class labels. +""" + +test_data = test_data[1:] +test_data.income_bracket = test_data.income_bracket.apply( + lambda value: value.replace(".", "") +) + +""" +We store the training and test data splits locally as CSV files. +""" + +train_data_file = "train_data.csv" +test_data_file = "test_data.csv" + +train_data.to_csv(train_data_file, index=False, header=False) +test_data.to_csv(test_data_file, index=False, header=False) + +""" +## Define dataset metadata + +Here, we define the metadata of the dataset that will be useful for reading and parsing +and encoding input features. +""" + +# A list of the numerical feature names. +NUMERIC_FEATURE_NAMES = [ + "age", + "education_num", + "capital_gain", + "capital_loss", + "hours_per_week", +] +# A dictionary of the categorical features and their vocabulary. +CATEGORICAL_FEATURES_WITH_VOCABULARY = { + "workclass": sorted(list(train_data["workclass"].unique())), + "education": sorted(list(train_data["education"].unique())), + "marital_status": sorted(list(train_data["marital_status"].unique())), + "occupation": sorted(list(train_data["occupation"].unique())), + "relationship": sorted(list(train_data["relationship"].unique())), + "race": sorted(list(train_data["race"].unique())), + "gender": sorted(list(train_data["gender"].unique())), + "native_country": sorted(list(train_data["native_country"].unique())), +} +# A list of the columns to ignore from the dataset. +IGNORE_COLUMN_NAMES = ["fnlwgt"] +# A list of the categorical feature names. +CATEGORICAL_FEATURE_NAMES = list(CATEGORICAL_FEATURES_WITH_VOCABULARY.keys()) +# A list of all the input features. +FEATURE_NAMES = NUMERIC_FEATURE_NAMES + CATEGORICAL_FEATURE_NAMES +# A list of column default values for each feature. 
+COLUMN_DEFAULTS = [ + [0.0] if feature_name in NUMERIC_FEATURE_NAMES + IGNORE_COLUMN_NAMES else ["NA"] + for feature_name in CSV_HEADER +] +# The name of the target feature. +TARGET_FEATURE_NAME = "income_bracket" +# A list of the labels of the target features. +TARGET_LABELS = [" <=50K", " >50K"] + +""" +## Create `tf_data.Dataset` objects for training and validation + +We create an input function to read and parse the file, and convert features and labels +into a [`tf_data.Dataset`](https://www.tensorflow.org/guide/datasets) +for training and validation. We also preprocess the input by mapping the target label +to an index. +""" + + +target_label_lookup = StringLookup( + vocabulary=TARGET_LABELS, mask_token=None, num_oov_indices=0 +) + + +lookup_dict = {} +for feature_name in CATEGORICAL_FEATURE_NAMES: + vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name] + # Create a lookup to convert a string values to an integer indices. + # Since we are not using a mask token, nor expecting any out of vocabulary + # (oov) token, we set mask_token to None and num_oov_indices to 0. + lookup = StringLookup(vocabulary=vocabulary, mask_token=None, num_oov_indices=0) + lookup_dict[feature_name] = lookup + + +def encode_categorical(batch_x, batch_y): + for feature_name in CATEGORICAL_FEATURE_NAMES: + batch_x[feature_name] = lookup_dict[feature_name](batch_x[feature_name]) + + return batch_x, batch_y + + +def get_dataset_from_csv(csv_file_path, shuffle=False, batch_size=128): + dataset = ( + tf_data.experimental.make_csv_dataset( + csv_file_path, + batch_size=batch_size, + column_names=CSV_HEADER, + column_defaults=COLUMN_DEFAULTS, + label_name=TARGET_FEATURE_NAME, + num_epochs=1, + header=False, + na_value="?", + shuffle=shuffle, + ) + .map(lambda features, target: (features, target_label_lookup(target))) + .map(encode_categorical) + ) + + return dataset.cache() + + +""" +## Create model inputs +""" + + +def create_model_inputs(): + inputs = {} + for feature_name in FEATURE_NAMES: + if feature_name in NUMERIC_FEATURE_NAMES: + inputs[feature_name] = layers.Input( + name=feature_name, shape=(), dtype="float32" + ) + else: + inputs[feature_name] = layers.Input( + name=feature_name, shape=(), dtype="int32" + ) + return inputs + + +""" +## Encode input features +""" + + +def encode_inputs(inputs): + encoded_features = [] + for feature_name in inputs: + if feature_name in CATEGORICAL_FEATURE_NAMES: + vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name] + # Create a lookup to convert a string values to an integer indices. + # Since we are not using a mask token, nor expecting any out of vocabulary + # (oov) token, we set mask_token to None and num_oov_indices to 0. + value_index = inputs[feature_name] + embedding_dims = int(math.sqrt(lookup.vocabulary_size())) + # Create an embedding layer with the specified dimensions. + embedding = layers.Embedding( + input_dim=lookup.vocabulary_size(), output_dim=embedding_dims + ) + # Convert the index values to embedding representations. + encoded_feature = embedding(value_index) + else: + # Use the numerical features as-is. + encoded_feature = inputs[feature_name] + if inputs[feature_name].shape[-1] is None: + encoded_feature = keras.ops.expand_dims(encoded_feature, -1) + + encoded_features.append(encoded_feature) + + encoded_features = layers.concatenate(encoded_features) + return encoded_features + + +""" +## Deep Neural Decision Tree + +A neural decision tree model has two sets of weights to learn. 
The first set is `pi`, +which represents the probability distribution of the classes in the tree leaves. +The second set is the weights of the routing layer `decision_fn`, which represents the probability +of going to each leave. The forward pass of the model works as follows: + +1. The model expects input `features` as a single vector encoding all the features of an instance +in the batch. This vector can be generated from a Convolution Neural Network (CNN) applied to images +or dense transformations applied to structured data features. +2. The model first applies a `used_features_mask` to randomly select a subset of input features to use. +3. Then, the model computes the probabilities (`mu`) for the input instances to reach the tree leaves +by iteratively performing a *stochastic* routing throughout the tree levels. +4. Finally, the probabilities of reaching the leaves are combined by the class probabilities at the +leaves to produce the final `outputs`. +""" + + +class NeuralDecisionTree(keras.Model): + def __init__(self, depth, num_features, used_features_rate, num_classes): + super().__init__() + self.depth = depth + self.num_leaves = 2**depth + self.num_classes = num_classes + + # Create a mask for the randomly selected features. + num_used_features = int(num_features * used_features_rate) + one_hot = np.eye(num_features) + sampled_feature_indices = np.random.choice( + np.arange(num_features), num_used_features, replace=False + ) + self.used_features_mask = ops.convert_to_tensor( + one_hot[sampled_feature_indices], dtype="float32" + ) + + # Initialize the weights of the classes in leaves. + self.pi = self.add_weight( + initializer="random_normal", + shape=[self.num_leaves, self.num_classes], + dtype="float32", + trainable=True, + ) + + # Initialize the stochastic routing layer. + self.decision_fn = layers.Dense( + units=self.num_leaves, activation="sigmoid", name="decision" + ) + + def call(self, features): + batch_size = ops.shape(features)[0] + + # Apply the feature mask to the input features. + features = ops.matmul( + features, ops.transpose(self.used_features_mask) + ) # [batch_size, num_used_features] + # Compute the routing probabilities. + decisions = ops.expand_dims( + self.decision_fn(features), axis=2 + ) # [batch_size, num_leaves, 1] + # Concatenate the routing probabilities with their complements. + decisions = layers.concatenate( + [decisions, 1 - decisions], axis=2 + ) # [batch_size, num_leaves, 2] + + mu = ops.ones([batch_size, 1, 1]) + + begin_idx = 1 + end_idx = 2 + # Traverse the tree in breadth-first order. + for level in range(self.depth): + mu = ops.reshape(mu, [batch_size, -1, 1]) # [batch_size, 2 ** level, 1] + mu = ops.tile(mu, (1, 1, 2)) # [batch_size, 2 ** level, 2] + level_decisions = decisions[ + :, begin_idx:end_idx, : + ] # [batch_size, 2 ** level, 2] + mu = mu * level_decisions # [batch_size, 2**level, 2] + begin_idx = end_idx + end_idx = begin_idx + 2 ** (level + 1) + + mu = ops.reshape(mu, [batch_size, self.num_leaves]) # [batch_size, num_leaves] + probabilities = keras.activations.softmax(self.pi) # [num_leaves, num_classes] + outputs = ops.matmul(mu, probabilities) # [batch_size, num_classes] + return outputs + + +""" +## Deep Neural Decision Forest + +The neural decision forest model consists of a set of neural decision trees that are +trained simultaneously. The output of the forest model is the average outputs of its trees. 
+""" + + +class NeuralDecisionForest(keras.Model): + def __init__(self, num_trees, depth, num_features, used_features_rate, num_classes): + super().__init__() + self.ensemble = [] + # Initialize the ensemble by adding NeuralDecisionTree instances. + # Each tree will have its own randomly selected input features to use. + for _ in range(num_trees): + self.ensemble.append( + NeuralDecisionTree(depth, num_features, used_features_rate, num_classes) + ) + + def call(self, inputs): + # Initialize the outputs: a [batch_size, num_classes] matrix of zeros. + batch_size = ops.shape(inputs)[0] + outputs = ops.zeros([batch_size, num_classes]) + + # Aggregate the outputs of trees in the ensemble. + for tree in self.ensemble: + outputs += tree(inputs) + # Divide the outputs by the ensemble size to get the average. + outputs /= len(self.ensemble) + return outputs + + +""" +Finally, let's set up the code that will train and evaluate the model. +""" + +learning_rate = 0.01 +batch_size = 265 +num_epochs = 10 + + +def run_experiment(model): + model.compile( + optimizer=keras.optimizers.Adam(learning_rate=learning_rate), + loss=keras.losses.SparseCategoricalCrossentropy(), + metrics=[keras.metrics.SparseCategoricalAccuracy()], + ) + + print("Start training the model...") + train_dataset = get_dataset_from_csv( + train_data_file, shuffle=True, batch_size=batch_size + ) + + model.fit(train_dataset, epochs=num_epochs) + print("Model training finished") + + print("Evaluating the model on the test data...") + test_dataset = get_dataset_from_csv(test_data_file, batch_size=batch_size) + + _, accuracy = model.evaluate(test_dataset) + print(f"Test accuracy: {round(accuracy * 100, 2)}%") + + +""" +## Experiment 1: train a decision tree model + +In this experiment, we train a single neural decision tree model +where we use all input features. +""" + +num_trees = 10 +depth = 10 +used_features_rate = 1.0 +num_classes = len(TARGET_LABELS) + + +def create_tree_model(): + inputs = create_model_inputs() + features = encode_inputs(inputs) + features = layers.BatchNormalization()(features) + num_features = features.shape[1] + + tree = NeuralDecisionTree(depth, num_features, used_features_rate, num_classes) + + outputs = tree(features) + model = keras.Model(inputs=inputs, outputs=outputs) + return model + + +tree_model = create_tree_model() +run_experiment(tree_model) + + +""" +## Experiment 2: train a forest model + +In this experiment, we train a neural decision forest with `num_trees` trees +where each tree uses randomly selected 50% of the input features. You can control the number +of features to be used in each tree by setting the `used_features_rate` variable. +In addition, we set the depth to 5 instead of 10 compared to the previous experiment. 
+""" + +num_trees = 25 +depth = 5 +used_features_rate = 0.5 + + +def create_forest_model(): + inputs = create_model_inputs() + features = encode_inputs(inputs) + features = layers.BatchNormalization()(features) + num_features = features.shape[1] + + forest_model = NeuralDecisionForest( + num_trees, depth, num_features, used_features_rate, num_classes + ) + + outputs = forest_model(features) + model = keras.Model(inputs=inputs, outputs=outputs) + return model + + +forest_model = create_forest_model() + +run_experiment(forest_model) diff --git a/knowledge_base/structured_data/feature_space_advanced.py b/knowledge_base/structured_data/feature_space_advanced.py new file mode 100644 index 0000000000000000000000000000000000000000..12826728c5dddf4d8781672586d462b0af651e3e --- /dev/null +++ b/knowledge_base/structured_data/feature_space_advanced.py @@ -0,0 +1,608 @@ +""" +Title: FeatureSpace advanced use cases +Author: [Dimitre Oliveira](https://www.linkedin.com/in/dimitre-oliveira-7a1a0113a/) +Date created: 2023/07/01 +Last modified: 2025/01/03 +Description: How to use FeatureSpace for advanced preprocessing use cases. +Accelerator: None +""" + +""" +## Introduction + +This example is an extension of the +[Structured data classification with FeatureSpace](https://keras.io/examples/structured_data/structured_data_classification_with_feature_space/) +code example, and here we will extend it to cover more complex use +cases of the [`keras.utils.FeatureSpace`](https://keras.io/api/utils/feature_space/) +preprocessing utility, like feature hashing, feature crosses, handling missing values and +integrating [Keras preprocessing layers](https://keras.io/api/layers/preprocessing_layers/) +with FeatureSpace. + +The general task still is structured data classification (also known as tabular data +classification) using a data that includes numerical features, integer categorical +features, and string categorical features. +""" + +""" +### The dataset + +[Our dataset](https://archive.ics.uci.edu/dataset/222/bank+marketing) is provided by a +Portuguese banking institution. +It's a CSV file with 4119 rows. Each row contains information about marketing campaigns +based on phone calls, and each column describes an attribute of the client. We use the +features to predict whether the client subscribed ('yes') or not ('no') to the product +(bank term deposit). + +Here's the description of each feature: + +Column| Description| Feature Type +------|------------|------------- +Age | Age of the client | Numerical +Job | Type of job | Categorical +Marital | Marital status | Categorical +Education | Education level of the client | Categorical +Default | Has credit in default? | Categorical +Housing | Has housing loan? | Categorical +Loan | Has personal loan? 
| Categorical +Contact | Contact communication type | Categorical +Month | Last contact month of year | Categorical +Day_of_week | Last contact day of the week | Categorical +Duration | Last contact duration, in seconds | Numerical +Campaign | Number of contacts performed during this campaign and for this client | Numerical +Pdays | Number of days that passed by after the client was last contacted from a previous campaign | Numerical +Previous | Number of contacts performed before this campaign and for this client | Numerical +Poutcome | Outcome of the previous marketing campaign | Categorical +Emp.var.rate | Employment variation rate | Numerical +Cons.price.idx | Consumer price index | Numerical +Cons.conf.idx | Consumer confidence index | Numerical +Euribor3m | Euribor 3 month rate | Numerical +Nr.employed | Number of employees | Numerical +Y | Has the client subscribed a term deposit? | Target + +**Important note regarding the feature `duration`**: this attribute highly affects the +output target (e.g., if duration=0 then y='no'). Yet, the duration is not known before a +call is performed. Also, after the end of the call y is obviously known. Thus, this input +should only be included for benchmark purposes and should be discarded if the intention +is to have a realistic predictive model. For this reason we will drop it. + +""" + +""" +## Setup +""" + +import os + +os.environ["KERAS_BACKEND"] = "tensorflow" + +import keras +from keras.utils import FeatureSpace +import pandas as pd +import tensorflow as tf +from pathlib import Path +from zipfile import ZipFile + +""" +## Load the data + +Let's download the data and load it into a Pandas dataframe: +""" + +data_url = "https://archive.ics.uci.edu/static/public/222/bank+marketing.zip" +data_zipped_path = keras.utils.get_file("bank_marketing.zip", data_url, extract=True) +keras_datasets_path = Path(data_zipped_path) +with ZipFile(f"{keras_datasets_path}/bank-additional.zip", "r") as zip: + # Extract files + zip.extractall(path=keras_datasets_path) + +dataframe = pd.read_csv( + f"{keras_datasets_path}/bank-additional/bank-additional.csv", sep=";" +) + +""" +We will create a new feature `previously_contacted` to be able to demonstrate some useful +preprocessing techniques, this feature is based on `pdays`. According to the dataset +information if `pdays = 999` it means that the client was not previously contacted, so +let's create a feature to capture that. +""" + +# Droping `duration` to avoid target leak +dataframe.drop("duration", axis=1, inplace=True) +# Creating the new feature `previously_contacted` +dataframe["previously_contacted"] = dataframe["pdays"].map( + lambda x: 0 if x == 999 else 1 +) + +""" +The dataset includes 4119 samples with 21 columns per sample (20 features, plus the +target label), here's a preview of a few samples: +""" + +print(f"Dataframe shape: {dataframe.shape}") +print(dataframe.head()) + +""" +The column, "y", indicates whether the client has subscribed a term deposit or not. 
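+
+For a quick look at the class balance of the target, one could inspect the column
+directly (illustrative only, not required for the rest of the example):
+
+```python
+# Hedged sketch: count how many clients subscribed ("yes") vs. did not ("no").
+print(dataframe["y"].value_counts())
+```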
+""" + +""" +## Train/validation split + +Let's split the data into a training and validation set: +""" + +valid_dataframe = dataframe.sample(frac=0.2, random_state=0) +train_dataframe = dataframe.drop(valid_dataframe.index) + +print( + f"Using {len(train_dataframe)} samples for training and " + f"{len(valid_dataframe)} for validation" +) + +""" +## Generating TF datasets + +Let's generate +[`tf.data.Dataset`](https://www.tensorflow.org/api_docs/python/tf/data/Dataset) objects +for each dataframe, since our target column `y` is a string we also need to encode it as +an integer to be able to train our model with it. To achieve this we will create a +`StringLookup` layer that will map the strings "no" and "yes" into "0" and "1" +respectively. +""" + +label_lookup = keras.layers.StringLookup( + # the order here is important since the first index will be encoded as 0 + vocabulary=["no", "yes"], + num_oov_indices=0, +) + + +def encode_label(x, y): + encoded_y = label_lookup(y) + return x, encoded_y + + +def dataframe_to_dataset(dataframe): + dataframe = dataframe.copy() + labels = dataframe.pop("y") + ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels)) + ds = ds.map(encode_label, num_parallel_calls=tf.data.AUTOTUNE) + ds = ds.shuffle(buffer_size=len(dataframe)) + return ds + + +train_ds = dataframe_to_dataset(train_dataframe) +valid_ds = dataframe_to_dataset(valid_dataframe) + +""" +Each `Dataset` yields a tuple `(input, target)` where `input` is a dictionary of features +and `target` is the value `0` or `1`: +""" + +for x, y in dataframe_to_dataset(train_dataframe).take(1): + print(f"Input: {x}") + print(f"Target: {y}") + +""" +## Preprocessing + +Usually our data is not on the proper or best format for modeling, this is why most of +the time we need to do some kind of preprocessing on the features to make them compatible +with the model or to extract the most of them for the task. We need to do this +preprocessing step for training but but at inference we also need to make sure that the +data goes through the same process, this where a utility like `FeatureSpace` shines, we +can define all the preprocessing once and re-use it at different stages of our system. + +Here we will see how to use `FeatureSpace` to perform more complex transformations and +its flexibility, then combine everything together into a single component to preprocess +data for our model. 
+""" + +""" +The `FeatureSpace` utility learns how to process the data by using the `adapt()` function +to learn from it, this requires a dataset containing only feature, so let's create it +together with a utility function to show the preprocessing example in practice: +""" + +train_ds_with_no_labels = train_ds.map(lambda x, _: x) + + +def example_feature_space(dataset, feature_space, feature_names): + feature_space.adapt(dataset) + for x in dataset.take(1): + inputs = {feature_name: x[feature_name] for feature_name in feature_names} + preprocessed_x = feature_space(inputs) + print(f"Input: {[{k:v.numpy()} for k, v in inputs.items()]}") + print( + f"Preprocessed output: {[{k:v.numpy()} for k, v in preprocessed_x.items()]}" + ) + + +""" +### Feature hashing +""" + +""" +**Feature hashing** means hashing or encoding a set of values into a defined number of +bins, in this case we have `campaign` (number of contacts performed during this campaign +and for a client) which is a numerical feature that can assume a varying range of values +and we will hash it into 4 bins, this means that any possible value of the original +feature will be placed into one of those possible 4 bins. The output here can be a +one-hot encoded vector or a single number. +""" + +feature_space = FeatureSpace( + features={ + "campaign": FeatureSpace.integer_hashed(num_bins=4, output_mode="one_hot") + }, + output_mode="dict", +) +example_feature_space(train_ds_with_no_labels, feature_space, ["campaign"]) + +""" +**Feature hashing** can also be used for string features. +""" + +feature_space = FeatureSpace( + features={ + "education": FeatureSpace.string_hashed(num_bins=3, output_mode="one_hot") + }, + output_mode="dict", +) +example_feature_space(train_ds_with_no_labels, feature_space, ["education"]) + +""" +For numerical features we can get a similar behavior by using the `float_discretized` +option, the main difference between this and `integer_hashed` is that with the former we +bin the values while keeping some numerical relationship (close values will likely be +placed at the same bin) while the later (hashing) we cannot guarantee that those numbers +will be hashed into the same bin, it depends on the hashing function. +""" + +feature_space = FeatureSpace( + features={"age": FeatureSpace.float_discretized(num_bins=3, output_mode="one_hot")}, + output_mode="dict", +) +example_feature_space(train_ds_with_no_labels, feature_space, ["age"]) + +""" +### Feature indexing +""" + +""" +**Indexing** a string feature essentially means creating a discrete numerical +representation for it, this is especially important for string features since most models +only accept numerical features. This transformation will place the string values into +different categories. The output here can be a one-hot encoded vector or a single number. 
+
+Note that by specifying `num_oov_indices=1` we leave one spot in our output vector for
+OOV (out of vocabulary) values. This is an important tool to handle missing or unseen
+values after training (values that were not seen during the `adapt()` step).
+"""
+
+feature_space = FeatureSpace(
+    features={
+        "default": FeatureSpace.string_categorical(
+            num_oov_indices=1, output_mode="one_hot"
+        )
+    },
+    output_mode="dict",
+)
+example_feature_space(train_ds_with_no_labels, feature_space, ["default"])
+
+"""
+We can also do **feature indexing** for integer features. This can be quite important for
+some datasets where categorical features are replaced by numbers, for instance features
+like `sex` or `gender` where values like `1` and `0` do not have a numerical relationship
+between them: they are just different categories. This behavior can be perfectly captured
+by this transformation.
+
+On this dataset we can use the feature that we created, `previously_contacted`. For this
+case we want to explicitly set `num_oov_indices=0`, because we only expect two possible
+values for the feature; anything else would be either wrong input or an issue with the
+data creation. For this reason we would probably just want the code to throw an error, so
+that we can be made aware of the issue and fix it.
+"""
+
+feature_space = FeatureSpace(
+    features={
+        "previously_contacted": FeatureSpace.integer_categorical(
+            num_oov_indices=0, output_mode="one_hot"
+        )
+    },
+    output_mode="dict",
+)
+example_feature_space(train_ds_with_no_labels, feature_space, ["previously_contacted"])
+
+"""
+### Feature crosses (mixing features of diverse types)
+
+With **crosses** we can compute feature interactions between an arbitrary number of
+features of mixed types, as long as they are categorical features. Instead of having a
+feature {'age': 20} and another {'job': 'entrepreneur'}, you can think of it as having a
+combined feature {'age_X_job': 20_entrepreneur}, and with `FeatureSpace` and **crosses**
+we can apply specific preprocessing to each individual feature and to the feature cross
+itself. This option can be very powerful for specific use cases; it might be a good
+option here, since age combined with job can have different meanings in the banking
+domain.
+
+We will cross `age` and `job` and hash their combined output into a vector
+representation of size 8. The output here can be a one-hot encoded vector or a single
+number.
+
+Sometimes the combination of multiple features can result in a very large feature space:
+think about crossing someone's ZIP code with their last name, where the possibilities
+would be in the thousands. That is why the `crossing_dim` parameter is so important: it
+limits the output dimension of the cross feature.
+
+Note that the combination of the possible values of the 6 bins of `age` and the 12 values
+of `job` would be 72, so by choosing `crossing_dim = 8` we are choosing to constrain the
+output vector.
+""" + +feature_space = FeatureSpace( + features={ + "age": FeatureSpace.integer_hashed(num_bins=6, output_mode="one_hot"), + "job": FeatureSpace.string_categorical( + num_oov_indices=0, output_mode="one_hot" + ), + }, + crosses=[ + FeatureSpace.cross( + feature_names=("age", "job"), + crossing_dim=8, + output_mode="one_hot", + ) + ], + output_mode="dict", +) +example_feature_space(train_ds_with_no_labels, feature_space, ["age", "job"]) + +""" +### FeatureSpace using a Keras preprocessing layer + +To be a really flexible and extensible feature we cannot only rely on those pre-defined +transformation, we must be able to re-use other transformations from the Keras/TensorFlow +ecosystem and customize our own, this is why `FeatureSpace` is also designed to work with +[Keras preprocessing layers](https://keras.io/api/layers/preprocessing_layers/), this way we +can use sophisticated data transformations provided by the framework, you can even create +your own custom Keras preprocessing layers and use it in the same way. + +Here we are going to use the +[`keras.layers.TextVectorization`](https://keras.io/api/layers/preprocessing_layers/text/text_vectorization/#textvectorization-class) +preprocessing layer to create a TF-IDF +feature from our data. Note that this feature is not a really good use case for TF-IDF, +this is just for demonstration purposes. +""" + +custom_layer = keras.layers.TextVectorization(output_mode="tf_idf") + +feature_space = FeatureSpace( + features={ + "education": FeatureSpace.feature( + preprocessor=custom_layer, dtype="string", output_mode="float" + ) + }, + output_mode="dict", +) +example_feature_space(train_ds_with_no_labels, feature_space, ["education"]) + +""" +## Configuring the final `FeatureSpace` + +Now that we know how to use `FeatureSpace` for more complex use cases let's pick the ones +that looks more useful for this task and create the final `FeatureSpace` component. + +To configure how each feature should be preprocessed, +we instantiate a `keras.utils.FeatureSpace`, and we +pass to it a dictionary that maps the name of our features +to the feature transformation function. 
+ +""" + +feature_space = FeatureSpace( + features={ + # Categorical features encoded as integers + "previously_contacted": FeatureSpace.integer_categorical(num_oov_indices=0), + # Categorical features encoded as string + "marital": FeatureSpace.string_categorical(num_oov_indices=0), + "education": FeatureSpace.string_categorical(num_oov_indices=0), + "default": FeatureSpace.string_categorical(num_oov_indices=0), + "housing": FeatureSpace.string_categorical(num_oov_indices=0), + "loan": FeatureSpace.string_categorical(num_oov_indices=0), + "contact": FeatureSpace.string_categorical(num_oov_indices=0), + "month": FeatureSpace.string_categorical(num_oov_indices=0), + "day_of_week": FeatureSpace.string_categorical(num_oov_indices=0), + "poutcome": FeatureSpace.string_categorical(num_oov_indices=0), + # Categorical features to hash and bin + "job": FeatureSpace.string_hashed(num_bins=3), + # Numerical features to hash and bin + "pdays": FeatureSpace.integer_hashed(num_bins=4), + # Numerical features to normalize and bin + "age": FeatureSpace.float_discretized(num_bins=4), + # Numerical features to normalize + "campaign": FeatureSpace.float_normalized(), + "previous": FeatureSpace.float_normalized(), + "emp.var.rate": FeatureSpace.float_normalized(), + "cons.price.idx": FeatureSpace.float_normalized(), + "cons.conf.idx": FeatureSpace.float_normalized(), + "euribor3m": FeatureSpace.float_normalized(), + "nr.employed": FeatureSpace.float_normalized(), + }, + # Specify feature cross with a custom crossing dim. + crosses=[ + FeatureSpace.cross(feature_names=("age", "job"), crossing_dim=8), + FeatureSpace.cross(feature_names=("housing", "loan"), crossing_dim=6), + FeatureSpace.cross( + feature_names=("poutcome", "previously_contacted"), crossing_dim=2 + ), + ], + output_mode="concat", +) + +""" +## Adapt the `FeatureSpace` to the training data + +Before we start using the `FeatureSpace` to build a model, we have +to adapt it to the training data. During `adapt()`, the `FeatureSpace` will: + +- Index the set of possible values for categorical features. +- Compute the mean and variance for numerical features to normalize. +- Compute the value boundaries for the different bins for numerical features to +discretize. +- Any other kind of preprocessing required by custom layers. + +Note that `adapt()` should be called on a `tf.data.Dataset` which yields dicts +of feature values -- no labels. + +But first let's batch the datasets +""" + +train_ds = train_ds.batch(32) +valid_ds = valid_ds.batch(32) + +train_ds_with_no_labels = train_ds.map(lambda x, _: x) +feature_space.adapt(train_ds_with_no_labels) + +""" +At this point, the `FeatureSpace` can be called on a dict of raw feature values, and +because we set `output_mode="concat"` it will return a single concatenate vector for each +sample, combining encoded features and feature crosses. 
+
+"""
+
+for x, _ in train_ds.take(1):
+    preprocessed_x = feature_space(x)
+    print(f"preprocessed_x shape: {preprocessed_x.shape}")
+    print(f"preprocessed_x sample: \n{preprocessed_x[0]}")
+
+"""
+## Saving the `FeatureSpace`
+
+At this point we can choose to save our `FeatureSpace` component. This has many
+advantages, like re-using it in different experiments that use the same model, saving
+time if you need to re-run the preprocessing step, and, mainly, for model deployment,
+where by loading it you can be sure that you will be applying the same preprocessing
+steps no matter the device or environment. This is a great way to reduce
+[training/serving skew](https://developers.google.com/machine-learning/guides/rules-of-ml#training-serving_skew).
+"""
+
+feature_space.save("myfeaturespace.keras")
+
+"""
+## Preprocessing with `FeatureSpace` as part of the tf.data pipeline
+
+We will opt to use our component asynchronously by making it part of the tf.data
+pipeline, as noted in the
+[previous guide](https://keras.io/examples/structured_data/structured_data_classification_with_feature_space/).
+This enables asynchronous parallel preprocessing of the data on CPU before it
+hits the model. Usually, this is always the right thing to do during training.
+
+Let's create a training and validation dataset of preprocessed batches:
+"""
+
+preprocessed_train_ds = train_ds.map(
+    lambda x, y: (feature_space(x), y), num_parallel_calls=tf.data.AUTOTUNE
+).prefetch(tf.data.AUTOTUNE)
+
+preprocessed_valid_ds = valid_ds.map(
+    lambda x, y: (feature_space(x), y), num_parallel_calls=tf.data.AUTOTUNE
+).prefetch(tf.data.AUTOTUNE)
+
+"""
+## Model
+
+We will take advantage of our `FeatureSpace` component to build the model. Since we want
+the model to be compatible with our preprocessing function, let's use the `FeatureSpace`
+feature map as the input of our model.
+"""
+
+encoded_features = feature_space.get_encoded_features()
+print(encoded_features)
+
+"""
+This model is intentionally trivial, for demonstration purposes only, so don't pay too
+much attention to the architecture.
+"""
+
+x = keras.layers.Dense(64, activation="relu")(encoded_features)
+x = keras.layers.Dropout(0.5)(x)
+output = keras.layers.Dense(1, activation="sigmoid")(x)
+
+model = keras.Model(inputs=encoded_features, outputs=output)
+model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
+
+"""
+## Training
+
+Let's train our model for 10 epochs. Note that feature preprocessing is happening as part
+of the tf.data pipeline, not as part of the model.
+"""
+
+model.fit(
+    preprocessed_train_ds, validation_data=preprocessed_valid_ds, epochs=10, verbose=2
+)
+
+"""
+## Inference on new data with the end-to-end model
+
+Now, we can build our inference model (which includes the `FeatureSpace`) to make
+predictions based on dicts of raw feature values, as follows:
+"""
+
+"""
+### Loading the `FeatureSpace`
+
+First, let's load the `FeatureSpace` that we saved a few moments ago. This can be quite
+handy if you train a model but want to do inference at a different time, possibly using a
+different device or environment.
+"""
+
+loaded_feature_space = keras.saving.load_model("myfeaturespace.keras")
+
+"""
+### Building the inference end-to-end model
+
+To build the inference model we need both the feature input map and the preprocessing
+encoded Keras tensors.
+""" + +dict_inputs = loaded_feature_space.get_inputs() +encoded_features = loaded_feature_space.get_encoded_features() +print(encoded_features) + +print(dict_inputs) + +outputs = model(encoded_features) +inference_model = keras.Model(inputs=dict_inputs, outputs=outputs) + +sample = { + "age": 30, + "job": "blue-collar", + "marital": "married", + "education": "basic.9y", + "default": "no", + "housing": "yes", + "loan": "no", + "contact": "cellular", + "month": "may", + "day_of_week": "fri", + "campaign": 2, + "pdays": 999, + "previous": 0, + "poutcome": "nonexistent", + "emp.var.rate": -1.8, + "cons.price.idx": 92.893, + "cons.conf.idx": -46.2, + "euribor3m": 1.313, + "nr.employed": 5099.1, + "previously_contacted": 0, +} + +input_dict = { + name: keras.ops.convert_to_tensor([value]) for name, value in sample.items() +} +predictions = inference_model.predict(input_dict) + +print( + f"This particular client has a {100 * predictions[0][0]:.2f}% probability " + "of subscribing a term deposit, as evaluated by our model." +) diff --git a/knowledge_base/structured_data/imbalanced_classification.py b/knowledge_base/structured_data/imbalanced_classification.py new file mode 100644 index 0000000000000000000000000000000000000000..663e0024c324c2c676c5c3281b781427cca005ec --- /dev/null +++ b/knowledge_base/structured_data/imbalanced_classification.py @@ -0,0 +1,149 @@ +""" +Title: Imbalanced classification: credit card fraud detection +Author: [fchollet](https://twitter.com/fchollet) +Date created: 2019/05/28 +Last modified: 2020/04/17 +Description: Demonstration of how to handle highly imbalanced classification problems. +Accelerator: GPU +""" + +""" +## Introduction + +This example looks at the +[Kaggle Credit Card Fraud Detection](https://www.kaggle.com/mlg-ulb/creditcardfraud/) +dataset to demonstrate how +to train a classification model on data with highly imbalanced classes. 
+""" + +""" +## First, vectorize the CSV data +""" + +import csv +import numpy as np + +# Get the real data from https://www.kaggle.com/mlg-ulb/creditcardfraud/ +fname = "/Users/fchollet/Downloads/creditcard.csv" + +all_features = [] +all_targets = [] +with open(fname) as f: + for i, line in enumerate(f): + if i == 0: + print("HEADER:", line.strip()) + continue # Skip header + fields = line.strip().split(",") + all_features.append([float(v.replace('"', "")) for v in fields[:-1]]) + all_targets.append([int(fields[-1].replace('"', ""))]) + if i == 1: + print("EXAMPLE FEATURES:", all_features[-1]) + +features = np.array(all_features, dtype="float32") +targets = np.array(all_targets, dtype="uint8") +print("features.shape:", features.shape) +print("targets.shape:", targets.shape) + +""" +## Prepare a validation set +""" + +num_val_samples = int(len(features) * 0.2) +train_features = features[:-num_val_samples] +train_targets = targets[:-num_val_samples] +val_features = features[-num_val_samples:] +val_targets = targets[-num_val_samples:] + +print("Number of training samples:", len(train_features)) +print("Number of validation samples:", len(val_features)) + +""" +## Analyze class imbalance in the targets +""" + +counts = np.bincount(train_targets[:, 0]) +print( + "Number of positive samples in training data: {} ({:.2f}% of total)".format( + counts[1], 100 * float(counts[1]) / len(train_targets) + ) +) + +weight_for_0 = 1.0 / counts[0] +weight_for_1 = 1.0 / counts[1] + +""" +## Normalize the data using training set statistics +""" + +mean = np.mean(train_features, axis=0) +train_features -= mean +val_features -= mean +std = np.std(train_features, axis=0) +train_features /= std +val_features /= std + +""" +## Build a binary classification model +""" + +import keras + +model = keras.Sequential( + [ + keras.Input(shape=train_features.shape[1:]), + keras.layers.Dense(256, activation="relu"), + keras.layers.Dense(256, activation="relu"), + keras.layers.Dropout(0.3), + keras.layers.Dense(256, activation="relu"), + keras.layers.Dropout(0.3), + keras.layers.Dense(1, activation="sigmoid"), + ] +) +model.summary() + +""" +## Train the model with `class_weight` argument +""" + +metrics = [ + keras.metrics.FalseNegatives(name="fn"), + keras.metrics.FalsePositives(name="fp"), + keras.metrics.TrueNegatives(name="tn"), + keras.metrics.TruePositives(name="tp"), + keras.metrics.Precision(name="precision"), + keras.metrics.Recall(name="recall"), +] + +model.compile( + optimizer=keras.optimizers.Adam(1e-2), loss="binary_crossentropy", metrics=metrics +) + +callbacks = [keras.callbacks.ModelCheckpoint("fraud_model_at_epoch_{epoch}.keras")] +class_weight = {0: weight_for_0, 1: weight_for_1} + +model.fit( + train_features, + train_targets, + batch_size=2048, + epochs=30, + verbose=2, + callbacks=callbacks, + validation_data=(val_features, val_targets), + class_weight=class_weight, +) + +""" +## Conclusions + +At the end of training, out of 56,961 validation transactions, we are: + +- Correctly identifying 66 of them as fraudulent +- Missing 9 fraudulent transactions +- At the cost of incorrectly flagging 441 legitimate transactions + +In the real world, one would put an even higher weight on class 1, +so as to reflect that False Negatives are more costly than False Positives. + +Next time your credit card gets declined in an online purchase -- this is why. 
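+
+For example, a minimal sketch of what that might look like (the extra factor of 10 is
+arbitrary and would have to be tuned against the real business cost of each error type):
+
+```python
+# Weight the fraudulent class even more heavily than inverse-frequency weighting alone.
+class_weight = {0: weight_for_0, 1: 10.0 * weight_for_1}
+```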
+ +""" diff --git a/knowledge_base/structured_data/movielens_recommendations_transformers.py b/knowledge_base/structured_data/movielens_recommendations_transformers.py new file mode 100644 index 0000000000000000000000000000000000000000..b6e20bed0cbd51556c926e5717e132efc651e947 --- /dev/null +++ b/knowledge_base/structured_data/movielens_recommendations_transformers.py @@ -0,0 +1,578 @@ +""" +Title: A Transformer-based recommendation system +Author: [Khalid Salama](https://www.linkedin.com/in/khalid-salama-24403144/) +Date created: 2020/12/30 +Last modified: 2025/01/27 +Description: Rating rate prediction using the Behavior Sequence Transformer (BST) model on the Movielens. +Accelerator: GPU +Made backend-agnostic by: [Humbulani Ndou](https://github.com/Humbulani1234) +""" + +""" +## Introduction + +This example demonstrates the [Behavior Sequence Transformer (BST)](https://arxiv.org/abs/1905.06874) +model, by Qiwei Chen et al., using the [Movielens dataset](https://grouplens.org/datasets/movielens/). +The BST model leverages the sequential behaviour of the users in watching and rating movies, +as well as user profile and movie features, to predict the rating of the user to a target movie. + +More precisely, the BST model aims to predict the rating of a target movie by accepting +the following inputs: + +1. A fixed-length *sequence* of `movie_ids` watched by a user. +2. A fixed-length *sequence* of the `ratings` for the movies watched by a user. +3. A *set* of user features, including `user_id`, `sex`, `occupation`, and `age_group`. +4. A *set* of `genres` for each movie in the input sequence and the target movie. +5. A `target_movie_id` for which to predict the rating. + +This example modifies the original BST model in the following ways: + +1. We incorporate the movie features (genres) into the processing of the embedding of each +movie of the input sequence and the target movie, rather than treating them as "other features" +outside the transformer layer. +2. We utilize the ratings of movies in the input sequence, along with the their positions +in the sequence, to update them before feeding them into the self-attention layer. + + +Note that this example should be run with TensorFlow 2.4 or higher. +""" + +""" +## The dataset + +We use the [1M version of the Movielens dataset](https://grouplens.org/datasets/movielens/1m/). +The dataset includes around 1 million ratings from 6000 users on 4000 movies, +along with some user features, movie genres. In addition, the timestamp of each user-movie +rating is provided, which allows creating sequences of movie ratings for each user, +as expected by the BST model. +""" + +""" +## Setup +""" + +import os + +os.environ["KERAS_BACKEND"] = "jax" # or torch, or tensorflow + +import math +from zipfile import ZipFile +from urllib.request import urlretrieve +import numpy as np +import pandas as pd + +import keras +from keras import layers, ops +from keras.layers import StringLookup + +""" +## Prepare the data + +### Download and prepare the DataFrames + +First, let's download the movielens data. + +The downloaded folder will contain three data files: `users.dat`, `movies.dat`, +and `ratings.dat`. +""" + +urlretrieve("http://files.grouplens.org/datasets/movielens/ml-1m.zip", "movielens.zip") +ZipFile("movielens.zip", "r").extractall() + +""" +Then, we load the data into pandas DataFrames with their proper column names. 
+""" + +users = pd.read_csv( + "ml-1m/users.dat", + sep="::", + names=["user_id", "sex", "age_group", "occupation", "zip_code"], + encoding="ISO-8859-1", + engine="python", +) + +ratings = pd.read_csv( + "ml-1m/ratings.dat", + sep="::", + names=["user_id", "movie_id", "rating", "unix_timestamp"], + encoding="ISO-8859-1", + engine="python", +) + +movies = pd.read_csv( + "ml-1m/movies.dat", + sep="::", + names=["movie_id", "title", "genres"], + encoding="ISO-8859-1", + engine="python", +) + +""" +Here, we do some simple data processing to fix the data types of the columns. +""" + +users["user_id"] = users["user_id"].apply(lambda x: f"user_{x}") +users["age_group"] = users["age_group"].apply(lambda x: f"group_{x}") +users["occupation"] = users["occupation"].apply(lambda x: f"occupation_{x}") + +movies["movie_id"] = movies["movie_id"].apply(lambda x: f"movie_{x}") + +ratings["movie_id"] = ratings["movie_id"].apply(lambda x: f"movie_{x}") +ratings["user_id"] = ratings["user_id"].apply(lambda x: f"user_{x}") +ratings["rating"] = ratings["rating"].apply(lambda x: float(x)) + +""" +Each movie has multiple genres. We split them into separate columns in the `movies` +DataFrame. +""" + +genres = ["Action", "Adventure", "Animation", "Children's", "Comedy", "Crime"] +genres += ["Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical"] +genres += ["Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"] + +for genre in genres: + movies[genre] = movies["genres"].apply( + lambda values: int(genre in values.split("|")) + ) + + +""" +### Transform the movie ratings data into sequences + +First, let's sort the the ratings data using the `unix_timestamp`, and then group the +`movie_id` values and the `rating` values by `user_id`. + +The output DataFrame will have a record for each `user_id`, with two ordered lists +(sorted by rating datetime): the movies they have rated, and their ratings of these movies. +""" + +ratings_group = ratings.sort_values(by=["unix_timestamp"]).groupby("user_id") + +ratings_data = pd.DataFrame( + data={ + "user_id": list(ratings_group.groups.keys()), + "movie_ids": list(ratings_group.movie_id.apply(list)), + "ratings": list(ratings_group.rating.apply(list)), + "timestamps": list(ratings_group.unix_timestamp.apply(list)), + } +) + + +""" +Now, let's split the `movie_ids` list into a set of sequences of a fixed length. +We do the same for the `ratings`. Set the `sequence_length` variable to change the length +of the input sequence to the model. You can also change the `step_size` to control the +number of sequences to generate for each user. +""" + +sequence_length = 4 +step_size = 2 + + +def create_sequences(values, window_size, step_size): + sequences = [] + start_index = 0 + while True: + end_index = start_index + window_size + seq = values[start_index:end_index] + if len(seq) < window_size: + seq = values[-window_size:] + if len(seq) == window_size: + sequences.append(seq) + break + sequences.append(seq) + start_index += step_size + return sequences + + +ratings_data.movie_ids = ratings_data.movie_ids.apply( + lambda ids: create_sequences(ids, sequence_length, step_size) +) + +ratings_data.ratings = ratings_data.ratings.apply( + lambda ids: create_sequences(ids, sequence_length, step_size) +) + +del ratings_data["timestamps"] + +""" +After that, we process the output to have each sequence in a separate records in +the DataFrame. In addition, we join the user features with the ratings data. 
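+
+As a quick illustration of what `create_sequences` produces (a self-contained sketch,
+independent of the real data): with a window of 4 and a step of 2, a list of seven values
+becomes three overlapping sequences, with the last one anchored to the end of the list so
+that no trailing values are dropped.
+"""
+
+# Illustration only: how `create_sequences` windows a short list.
+print(create_sequences(list(range(1, 8)), window_size=4, step_size=2))
+# -> [[1, 2, 3, 4], [3, 4, 5, 6], [4, 5, 6, 7]]
+
+"""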
+""" + +ratings_data_movies = ratings_data[["user_id", "movie_ids"]].explode( + "movie_ids", ignore_index=True +) +ratings_data_rating = ratings_data[["ratings"]].explode("ratings", ignore_index=True) +ratings_data_transformed = pd.concat([ratings_data_movies, ratings_data_rating], axis=1) +ratings_data_transformed = ratings_data_transformed.join( + users.set_index("user_id"), on="user_id" +) +ratings_data_transformed.movie_ids = ratings_data_transformed.movie_ids.apply( + lambda x: ",".join(x) +) +ratings_data_transformed.ratings = ratings_data_transformed.ratings.apply( + lambda x: ",".join([str(v) for v in x]) +) + +del ratings_data_transformed["zip_code"] + +ratings_data_transformed.rename( + columns={"movie_ids": "sequence_movie_ids", "ratings": "sequence_ratings"}, + inplace=True, +) + +""" +With `sequence_length` of 4 and `step_size` of 2, we end up with 498,623 sequences. + +Finally, we split the data into training and testing splits, with 85% and 15% of +the instances, respectively, and store them to CSV files. +""" + +random_selection = np.random.rand(len(ratings_data_transformed.index)) <= 0.85 +train_data = ratings_data_transformed[random_selection] +test_data = ratings_data_transformed[~random_selection] + +train_data.to_csv("train_data.csv", index=False, sep="|", header=False) +test_data.to_csv("test_data.csv", index=False, sep="|", header=False) + +""" +## Define metadata +""" + +CSV_HEADER = list(ratings_data_transformed.columns) + +CATEGORICAL_FEATURES_WITH_VOCABULARY = { + "user_id": list(users.user_id.unique()), + "movie_id": list(movies.movie_id.unique()), + "sex": list(users.sex.unique()), + "age_group": list(users.age_group.unique()), + "occupation": list(users.occupation.unique()), +} + +USER_FEATURES = ["sex", "age_group", "occupation"] + +MOVIE_FEATURES = ["genres"] + + +""" +## Encode input features + +The `encode_input_features` function works as follows: + +1. Each categorical user feature is encoded using `layers.Embedding`, with embedding +dimension equals to the square root of the vocabulary size of the feature. +The embeddings of these features are concatenated to form a single input tensor. + +2. Each movie in the movie sequence and the target movie is encoded `layers.Embedding`, +where the dimension size is the square root of the number of movies. + +3. A multi-hot genres vector for each movie is concatenated with its embedding vector, +and processed using a non-linear `layers.Dense` to output a vector of the same movie +embedding dimensions. + +4. A positional embedding is added to each movie embedding in the sequence, and then +multiplied by its rating from the ratings sequence. + +5. The target movie embedding is concatenated to the sequence movie embeddings, producing +a tensor with the shape of `[batch size, sequence length, embedding size]`, as expected +by the attention layer for the transformer architecture. + +6. The method returns a tuple of two elements: `encoded_transformer_features` and +`encoded_other_features`. +""" + +# Required for tf.data.Dataset +import tensorflow as tf + + +def get_dataset_from_csv(csv_file_path, batch_size, shuffle=True): + + def process(features): + movie_ids_string = features["sequence_movie_ids"] + sequence_movie_ids = tf.strings.split(movie_ids_string, ",").to_tensor() + # The last movie id in the sequence is the target movie. 
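+        # Its rating (the last value of the ratings sequence below) becomes the label to predict.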
+ features["target_movie_id"] = sequence_movie_ids[:, -1] + features["sequence_movie_ids"] = sequence_movie_ids[:, :-1] + # Sequence ratings + ratings_string = features["sequence_ratings"] + sequence_ratings = tf.strings.to_number( + tf.strings.split(ratings_string, ","), tf.dtypes.float32 + ).to_tensor() + # The last rating in the sequence is the target for the model to predict. + target = sequence_ratings[:, -1] + features["sequence_ratings"] = sequence_ratings[:, :-1] + + def encoding_helper(feature_name): + + # This are target_movie_id and sequence_movie_ids and they have the same + # vocabulary as movie_id. + if feature_name not in CATEGORICAL_FEATURES_WITH_VOCABULARY: + vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY["movie_id"] + index_lookup = StringLookup( + vocabulary=vocabulary, mask_token=None, num_oov_indices=0 + ) + # Convert the string input values into integer indices. + value_index = index_lookup(features[feature_name]) + features[feature_name] = value_index + else: + # movie_id is not part of the features, hence not processed. It was mainly required + # for its vocabulary above. + if feature_name == "movie_id": + pass + else: + vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name] + index_lookup = StringLookup( + vocabulary=vocabulary, mask_token=None, num_oov_indices=0 + ) + # Convert the string input values into integer indices. + value_index = index_lookup(features[feature_name]) + features[feature_name] = value_index + + # Encode the user features + for feature_name in CATEGORICAL_FEATURES_WITH_VOCABULARY: + encoding_helper(feature_name) + # Encoding target_movie_id and returning it as the target variable + encoding_helper("target_movie_id") + # Encoding sequence movie_ids. + encoding_helper("sequence_movie_ids") + return dict(features), target + + dataset = tf.data.experimental.make_csv_dataset( + csv_file_path, + batch_size=batch_size, + column_names=CSV_HEADER, + num_epochs=1, + header=False, + field_delim="|", + shuffle=shuffle, + ).map(process) + return dataset + + +def encode_input_features( + inputs, + include_user_id, + include_user_features, + include_movie_features, +): + encoded_transformer_features = [] + encoded_other_features = [] + + other_feature_names = [] + if include_user_id: + other_feature_names.append("user_id") + if include_user_features: + other_feature_names.extend(USER_FEATURES) + + ## Encode user features + for feature_name in other_feature_names: + vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name] + # Compute embedding dimensions + embedding_dims = int(math.sqrt(len(vocabulary))) + # Create an embedding layer with the specified dimensions. + embedding_encoder = layers.Embedding( + input_dim=len(vocabulary), + output_dim=embedding_dims, + name=f"{feature_name}_embedding", + ) + # Convert the index values to embedding representations. + encoded_other_features.append(embedding_encoder(inputs[feature_name])) + + ## Create a single embedding vector for the user features + if len(encoded_other_features) > 1: + encoded_other_features = layers.concatenate(encoded_other_features) + elif len(encoded_other_features) == 1: + encoded_other_features = encoded_other_features[0] + else: + encoded_other_features = None + + ## Create a movie embedding encoder + movie_vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY["movie_id"] + movie_embedding_dims = int(math.sqrt(len(movie_vocabulary))) + # Create an embedding layer with the specified dimensions. 
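+    # This encoder is shared by the sequence movies and the target movie (see `encode_movie` below).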
+ movie_embedding_encoder = layers.Embedding( + input_dim=len(movie_vocabulary), + output_dim=movie_embedding_dims, + name=f"movie_embedding", + ) + # Create a vector lookup for movie genres. + genre_vectors = movies[genres].to_numpy() + movie_genres_lookup = layers.Embedding( + input_dim=genre_vectors.shape[0], + output_dim=genre_vectors.shape[1], + embeddings_initializer=keras.initializers.Constant(genre_vectors), + trainable=False, + name="genres_vector", + ) + # Create a processing layer for genres. + movie_embedding_processor = layers.Dense( + units=movie_embedding_dims, + activation="relu", + name="process_movie_embedding_with_genres", + ) + + ## Define a function to encode a given movie id. + def encode_movie(movie_id): + # Convert the string input values into integer indices. + movie_embedding = movie_embedding_encoder(movie_id) + encoded_movie = movie_embedding + if include_movie_features: + movie_genres_vector = movie_genres_lookup(movie_id) + encoded_movie = movie_embedding_processor( + layers.concatenate([movie_embedding, movie_genres_vector]) + ) + return encoded_movie + + ## Encoding target_movie_id + target_movie_id = inputs["target_movie_id"] + encoded_target_movie = encode_movie(target_movie_id) + + ## Encoding sequence movie_ids. + sequence_movies_ids = inputs["sequence_movie_ids"] + encoded_sequence_movies = encode_movie(sequence_movies_ids) + # Create positional embedding. + position_embedding_encoder = layers.Embedding( + input_dim=sequence_length, + output_dim=movie_embedding_dims, + name="position_embedding", + ) + positions = ops.arange(start=0, stop=sequence_length - 1, step=1) + encodded_positions = position_embedding_encoder(positions) + # Retrieve sequence ratings to incorporate them into the encoding of the movie. + sequence_ratings = inputs["sequence_ratings"] + sequence_ratings = ops.expand_dims(sequence_ratings, -1) + # Add the positional encoding to the movie encodings and multiply them by rating. + encoded_sequence_movies_with_poistion_and_rating = layers.Multiply()( + [(encoded_sequence_movies + encodded_positions), sequence_ratings] + ) + + # Construct the transformer inputs. + for i in range(sequence_length - 1): + feature = encoded_sequence_movies_with_poistion_and_rating[:, i, ...] 
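+        # Re-add a length-1 sequence axis so the per-step features can be concatenated along axis=1.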
+ feature = ops.expand_dims(feature, 1) + encoded_transformer_features.append(feature) + encoded_transformer_features.append(encoded_target_movie) + encoded_transformer_features = layers.concatenate( + encoded_transformer_features, axis=1 + ) + return encoded_transformer_features, encoded_other_features + + +""" +## Create model inputs +""" + + +def create_model_inputs(): + return { + "user_id": keras.Input(name="user_id", shape=(1,), dtype="int32"), + "sequence_movie_ids": keras.Input( + name="sequence_movie_ids", shape=(sequence_length - 1,), dtype="int32" + ), + "target_movie_id": keras.Input( + name="target_movie_id", shape=(1,), dtype="int32" + ), + "sequence_ratings": keras.Input( + name="sequence_ratings", shape=(sequence_length - 1,), dtype="float32" + ), + "sex": keras.Input(name="sex", shape=(1,), dtype="int32"), + "age_group": keras.Input(name="age_group", shape=(1,), dtype="int32"), + "occupation": keras.Input(name="occupation", shape=(1,), dtype="int32"), + } + + +""" +## Create a BST model +""" + +include_user_id = False +include_user_features = False +include_movie_features = False + +hidden_units = [256, 128] +dropout_rate = 0.1 +num_heads = 3 + + +def create_model(): + + inputs = create_model_inputs() + transformer_features, other_features = encode_input_features( + inputs, include_user_id, include_user_features, include_movie_features + ) + # Create a multi-headed attention layer. + attention_output = layers.MultiHeadAttention( + num_heads=num_heads, key_dim=transformer_features.shape[2], dropout=dropout_rate + )(transformer_features, transformer_features) + + # Transformer block. + attention_output = layers.Dropout(dropout_rate)(attention_output) + x1 = layers.Add()([transformer_features, attention_output]) + x1 = layers.LayerNormalization()(x1) + x2 = layers.LeakyReLU()(x1) + x2 = layers.Dense(units=x2.shape[-1])(x2) + x2 = layers.Dropout(dropout_rate)(x2) + transformer_features = layers.Add()([x1, x2]) + transformer_features = layers.LayerNormalization()(transformer_features) + features = layers.Flatten()(transformer_features) + + # Included the other_features. + if other_features is not None: + features = layers.concatenate( + [features, layers.Reshape([other_features.shape[-1]])(other_features)] + ) + + # Fully-connected layers. + for num_units in hidden_units: + features = layers.Dense(num_units)(features) + features = layers.BatchNormalization()(features) + features = layers.LeakyReLU()(features) + features = layers.Dropout(dropout_rate)(features) + outputs = layers.Dense(units=1)(features) + model = keras.Model(inputs=inputs, outputs=outputs) + return model + + +model = create_model() + +""" +## Run training and evaluation experiment +""" + +# Compile the model. +model.compile( + optimizer=keras.optimizers.Adagrad(learning_rate=0.01), + loss=keras.losses.MeanSquaredError(), + metrics=[keras.metrics.MeanAbsoluteError()], +) + +# Read the training data. + +train_dataset = get_dataset_from_csv("train_data.csv", batch_size=265, shuffle=True) + +# Fit the model with the training data. +model.fit(train_dataset, epochs=2) + +# Read the test data. +test_dataset = get_dataset_from_csv("test_data.csv", batch_size=265) + +# Evaluate the model on the test data. +_, rmse = model.evaluate(test_dataset, verbose=0) +print(f"Test MAE: {round(rmse, 3)}") + +""" +You should achieve a Mean Absolute Error (MAE) at or around 0.7 on the test data. 
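+
+If you want a quick qualitative check in addition to the aggregate metric, you can
+compare a few predicted ratings against the actual ones for a single test batch (a small
+sketch using the `test_dataset` defined above):
+"""
+
+for features, targets in test_dataset.take(1):
+    # Convert the tf.data batch to NumPy so it works with any Keras backend.
+    batch = {name: value.numpy() for name, value in features.items()}
+    predictions = model.predict(batch, verbose=0)
+    print("Predicted ratings:", predictions[:5].squeeze())
+    print("Actual ratings:   ", targets[:5].numpy())
+
+"""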
+""" + +""" +## Conclusion + +The BST model uses the Transformer layer in its architecture to capture the sequential signals underlying +usersโ€™ behavior sequences for recommendation. + +You can try training this model with different configurations, for example, by increasing +the input sequence length and training the model for a larger number of epochs. In addition, +you can try including other features like movie release year and customer +zipcode, and including cross features like sex X genre. +""" diff --git a/knowledge_base/structured_data/structured_data_classification_from_scratch.py b/knowledge_base/structured_data/structured_data_classification_from_scratch.py new file mode 100644 index 0000000000000000000000000000000000000000..818c1f7bec419d2182021ac70545d50f775e8a90 --- /dev/null +++ b/knowledge_base/structured_data/structured_data_classification_from_scratch.py @@ -0,0 +1,428 @@ +""" +Title: Structured data classification from scratch +Author: [fchollet](https://twitter.com/fchollet) +Date created: 2020/06/09 +Last modified: 2020/06/09 +Description: Binary classification of structured data including numerical and categorical features. +Accelerator: GPU +Made backend-agnostic by: [Humbulani Ndou](https://github.com/Humbulani1234) +""" + +""" +## Introduction + +This example demonstrates how to do structured data classification, starting from a raw +CSV file. Our data includes both numerical and categorical features. We will use Keras +preprocessing layers to normalize the numerical features and vectorize the categorical +ones. + +Note that this example should be run with TensorFlow 2.5 or higher. + +### The dataset + +[Our dataset](https://archive.ics.uci.edu/ml/datasets/heart+Disease) is provided by the +Cleveland Clinic Foundation for Heart Disease. +It's a CSV file with 303 rows. Each row contains information about a patient (a +**sample**), and each column describes an attribute of the patient (a **feature**). We +use the features to predict whether a patient has a heart disease (**binary +classification**). 
+ +Here's the description of each feature: + +Column| Description| Feature Type +------------|--------------------|---------------------- +Age | Age in years | Numerical +Sex | (1 = male; 0 = female) | Categorical +CP | Chest pain type (0, 1, 2, 3, 4) | Categorical +Trestbpd | Resting blood pressure (in mm Hg on admission) | Numerical +Chol | Serum cholesterol in mg/dl | Numerical +FBS | fasting blood sugar in 120 mg/dl (1 = true; 0 = false) | Categorical +RestECG | Resting electrocardiogram results (0, 1, 2) | Categorical +Thalach | Maximum heart rate achieved | Numerical +Exang | Exercise induced angina (1 = yes; 0 = no) | Categorical +Oldpeak | ST depression induced by exercise relative to rest | Numerical +Slope | Slope of the peak exercise ST segment | Numerical +CA | Number of major vessels (0-3) colored by fluoroscopy | Both numerical & categorical +Thal | 3 = normal; 6 = fixed defect; 7 = reversible defect | Categorical +Target | Diagnosis of heart disease (1 = true; 0 = false) | Target +""" + +""" +## Setup +""" + +import os + +os.environ["KERAS_BACKEND"] = "torch" # or torch, or tensorflow + +import pandas as pd +import keras +from keras import layers + +""" +## Preparing the data + +Let's download the data and load it into a Pandas dataframe: +""" + +file_url = "http://storage.googleapis.com/download.tensorflow.org/data/heart.csv" +dataframe = pd.read_csv(file_url) + +""" +The dataset includes 303 samples with 14 columns per sample (13 features, plus the target +label): +""" + +dataframe.shape + +""" +Here's a preview of a few samples: +""" + +dataframe.head() + +""" +The last column, "target", indicates whether the patient has a heart disease (1) or not +(0). + +Let's split the data into a training and validation set: +""" + +val_dataframe = dataframe.sample(frac=0.2, random_state=1337) +train_dataframe = dataframe.drop(val_dataframe.index) + +print( + f"Using {len(train_dataframe)} samples for training " + f"and {len(val_dataframe)} for validation" +) + + +""" +## Define dataset metadata + +Here, we define the metadata of the dataset that will be useful for reading and +parsing the data into input features, and encoding the input features with respect +to their types. +""" + +COLUMN_NAMES = [ + "age", + "sex", + "cp", + "trestbps", + "chol", + "fbs", + "restecg", + "thalach", + "exang", + "oldpeak", + "slope", + "ca", + "thal", + "target", +] +# Target feature name. +TARGET_FEATURE_NAME = "target" +# Numeric feature names. +NUMERIC_FEATURE_NAMES = ["age", "trestbps", "thalach", "oldpeak", "slope", "chol"] +# Categorical features and their vocabulary lists. +# Note that we add 'v=' as a prefix to all categorical feature values to make +# sure that they are treated as strings. + +CATEGORICAL_FEATURES_WITH_VOCABULARY = { + feature_name: sorted( + [ + # Integer categorcal must be int and string must be str + value if dataframe[feature_name].dtype == "int64" else str(value) + for value in list(dataframe[feature_name].unique()) + ] + ) + for feature_name in COLUMN_NAMES + if feature_name not in list(NUMERIC_FEATURE_NAMES + [TARGET_FEATURE_NAME]) +} +# All features names. +FEATURE_NAMES = NUMERIC_FEATURE_NAMES + list( + CATEGORICAL_FEATURES_WITH_VOCABULARY.keys() +) + + +""" +## Feature preprocessing with Keras layers + + +The following features are categorical features encoded as integers: + +- `sex` +- `cp` +- `fbs` +- `restecg` +- `exang` +- `ca` + +We will encode these features using **one-hot encoding**. 
We have two options +here: + + - Use `CategoryEncoding()`, which requires knowing the range of input values + and will error on input outside the range. + - Use `IntegerLookup()` which will build a lookup table for inputs and reserve + an output index for unkown input values. + +For this example, we want a simple solution that will handle out of range inputs +at inference, so we will use `IntegerLookup()`. + +We also have a categorical feature encoded as a string: `thal`. We will create an +index of all possible features and encode output using the `StringLookup()` layer. + +Finally, the following feature are continuous numerical features: + +- `age` +- `trestbps` +- `chol` +- `thalach` +- `oldpeak` +- `slope` + +For each of these features, we will use a `Normalization()` layer to make sure the mean +of each feature is 0 and its standard deviation is 1. + +Below, we define 2 utility functions to do the operations: + +- `encode_numerical_feature` to apply featurewise normalization to numerical features. +- `process` to one-hot encode string or integer categorical features. +""" + +# Tensorflow required for tf.data.Dataset +import tensorflow as tf + + +# We process our datasets elements here (categorical) and convert them to indices to avoid this step +# during model training since only tensorflow support strings. +def encode_categorical(features, target): + for feature_name in features: + if feature_name in CATEGORICAL_FEATURES_WITH_VOCABULARY: + lookup_class = ( + layers.StringLookup + if features[feature_name].dtype == "string" + else layers.IntegerLookup + ) + vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name] + # Create a lookup to convert a string values to an integer indices. + # Since we are not using a mask token nor expecting any out of vocabulary + # (oov) token, we set mask_token to None and num_oov_indices to 0. + index = lookup_class( + vocabulary=vocabulary, + mask_token=None, + num_oov_indices=0, + output_mode="binary", + ) + # Convert the string input values into integer indices. + value_index = index(features[feature_name]) + features[feature_name] = value_index + + else: + pass + + # Change features from OrderedDict to Dict to match Inputs as they are Dict. 
+ return dict(features), target + + +def encode_numerical_feature(feature, name, dataset): + # Create a Normalization layer for our feature + normalizer = layers.Normalization() + # Prepare a Dataset that only yields our feature + feature_ds = dataset.map(lambda x, y: x[name]) + feature_ds = feature_ds.map(lambda x: tf.expand_dims(x, -1)) + # Learn the statistics of the data + normalizer.adapt(feature_ds) + # Normalize the input feature + encoded_feature = normalizer(feature) + return encoded_feature + + +""" +Let's generate `tf.data.Dataset` objects for each dataframe: +""" + + +def dataframe_to_dataset(dataframe): + dataframe = dataframe.copy() + labels = dataframe.pop("target") + ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels)).map( + encode_categorical + ) + ds = ds.shuffle(buffer_size=len(dataframe)) + return ds + + +train_ds = dataframe_to_dataset(train_dataframe) +val_ds = dataframe_to_dataset(val_dataframe) + +""" +Each `Dataset` yields a tuple `(input, target)` where `input` is a dictionary of features +and `target` is the value `0` or `1`: +""" + +for x, y in train_ds.take(1): + print("Input:", x) + print("Target:", y) + +""" +Let's batch the datasets: +""" + +train_ds = train_ds.batch(32) +val_ds = val_ds.batch(32) + + +""" +## Build a model + +With this done, we can create our end-to-end model: +""" + + +# Categorical features have different shapes after the encoding, dependent on the +# vocabulary or unique values of each feature. We create them accordinly to match the +# input data elements generated by tf.data.Dataset after pre-processing them +def create_model_inputs(): + inputs = {} + + # This a helper function for creating categorical features + def create_input_helper(feature_name): + num_categories = len(CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name]) + inputs[feature_name] = layers.Input( + name=feature_name, shape=(num_categories,), dtype="int64" + ) + return inputs + + for feature_name in FEATURE_NAMES: + if feature_name in CATEGORICAL_FEATURES_WITH_VOCABULARY: + # Categorical features + create_input_helper(feature_name) + else: + # Make them float32, they are Real numbers + feature_input = layers.Input(name=feature_name, shape=(1,), dtype="float32") + # Process the Inputs here + inputs[feature_name] = encode_numerical_feature( + feature_input, feature_name, train_ds + ) + return inputs + + +# This Layer defines the logic of the Model to perform the classification +class Classifier(keras.layers.Layer): + + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.dense_1 = layers.Dense(32, activation="relu") + self.dropout = layers.Dropout(0.5) + self.dense_2 = layers.Dense(1, activation="sigmoid") + + def call(self, inputs): + all_features = layers.concatenate(list(inputs.values())) + x = self.dense_1(all_features) + x = self.dropout(x) + output = self.dense_2(x) + return output + + # Surpress build warnings + def build(self, input_shape): + self.built = True + + +# Create the Classifier model +def create_model(): + all_inputs = create_model_inputs() + output = Classifier()(all_inputs) + model = keras.Model(all_inputs, output) + return model + + +model = create_model() +model.compile("adam", "binary_crossentropy", metrics=["accuracy"]) + +""" +Let's visualize our connectivity graph: +""" + +# `rankdir='LR'` is to make the graph horizontal. 
+keras.utils.plot_model(model, show_shapes=True, rankdir="LR") + +""" +## Train the model +""" + +model.fit(train_ds, epochs=50, validation_data=val_ds) + + +""" +We quickly get to 80% validation accuracy. +""" + +""" +## Inference on new data + +To get a prediction for a new sample, you can simply call `model.predict()`. There are +just two things you need to do: + +1. wrap scalars into a list so as to have a batch dimension (models only process batches +of data, not single samples) +2. Call `convert_to_tensor` on each feature +""" + +sample = { + "age": 60, + "sex": 1, + "cp": 1, + "trestbps": 145, + "chol": 233, + "fbs": 1, + "restecg": 2, + "thalach": 150, + "exang": 0, + "oldpeak": 2.3, + "slope": 3, + "ca": 0, + "thal": "fixed", +} + + +# Given the category (in the sample above - key) and the category value (in the sample above - value), +# we return its one-hot encoding +def get_cat_encoding(cat, cat_value): + # Create a list of zeros with the same length as categories + encoding = [0] * len(cat) + # Find the index of category_value in categories and set the corresponding position to 1 + if cat_value in cat: + encoding[cat.index(cat_value)] = 1 + return encoding + + +for name, value in sample.items(): + if name in CATEGORICAL_FEATURES_WITH_VOCABULARY: + sample.update( + { + name: get_cat_encoding( + CATEGORICAL_FEATURES_WITH_VOCABULARY[name], sample[name] + ) + } + ) +# Convert inputs to tensors +input_dict = {name: tf.convert_to_tensor([value]) for name, value in sample.items()} +predictions = model.predict(input_dict) + +print( + f"This particular patient had a {100 * predictions[0][0]:.1f} " + "percent probability of having a heart disease, " + "as evaluated by our model." +) + +""" +## Conclusions + +- The orignal model (the one that runs only on tensorflow) converges quickly to around 80% and remains +there for extended periods and at times hits 85% +- The updated model (the backed-agnostic) model may fluctuate between 78% and 83% and at times hitting 86% +validation accuracy and converges around 80% also. + +""" diff --git a/knowledge_base/structured_data/structured_data_classification_with_feature_space.py b/knowledge_base/structured_data/structured_data_classification_with_feature_space.py new file mode 100644 index 0000000000000000000000000000000000000000..4a521983891ee33b1df7cbb6452dcdb3599432fa --- /dev/null +++ b/knowledge_base/structured_data/structured_data_classification_with_feature_space.py @@ -0,0 +1,383 @@ +""" +Title: Structured data classification with FeatureSpace +Author: [fchollet](https://twitter.com/fchollet) +Date created: 2022/11/09 +Last modified: 2022/11/09 +Description: Classify tabular data in a few lines of code. +Accelerator: GPU +""" + +""" +## Introduction + +This example demonstrates how to do structured data classification +(also known as tabular data classification), starting from a raw +CSV file. Our data includes numerical features, +and integer categorical features, and string categorical features. +We will use the utility `keras.utils.FeatureSpace` to index, +preprocess, and encode our features. + +The code is adapted from the example +[Structured data classification from scratch](https://keras.io/examples/structured_data/structured_data_classification_from_scratch/). +While the previous example managed its own low-level feature preprocessing and +encoding with Keras preprocessing layers, in this example we +delegate everything to `FeatureSpace`, making the workflow +extremely quick and easy. 
+ +### The dataset + +[Our dataset](https://archive.ics.uci.edu/ml/datasets/heart+Disease) is provided by the +Cleveland Clinic Foundation for Heart Disease. +It's a CSV file with 303 rows. Each row contains information about a patient (a +**sample**), and each column describes an attribute of the patient (a **feature**). We +use the features to predict whether a patient has a heart disease +(**binary classification**). + +Here's the description of each feature: + +Column| Description| Feature Type +------------|--------------------|---------------------- +Age | Age in years | Numerical +Sex | (1 = male; 0 = female) | Categorical +CP | Chest pain type (0, 1, 2, 3, 4) | Categorical +Trestbpd | Resting blood pressure (in mm Hg on admission) | Numerical +Chol | Serum cholesterol in mg/dl | Numerical +FBS | fasting blood sugar in 120 mg/dl (1 = true; 0 = false) | Categorical +RestECG | Resting electrocardiogram results (0, 1, 2) | Categorical +Thalach | Maximum heart rate achieved | Numerical +Exang | Exercise induced angina (1 = yes; 0 = no) | Categorical +Oldpeak | ST depression induced by exercise relative to rest | Numerical +Slope | Slope of the peak exercise ST segment | Numerical +CA | Number of major vessels (0-3) colored by fluoroscopy | Both numerical & categorical +Thal | 3 = normal; 6 = fixed defect; 7 = reversible defect | Categorical +Target | Diagnosis of heart disease (1 = true; 0 = false) | Target +""" + +""" +## Setup +""" + +import os + +os.environ["KERAS_BACKEND"] = "tensorflow" + +import tensorflow as tf +import pandas as pd +import keras +from keras.utils import FeatureSpace + +""" +## Preparing the data + +Let's download the data and load it into a Pandas dataframe: +""" + +file_url = "http://storage.googleapis.com/download.tensorflow.org/data/heart.csv" +dataframe = pd.read_csv(file_url) + +""" +The dataset includes 303 samples with 14 columns per sample +(13 features, plus the target label): +""" + +print(dataframe.shape) + +""" +Here's a preview of a few samples: +""" + +dataframe.head() + +""" +The last column, "target", indicates whether the patient +has a heart disease (1) or not (0). + +Let's split the data into a training and validation set: +""" + +val_dataframe = dataframe.sample(frac=0.2, random_state=1337) +train_dataframe = dataframe.drop(val_dataframe.index) + +print( + "Using %d samples for training and %d for validation" + % (len(train_dataframe), len(val_dataframe)) +) + +""" +Let's generate `tf.data.Dataset` objects for each dataframe: +""" + + +def dataframe_to_dataset(dataframe): + dataframe = dataframe.copy() + labels = dataframe.pop("target") + ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels)) + ds = ds.shuffle(buffer_size=len(dataframe)) + return ds + + +train_ds = dataframe_to_dataset(train_dataframe) +val_ds = dataframe_to_dataset(val_dataframe) + +""" +Each `Dataset` yields a tuple `(input, target)` where `input` is a dictionary of features +and `target` is the value `0` or `1`: +""" + +for x, y in train_ds.take(1): + print("Input:", x) + print("Target:", y) + +""" +Let's batch the datasets: +""" + +train_ds = train_ds.batch(32) +val_ds = val_ds.batch(32) + +""" +## Configuring a `FeatureSpace` + +To configure how each feature should be preprocessed, +we instantiate a `keras.utils.FeatureSpace`, and we +pass to it a dictionary that maps the name of our features +to a string that describes the feature type. 
+ +We have a few "integer categorical" features such as `"FBS"`, +one "string categorical" feature (`"thal"`), +and a few numerical features, which we'd like to normalize +-- except `"age"`, which we'd like to discretize into +a number of bins. + +We also use the `crosses` argument +to capture *feature interactions* for some categorical +features, that is to say, create additional features +that represent value co-occurrences for these categorical features. +You can compute feature crosses like this for arbitrary sets of +categorical features -- not just tuples of two features. +Because the resulting co-occurences are hashed +into a fixed-sized vector, you don't need to worry about whether +the co-occurence space is too large. +""" + +feature_space = FeatureSpace( + features={ + # Categorical features encoded as integers + "sex": "integer_categorical", + "cp": "integer_categorical", + "fbs": "integer_categorical", + "restecg": "integer_categorical", + "exang": "integer_categorical", + "ca": "integer_categorical", + # Categorical feature encoded as string + "thal": "string_categorical", + # Numerical features to discretize + "age": "float_discretized", + # Numerical features to normalize + "trestbps": "float_normalized", + "chol": "float_normalized", + "thalach": "float_normalized", + "oldpeak": "float_normalized", + "slope": "float_normalized", + }, + # We create additional features by hashing + # value co-occurrences for the + # following groups of categorical features. + crosses=[("sex", "age"), ("thal", "ca")], + # The hashing space for these co-occurrences + # wil be 32-dimensional. + crossing_dim=32, + # Our utility will one-hot encode all categorical + # features and concat all features into a single + # vector (one vector per sample). + output_mode="concat", +) + +""" +## Further customizing a `FeatureSpace` + +Specifying the feature type via a string name is quick and easy, +but sometimes you may want to further configure the preprocessing +of each feature. For instance, in our case, our categorical +features don't have a large set of possible values -- it's only +a handful of values per feature (e.g. `1` and `0` for the feature `"FBS"`), +and all possible values are represented in the training set. +As a result, we don't need to reserve an index to represent "out of vocabulary" values +for these features -- which would have been the default behavior. +Below, we just specify `num_oov_indices=0` in each of these features +to tell the feature preprocessor to skip "out of vocabulary" indexing. + +Other customizations you have access to include specifying the number of +bins for discretizing features of type `"float_discretized"`, +or the dimensionality of the hashing space for feature crossing. 
+""" + +feature_space = FeatureSpace( + features={ + # Categorical features encoded as integers + "sex": FeatureSpace.integer_categorical(num_oov_indices=0), + "cp": FeatureSpace.integer_categorical(num_oov_indices=0), + "fbs": FeatureSpace.integer_categorical(num_oov_indices=0), + "restecg": FeatureSpace.integer_categorical(num_oov_indices=0), + "exang": FeatureSpace.integer_categorical(num_oov_indices=0), + "ca": FeatureSpace.integer_categorical(num_oov_indices=0), + # Categorical feature encoded as string + "thal": FeatureSpace.string_categorical(num_oov_indices=0), + # Numerical features to discretize + "age": FeatureSpace.float_discretized(num_bins=30), + # Numerical features to normalize + "trestbps": FeatureSpace.float_normalized(), + "chol": FeatureSpace.float_normalized(), + "thalach": FeatureSpace.float_normalized(), + "oldpeak": FeatureSpace.float_normalized(), + "slope": FeatureSpace.float_normalized(), + }, + # Specify feature cross with a custom crossing dim. + crosses=[ + FeatureSpace.cross(feature_names=("sex", "age"), crossing_dim=64), + FeatureSpace.cross( + feature_names=("thal", "ca"), + crossing_dim=16, + ), + ], + output_mode="concat", +) + +""" +## Adapt the `FeatureSpace` to the training data + +Before we start using the `FeatureSpace` to build a model, we have +to adapt it to the training data. During `adapt()`, the `FeatureSpace` will: + +- Index the set of possible values for categorical features. +- Compute the mean and variance for numerical features to normalize. +- Compute the value boundaries for the different bins for numerical features to discretize. + +Note that `adapt()` should be called on a `tf.data.Dataset` which yields dicts +of feature values -- no labels. +""" + +train_ds_with_no_labels = train_ds.map(lambda x, _: x) +feature_space.adapt(train_ds_with_no_labels) + +""" +At this point, the `FeatureSpace` can be called on a dict of raw feature values, and will return a +single concatenate vector for each sample, combining encoded features and feature crosses. +""" + +for x, _ in train_ds.take(1): + preprocessed_x = feature_space(x) + print("preprocessed_x.shape:", preprocessed_x.shape) + print("preprocessed_x.dtype:", preprocessed_x.dtype) + +""" +## Two ways to manage preprocessing: as part of the `tf.data` pipeline, or in the model itself + +There are two ways in which you can leverage your `FeatureSpace`: + +### Asynchronous preprocessing in `tf.data` + +You can make it part of your data pipeline, before the model. This enables asynchronous parallel +preprocessing of the data on CPU before it hits the model. Do this if you're training on GPU or TPU, +or if you want to speed up preprocessing. Usually, this is always the right thing to do during training. + +### Synchronous preprocessing in the model + +You can make it part of your model. This means that the model will expect dicts of raw feature +values, and the preprocessing batch will be done synchronously (in a blocking manner) before the +rest of the forward pass. Do this if you want to have an end-to-end model that can process +raw feature values -- but keep in mind that your model will only be able to run on CPU, +since most types of feature preprocessing (e.g. string preprocessing) are not GPU or TPU compatible. + +Do not do this on GPU / TPU or in performance-sensitive settings. In general, you want to do in-model +preprocessing when you do inference on CPU. 
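+
+Schematically, the two options look as follows -- this is just a sketch, and both
+variants are built for real later in this example:
+
+```python
+# Option 1: asynchronous preprocessing in the tf.data pipeline (used for training below).
+preprocessed_ds = train_ds.map(lambda x, y: (feature_space(x), y))
+
+# Option 2: synchronous preprocessing inside the model (used for inference below).
+dict_inputs = feature_space.get_inputs()
+encoded_features = feature_space.get_encoded_features()
+outputs = keras.layers.Dense(1, activation="sigmoid")(encoded_features)
+end_to_end_model = keras.Model(dict_inputs, outputs)
+```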
+ +In our case, we will apply the `FeatureSpace` in the tf.data pipeline during training, but we will +do inference with an end-to-end model that includes the `FeatureSpace`. +""" + +""" +Let's create a training and validation dataset of preprocessed batches: +""" + +preprocessed_train_ds = train_ds.map( + lambda x, y: (feature_space(x), y), num_parallel_calls=tf.data.AUTOTUNE +) +preprocessed_train_ds = preprocessed_train_ds.prefetch(tf.data.AUTOTUNE) + +preprocessed_val_ds = val_ds.map( + lambda x, y: (feature_space(x), y), num_parallel_calls=tf.data.AUTOTUNE +) +preprocessed_val_ds = preprocessed_val_ds.prefetch(tf.data.AUTOTUNE) + +""" +## Build a model + +Time to build a model -- or rather two models: + +- A training model that expects preprocessed features (one sample = one vector) +- An inference model that expects raw features (one sample = dict of raw feature values) +""" + +dict_inputs = feature_space.get_inputs() +encoded_features = feature_space.get_encoded_features() + +x = keras.layers.Dense(32, activation="relu")(encoded_features) +x = keras.layers.Dropout(0.5)(x) +predictions = keras.layers.Dense(1, activation="sigmoid")(x) + +training_model = keras.Model(inputs=encoded_features, outputs=predictions) +training_model.compile( + optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"] +) + +inference_model = keras.Model(inputs=dict_inputs, outputs=predictions) + +""" +## Train the model + +Let's train our model for 50 epochs. Note that feature preprocessing is happening +as part of the tf.data pipeline, not as part of the model. +""" + +training_model.fit( + preprocessed_train_ds, + epochs=20, + validation_data=preprocessed_val_ds, + verbose=2, +) + +""" +We quickly get to 80% validation accuracy. +""" + +""" +## Inference on new data with the end-to-end model + +Now, we can use our inference model (which includes the `FeatureSpace`) +to make predictions based on dicts of raw features values, as follows: +""" + +sample = { + "age": 60, + "sex": 1, + "cp": 1, + "trestbps": 145, + "chol": 233, + "fbs": 1, + "restecg": 2, + "thalach": 150, + "exang": 0, + "oldpeak": 2.3, + "slope": 3, + "ca": 0, + "thal": "fixed", +} + +input_dict = {name: tf.convert_to_tensor([value]) for name, value in sample.items()} +predictions = inference_model.predict(input_dict) + +print( + f"This particular patient had a {100 * predictions[0][0]:.2f}% probability " + "of having a heart disease, as evaluated by our model." +) diff --git a/knowledge_base/structured_data/tabtransformer.py b/knowledge_base/structured_data/tabtransformer.py new file mode 100644 index 0000000000000000000000000000000000000000..304bc442591b3cbb359d1f105f15e42ecb9f76ff --- /dev/null +++ b/knowledge_base/structured_data/tabtransformer.py @@ -0,0 +1,569 @@ +""" +Title: Structured data learning with TabTransformer +Author: [Khalid Salama](https://www.linkedin.com/in/khalid-salama-24403144/) +Date created: 2022/01/18 +Last modified: 2022/01/18 +Description: Using contextual embeddings for structured data classification. +Accelerator: GPU +""" + +""" +## Introduction + +This example demonstrates how to do structured data classification using +[TabTransformer](https://arxiv.org/abs/2012.06678), a deep tabular data modeling +architecture for supervised and semi-supervised learning. +The TabTransformer is built upon self-attention based Transformers. +The Transformer layers transform the embeddings of categorical features +into robust contextual embeddings to achieve higher predictive accuracy. 
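+
+As a toy illustration of this idea (with made-up shapes and hyperparameters, not the
+configuration used below), contextual embeddings can be obtained by letting each embedded
+categorical column attend to all the other columns of the same sample:
+
+```python
+import keras
+from keras import layers
+
+# 32 samples, 9 categorical columns, each already embedded into 16 dimensions.
+column_embeddings = keras.random.normal((32, 9, 16))
+attention = layers.MultiHeadAttention(num_heads=4, key_dim=16)
+contextual_embeddings = attention(column_embeddings, column_embeddings)
+print(contextual_embeddings.shape)  # (32, 9, 16) -- now context-aware
+```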
+ + + +## Setup +""" +import keras +from keras import layers +from keras import ops + +import math +import numpy as np +import pandas as pd +from tensorflow import data as tf_data +import matplotlib.pyplot as plt +from functools import partial + +""" +## Prepare the data + +This example uses the +[United States Census Income Dataset](https://archive.ics.uci.edu/ml/datasets/census+income) +provided by the +[UC Irvine Machine Learning Repository](https://archive.ics.uci.edu/ml/index.php). +The task is binary classification +to predict whether a person is likely to be making over USD 50,000 a year. + +The dataset includes 48,842 instances with 14 input features: 5 numerical features and 9 categorical features. + +First, let's load the dataset from the UCI Machine Learning Repository into a Pandas +DataFrame: +""" + +CSV_HEADER = [ + "age", + "workclass", + "fnlwgt", + "education", + "education_num", + "marital_status", + "occupation", + "relationship", + "race", + "gender", + "capital_gain", + "capital_loss", + "hours_per_week", + "native_country", + "income_bracket", +] + +train_data_url = ( + "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.data" +) +train_data = pd.read_csv(train_data_url, header=None, names=CSV_HEADER) + +test_data_url = ( + "https://archive.ics.uci.edu/ml/machine-learning-databases/adult/adult.test" +) +test_data = pd.read_csv(test_data_url, header=None, names=CSV_HEADER) + +print(f"Train dataset shape: {train_data.shape}") +print(f"Test dataset shape: {test_data.shape}") + +""" +Remove the first record (because it is not a valid data example) and a trailing 'dot' in the class labels. +""" + +test_data = test_data[1:] +test_data.income_bracket = test_data.income_bracket.apply( + lambda value: value.replace(".", "") +) + +""" +Now we store the training and test data in separate CSV files. +""" + +train_data_file = "train_data.csv" +test_data_file = "test_data.csv" + +train_data.to_csv(train_data_file, index=False, header=False) +test_data.to_csv(test_data_file, index=False, header=False) + +""" +## Define dataset metadata + +Here, we define the metadata of the dataset that will be useful for reading and parsing +the data into input features, and encoding the input features with respect to their types. +""" + +# A list of the numerical feature names. +NUMERIC_FEATURE_NAMES = [ + "age", + "education_num", + "capital_gain", + "capital_loss", + "hours_per_week", +] +# A dictionary of the categorical features and their vocabulary. +CATEGORICAL_FEATURES_WITH_VOCABULARY = { + "workclass": sorted(list(train_data["workclass"].unique())), + "education": sorted(list(train_data["education"].unique())), + "marital_status": sorted(list(train_data["marital_status"].unique())), + "occupation": sorted(list(train_data["occupation"].unique())), + "relationship": sorted(list(train_data["relationship"].unique())), + "race": sorted(list(train_data["race"].unique())), + "gender": sorted(list(train_data["gender"].unique())), + "native_country": sorted(list(train_data["native_country"].unique())), +} +# Name of the column to be used as instances weight. +WEIGHT_COLUMN_NAME = "fnlwgt" +# A list of the categorical feature names. +CATEGORICAL_FEATURE_NAMES = list(CATEGORICAL_FEATURES_WITH_VOCABULARY.keys()) +# A list of all the input features. +FEATURE_NAMES = NUMERIC_FEATURE_NAMES + CATEGORICAL_FEATURE_NAMES +# A list of column default values for each feature. 
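+# Numeric features (and the weight column) default to 0.0; categorical string columns default to "NA".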
+COLUMN_DEFAULTS = [ + [0.0] if feature_name in NUMERIC_FEATURE_NAMES + [WEIGHT_COLUMN_NAME] else ["NA"] + for feature_name in CSV_HEADER +] +# The name of the target feature. +TARGET_FEATURE_NAME = "income_bracket" +# A list of the labels of the target features. +TARGET_LABELS = [" <=50K", " >50K"] + +""" +## Configure the hyperparameters + +The hyperparameters includes model architecture and training configurations. +""" + +LEARNING_RATE = 0.001 +WEIGHT_DECAY = 0.0001 +DROPOUT_RATE = 0.2 +BATCH_SIZE = 265 +NUM_EPOCHS = 15 + +NUM_TRANSFORMER_BLOCKS = 3 # Number of transformer blocks. +NUM_HEADS = 4 # Number of attention heads. +EMBEDDING_DIMS = 16 # Embedding dimensions of the categorical features. +MLP_HIDDEN_UNITS_FACTORS = [ + 2, + 1, +] # MLP hidden layer units, as factors of the number of inputs. +NUM_MLP_BLOCKS = 2 # Number of MLP blocks in the baseline model. + +""" +## Implement data reading pipeline + +We define an input function that reads and parses the file, then converts features +and labels into a[`tf.data.Dataset`](https://www.tensorflow.org/guide/datasets) +for training or evaluation. +""" + +target_label_lookup = layers.StringLookup( + vocabulary=TARGET_LABELS, mask_token=None, num_oov_indices=0 +) + + +def prepare_example(features, target): + target_index = target_label_lookup(target) + weights = features.pop(WEIGHT_COLUMN_NAME) + return features, target_index, weights + + +lookup_dict = {} +for feature_name in CATEGORICAL_FEATURE_NAMES: + vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name] + # Create a lookup to convert a string values to an integer indices. + # Since we are not using a mask token, nor expecting any out of vocabulary + # (oov) token, we set mask_token to None and num_oov_indices to 0. + lookup = layers.StringLookup( + vocabulary=vocabulary, mask_token=None, num_oov_indices=0 + ) + lookup_dict[feature_name] = lookup + + +def encode_categorical(batch_x, batch_y, weights): + for feature_name in CATEGORICAL_FEATURE_NAMES: + batch_x[feature_name] = lookup_dict[feature_name](batch_x[feature_name]) + + return batch_x, batch_y, weights + + +def get_dataset_from_csv(csv_file_path, batch_size=128, shuffle=False): + dataset = ( + tf_data.experimental.make_csv_dataset( + csv_file_path, + batch_size=batch_size, + column_names=CSV_HEADER, + column_defaults=COLUMN_DEFAULTS, + label_name=TARGET_FEATURE_NAME, + num_epochs=1, + header=False, + na_value="?", + shuffle=shuffle, + ) + .map(prepare_example, num_parallel_calls=tf_data.AUTOTUNE, deterministic=False) + .map(encode_categorical) + ) + return dataset.cache() + + +""" +## Implement a training and evaluation procedure +""" + + +def run_experiment( + model, + train_data_file, + test_data_file, + num_epochs, + learning_rate, + weight_decay, + batch_size, +): + optimizer = keras.optimizers.AdamW( + learning_rate=learning_rate, weight_decay=weight_decay + ) + + model.compile( + optimizer=optimizer, + loss=keras.losses.BinaryCrossentropy(), + metrics=[keras.metrics.BinaryAccuracy(name="accuracy")], + ) + + train_dataset = get_dataset_from_csv(train_data_file, batch_size, shuffle=True) + validation_dataset = get_dataset_from_csv(test_data_file, batch_size) + + print("Start training the model...") + history = model.fit( + train_dataset, epochs=num_epochs, validation_data=validation_dataset + ) + print("Model training finished") + + _, accuracy = model.evaluate(validation_dataset, verbose=0) + + print(f"Validation accuracy: {round(accuracy * 100, 2)}%") + + return history + + +""" +## Create model inputs + +Now, 
define the inputs for the models as a dictionary, where the key is the feature name, +and the value is a `keras.layers.Input` tensor with the corresponding feature shape +and data type. +""" + + +def create_model_inputs(): + inputs = {} + for feature_name in FEATURE_NAMES: + if feature_name in NUMERIC_FEATURE_NAMES: + inputs[feature_name] = layers.Input( + name=feature_name, shape=(), dtype="float32" + ) + else: + inputs[feature_name] = layers.Input( + name=feature_name, shape=(), dtype="int32" + ) + return inputs + + +""" +## Encode features + +The `encode_inputs` method returns `encoded_categorical_feature_list` and `numerical_feature_list`. +We encode the categorical features as embeddings, using a fixed `embedding_dims` for all the features, +regardless their vocabulary sizes. This is required for the Transformer model. +""" + + +def encode_inputs(inputs, embedding_dims): + encoded_categorical_feature_list = [] + numerical_feature_list = [] + + for feature_name in inputs: + if feature_name in CATEGORICAL_FEATURE_NAMES: + vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name] + # Create a lookup to convert a string values to an integer indices. + # Since we are not using a mask token, nor expecting any out of vocabulary + # (oov) token, we set mask_token to None and num_oov_indices to 0. + + # Convert the string input values into integer indices. + + # Create an embedding layer with the specified dimensions. + embedding = layers.Embedding( + input_dim=len(vocabulary), output_dim=embedding_dims + ) + + # Convert the index values to embedding representations. + encoded_categorical_feature = embedding(inputs[feature_name]) + encoded_categorical_feature_list.append(encoded_categorical_feature) + + else: + # Use the numerical features as-is. + numerical_feature = ops.expand_dims(inputs[feature_name], -1) + numerical_feature_list.append(numerical_feature) + + return encoded_categorical_feature_list, numerical_feature_list + + +""" +## Implement an MLP block +""" + + +def create_mlp(hidden_units, dropout_rate, activation, normalization_layer, name=None): + mlp_layers = [] + for units in hidden_units: + mlp_layers.append(normalization_layer()) + mlp_layers.append(layers.Dense(units, activation=activation)) + mlp_layers.append(layers.Dropout(dropout_rate)) + + return keras.Sequential(mlp_layers, name=name) + + +""" +## Experiment 1: a baseline model + +In the first experiment, we create a simple multi-layer feed-forward network. +""" + + +def create_baseline_model( + embedding_dims, num_mlp_blocks, mlp_hidden_units_factors, dropout_rate +): + # Create model inputs. + inputs = create_model_inputs() + # encode features. + encoded_categorical_feature_list, numerical_feature_list = encode_inputs( + inputs, embedding_dims + ) + # Concatenate all features. + features = layers.concatenate( + encoded_categorical_feature_list + numerical_feature_list + ) + # Compute Feedforward layer units. + feedforward_units = [features.shape[-1]] + + # Create several feedforwad layers with skip connections. + for layer_idx in range(num_mlp_blocks): + features = create_mlp( + hidden_units=feedforward_units, + dropout_rate=dropout_rate, + activation=keras.activations.gelu, + normalization_layer=layers.LayerNormalization, + name=f"feedforward_{layer_idx}", + )(features) + + # Compute MLP hidden_units. + mlp_hidden_units = [ + factor * features.shape[-1] for factor in mlp_hidden_units_factors + ] + # Create final MLP. 
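+    # With MLP_HIDDEN_UNITS_FACTORS = [2, 1], this adds two Dense blocks of 2x and 1x
+    # the width of the concatenated feature vector.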
+ features = create_mlp( + hidden_units=mlp_hidden_units, + dropout_rate=dropout_rate, + activation=keras.activations.selu, + normalization_layer=layers.BatchNormalization, + name="MLP", + )(features) + + # Add a sigmoid as a binary classifer. + outputs = layers.Dense(units=1, activation="sigmoid", name="sigmoid")(features) + model = keras.Model(inputs=inputs, outputs=outputs) + return model + + +baseline_model = create_baseline_model( + embedding_dims=EMBEDDING_DIMS, + num_mlp_blocks=NUM_MLP_BLOCKS, + mlp_hidden_units_factors=MLP_HIDDEN_UNITS_FACTORS, + dropout_rate=DROPOUT_RATE, +) + +print("Total model weights:", baseline_model.count_params()) +keras.utils.plot_model(baseline_model, show_shapes=True, rankdir="LR") + +""" +Let's train and evaluate the baseline model: +""" + +history = run_experiment( + model=baseline_model, + train_data_file=train_data_file, + test_data_file=test_data_file, + num_epochs=NUM_EPOCHS, + learning_rate=LEARNING_RATE, + weight_decay=WEIGHT_DECAY, + batch_size=BATCH_SIZE, +) + +""" +The baseline linear model achieves ~81% validation accuracy. +""" + +""" +## Experiment 2: TabTransformer + +The TabTransformer architecture works as follows: + +1. All the categorical features are encoded as embeddings, using the same `embedding_dims`. +This means that each value in each categorical feature will have its own embedding vector. +2. A column embedding, one embedding vector for each categorical feature, is added (point-wise) to the categorical feature embedding. +3. The embedded categorical features are fed into a stack of Transformer blocks. +Each Transformer block consists of a multi-head self-attention layer followed by a feed-forward layer. +3. The outputs of the final Transformer layer, which are the *contextual embeddings* of the categorical features, +are concatenated with the input numerical features, and fed into a final MLP block. +4. A `softmax` classifer is applied at the end of the model. + +The [paper](https://arxiv.org/abs/2012.06678) discusses both addition and concatenation of the column embedding in the +*Appendix: Experiment and Model Details* section. +The architecture of TabTransformer is shown below, as presented in the paper. + + +""" + + +def create_tabtransformer_classifier( + num_transformer_blocks, + num_heads, + embedding_dims, + mlp_hidden_units_factors, + dropout_rate, + use_column_embedding=False, +): + # Create model inputs. + inputs = create_model_inputs() + # encode features. + encoded_categorical_feature_list, numerical_feature_list = encode_inputs( + inputs, embedding_dims + ) + # Stack categorical feature embeddings for the Tansformer. + encoded_categorical_features = ops.stack(encoded_categorical_feature_list, axis=1) + # Concatenate numerical features. + numerical_features = layers.concatenate(numerical_feature_list) + + # Add column embedding to categorical feature embeddings. + if use_column_embedding: + num_columns = encoded_categorical_features.shape[1] + column_embedding = layers.Embedding( + input_dim=num_columns, output_dim=embedding_dims + ) + column_indices = ops.arange(start=0, stop=num_columns, step=1) + encoded_categorical_features = encoded_categorical_features + column_embedding( + column_indices + ) + + # Create multiple layers of the Transformer block. + for block_idx in range(num_transformer_blocks): + # Create a multi-head attention layer. 
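+        # Self-attention runs across the categorical-feature axis: each column embedding
+        # attends to every other categorical column of the same sample.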
+ attention_output = layers.MultiHeadAttention( + num_heads=num_heads, + key_dim=embedding_dims, + dropout=dropout_rate, + name=f"multihead_attention_{block_idx}", + )(encoded_categorical_features, encoded_categorical_features) + # Skip connection 1. + x = layers.Add(name=f"skip_connection1_{block_idx}")( + [attention_output, encoded_categorical_features] + ) + # Layer normalization 1. + x = layers.LayerNormalization(name=f"layer_norm1_{block_idx}", epsilon=1e-6)(x) + # Feedforward. + feedforward_output = create_mlp( + hidden_units=[embedding_dims], + dropout_rate=dropout_rate, + activation=keras.activations.gelu, + normalization_layer=partial( + layers.LayerNormalization, epsilon=1e-6 + ), # using partial to provide keyword arguments before initialization + name=f"feedforward_{block_idx}", + )(x) + # Skip connection 2. + x = layers.Add(name=f"skip_connection2_{block_idx}")([feedforward_output, x]) + # Layer normalization 2. + encoded_categorical_features = layers.LayerNormalization( + name=f"layer_norm2_{block_idx}", epsilon=1e-6 + )(x) + + # Flatten the "contextualized" embeddings of the categorical features. + categorical_features = layers.Flatten()(encoded_categorical_features) + # Apply layer normalization to the numerical features. + numerical_features = layers.LayerNormalization(epsilon=1e-6)(numerical_features) + # Prepare the input for the final MLP block. + features = layers.concatenate([categorical_features, numerical_features]) + + # Compute MLP hidden_units. + mlp_hidden_units = [ + factor * features.shape[-1] for factor in mlp_hidden_units_factors + ] + # Create final MLP. + features = create_mlp( + hidden_units=mlp_hidden_units, + dropout_rate=dropout_rate, + activation=keras.activations.selu, + normalization_layer=layers.BatchNormalization, + name="MLP", + )(features) + + # Add a sigmoid as a binary classifer. + outputs = layers.Dense(units=1, activation="sigmoid", name="sigmoid")(features) + model = keras.Model(inputs=inputs, outputs=outputs) + return model + + +tabtransformer_model = create_tabtransformer_classifier( + num_transformer_blocks=NUM_TRANSFORMER_BLOCKS, + num_heads=NUM_HEADS, + embedding_dims=EMBEDDING_DIMS, + mlp_hidden_units_factors=MLP_HIDDEN_UNITS_FACTORS, + dropout_rate=DROPOUT_RATE, +) + +print("Total model weights:", tabtransformer_model.count_params()) +keras.utils.plot_model(tabtransformer_model, show_shapes=True, rankdir="LR") + +""" +Let's train and evaluate the TabTransformer model: +""" + +history = run_experiment( + model=tabtransformer_model, + train_data_file=train_data_file, + test_data_file=test_data_file, + num_epochs=NUM_EPOCHS, + learning_rate=LEARNING_RATE, + weight_decay=WEIGHT_DECAY, + batch_size=BATCH_SIZE, +) + +""" +The TabTransformer model achieves ~85% validation accuracy. +Note that, with the default parameter configurations, both the baseline and the TabTransformer +have similar number of trainable weights: 109,895 and 87,745 respectively, and both use the same training hyperparameters. +""" + +""" +## Conclusion + +TabTransformer significantly outperforms MLP and recent +deep networks for tabular data while matching the performance of tree-based ensemble models. +TabTransformer can be learned in end-to-end supervised training using labeled examples. +For a scenario where there are a few labeled examples and a large number of unlabeled +examples, a pre-training procedure can be employed to train the Transformer layers using unlabeled data. 
+This is followed by fine-tuning of the pre-trained Transformer layers along with +the top MLP layer using the labeled data. +""" diff --git a/knowledge_base/structured_data/wide_deep_cross_networks.py b/knowledge_base/structured_data/wide_deep_cross_networks.py new file mode 100644 index 0000000000000000000000000000000000000000..36ee4149eee0d153f584902f42b9885c6e60bda5 --- /dev/null +++ b/knowledge_base/structured_data/wide_deep_cross_networks.py @@ -0,0 +1,450 @@ +""" +Title: Structured data learning with Wide, Deep, and Cross networks +Author: [Khalid Salama](https://www.linkedin.com/in/khalid-salama-24403144/) +Date created: 2020/12/31 +Last modified: 2025/01/03 +Description: Using Wide & Deep and Deep & Cross networks for structured data classification. +Accelerator: GPU +""" + +""" +## Introduction + +This example demonstrates how to do structured data classification using the two modeling +techniques: + +1. [Wide & Deep](https://ai.googleblog.com/2016/06/wide-deep-learning-better-together-with.html) models +2. [Deep & Cross](https://arxiv.org/abs/1708.05123) models + +Note that this example should be run with TensorFlow 2.5 or higher. +""" + +""" +## The dataset + +This example uses the [Covertype](https://archive.ics.uci.edu/ml/datasets/covertype) dataset from the UCI +Machine Learning Repository. The task is to predict forest cover type from cartographic variables. +The dataset includes 506,011 instances with 12 input features: 10 numerical features and 2 +categorical features. Each instance is categorized into 1 of 7 classes. +""" + +""" +## Setup +""" + +import os + +# Only the TensorFlow backend supports string inputs. +os.environ["KERAS_BACKEND"] = "tensorflow" + +import math +import numpy as np +import pandas as pd +from tensorflow import data as tf_data +import keras +from keras import layers + +""" +## Prepare the data + +First, let's load the dataset from the UCI Machine Learning Repository into a Pandas +DataFrame: +""" + +data_url = ( + "https://archive.ics.uci.edu/ml/machine-learning-databases/covtype/covtype.data.gz" +) +raw_data = pd.read_csv(data_url, header=None) +print(f"Dataset shape: {raw_data.shape}") +raw_data.head() + +""" +The two categorical features in the dataset are binary-encoded. +We will convert this dataset representation to the typical representation, where each +categorical feature is represented as a single integer value. +""" + +soil_type_values = [f"soil_type_{idx+1}" for idx in range(40)] +wilderness_area_values = [f"area_type_{idx+1}" for idx in range(4)] + +soil_type = raw_data.loc[:, 14:53].apply( + lambda x: soil_type_values[0::1][x.to_numpy().nonzero()[0][0]], axis=1 +) +wilderness_area = raw_data.loc[:, 10:13].apply( + lambda x: wilderness_area_values[0::1][x.to_numpy().nonzero()[0][0]], axis=1 +) + +CSV_HEADER = [ + "Elevation", + "Aspect", + "Slope", + "Horizontal_Distance_To_Hydrology", + "Vertical_Distance_To_Hydrology", + "Horizontal_Distance_To_Roadways", + "Hillshade_9am", + "Hillshade_Noon", + "Hillshade_3pm", + "Horizontal_Distance_To_Fire_Points", + "Wilderness_Area", + "Soil_Type", + "Cover_Type", +] + +data = pd.concat( + [raw_data.loc[:, 0:9], wilderness_area, soil_type, raw_data.loc[:, 54]], + axis=1, + ignore_index=True, +) +data.columns = CSV_HEADER + +# Convert the target label indices into a range from 0 to 6 (there are 7 labels in total). 
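+# The raw labels are 1-7; shifting them to 0-6 matches what SparseCategoricalCrossentropy expects.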
+data["Cover_Type"] = data["Cover_Type"] - 1 + +print(f"Dataset shape: {data.shape}") +data.head().T + +""" +The shape of the DataFrame shows there are 13 columns per sample +(12 for the features and 1 for the target label). + +Let's split the data into training (85%) and test (15%) sets. +""" + +train_splits = [] +test_splits = [] + +for _, group_data in data.groupby("Cover_Type"): + random_selection = np.random.rand(len(group_data.index)) <= 0.85 + train_splits.append(group_data[random_selection]) + test_splits.append(group_data[~random_selection]) + +train_data = pd.concat(train_splits).sample(frac=1).reset_index(drop=True) +test_data = pd.concat(test_splits).sample(frac=1).reset_index(drop=True) + +print(f"Train split size: {len(train_data.index)}") +print(f"Test split size: {len(test_data.index)}") + +""" +Next, store the training and test data in separate CSV files. +""" + +train_data_file = "train_data.csv" +test_data_file = "test_data.csv" + +train_data.to_csv(train_data_file, index=False) +test_data.to_csv(test_data_file, index=False) + +""" +## Define dataset metadata + +Here, we define the metadata of the dataset that will be useful for reading and parsing +the data into input features, and encoding the input features with respect to their types. +""" + +TARGET_FEATURE_NAME = "Cover_Type" + +TARGET_FEATURE_LABELS = ["0", "1", "2", "3", "4", "5", "6"] + +NUMERIC_FEATURE_NAMES = [ + "Aspect", + "Elevation", + "Hillshade_3pm", + "Hillshade_9am", + "Hillshade_Noon", + "Horizontal_Distance_To_Fire_Points", + "Horizontal_Distance_To_Hydrology", + "Horizontal_Distance_To_Roadways", + "Slope", + "Vertical_Distance_To_Hydrology", +] + +CATEGORICAL_FEATURES_WITH_VOCABULARY = { + "Soil_Type": list(data["Soil_Type"].unique()), + "Wilderness_Area": list(data["Wilderness_Area"].unique()), +} + +CATEGORICAL_FEATURE_NAMES = list(CATEGORICAL_FEATURES_WITH_VOCABULARY.keys()) + +FEATURE_NAMES = NUMERIC_FEATURE_NAMES + CATEGORICAL_FEATURE_NAMES + +COLUMN_DEFAULTS = [ + [0] if feature_name in NUMERIC_FEATURE_NAMES + [TARGET_FEATURE_NAME] else ["NA"] + for feature_name in CSV_HEADER +] + +NUM_CLASSES = len(TARGET_FEATURE_LABELS) + +""" +## Experiment setup + +Next, let's define an input function that reads and parses the file, then converts features +and labels into a[`tf.data.Dataset`](https://www.tensorflow.org/guide/datasets) +for training or evaluation. +""" + + +# To convert the datasets elements to from OrderedDict to Dictionary +def process(features, target): + return dict(features), target + + +def get_dataset_from_csv(csv_file_path, batch_size, shuffle=False): + dataset = tf_data.experimental.make_csv_dataset( + csv_file_path, + batch_size=batch_size, + column_names=CSV_HEADER, + column_defaults=COLUMN_DEFAULTS, + label_name=TARGET_FEATURE_NAME, + num_epochs=1, + header=True, + shuffle=shuffle, + ).map(process) + return dataset.cache() + + +""" +Here we configure the parameters and implement the procedure for running a training and +evaluation experiment given a model. 
+""" + +learning_rate = 0.001 +dropout_rate = 0.1 +batch_size = 265 +num_epochs = 1 + +hidden_units = [32, 32] + + +def run_experiment(model): + model.compile( + optimizer=keras.optimizers.Adam(learning_rate=learning_rate), + loss=keras.losses.SparseCategoricalCrossentropy(), + metrics=[keras.metrics.SparseCategoricalAccuracy()], + ) + + train_dataset = get_dataset_from_csv(train_data_file, batch_size, shuffle=True) + + test_dataset = get_dataset_from_csv(test_data_file, batch_size) + + print("Start training the model...") + history = model.fit(train_dataset, epochs=num_epochs) + print("Model training finished") + + _, accuracy = model.evaluate(test_dataset, verbose=0) + + print(f"Test accuracy: {round(accuracy * 100, 2)}%") + + +""" +## Create model inputs + +Now, define the inputs for the models as a dictionary, where the key is the feature name, +and the value is a `keras.layers.Input` tensor with the corresponding feature shape +and data type. +""" + + +def create_model_inputs(): + inputs = {} + for feature_name in FEATURE_NAMES: + if feature_name in NUMERIC_FEATURE_NAMES: + inputs[feature_name] = layers.Input( + name=feature_name, shape=(), dtype="float32" + ) + else: + inputs[feature_name] = layers.Input( + name=feature_name, shape=(), dtype="string" + ) + return inputs + + +""" +## Encode features + +We create two representations of our input features: sparse and dense: +1. In the **sparse** representation, the categorical features are encoded with one-hot +encoding using the `CategoryEncoding` layer. This representation can be useful for the +model to *memorize* particular feature values to make certain predictions. +2. In the **dense** representation, the categorical features are encoded with +low-dimensional embeddings using the `Embedding` layer. This representation helps +the model to *generalize* well to unseen feature combinations. +""" + + +def encode_inputs(inputs, use_embedding=False): + encoded_features = [] + for feature_name in inputs: + if feature_name in CATEGORICAL_FEATURE_NAMES: + vocabulary = CATEGORICAL_FEATURES_WITH_VOCABULARY[feature_name] + # Create a lookup to convert string values to an integer indices. + # Since we are not using a mask token nor expecting any out of vocabulary + # (oov) token, we set mask_token to None and num_oov_indices to 0. + lookup = layers.StringLookup( + vocabulary=vocabulary, + mask_token=None, + num_oov_indices=0, + output_mode="int" if use_embedding else "binary", + ) + if use_embedding: + # Convert the string input values into integer indices. + encoded_feature = lookup(inputs[feature_name]) + embedding_dims = int(math.sqrt(len(vocabulary))) + # Create an embedding layer with the specified dimensions. + embedding = layers.Embedding( + input_dim=len(vocabulary), output_dim=embedding_dims + ) + # Convert the index values to embedding representations. + encoded_feature = embedding(encoded_feature) + else: + # Convert the string input values into a one hot encoding. + encoded_feature = lookup( + keras.ops.expand_dims(inputs[feature_name], -1) + ) + else: + # Use the numerical features as-is. + encoded_feature = keras.ops.expand_dims(inputs[feature_name], -1) + + encoded_features.append(encoded_feature) + + all_features = layers.concatenate(encoded_features) + return all_features + + +""" +## Experiment 1: a baseline model + +In the first experiment, let's create a multi-layer feed-forward network, +where the categorical features are one-hot encoded. 
+""" + + +def create_baseline_model(): + inputs = create_model_inputs() + features = encode_inputs(inputs) + + for units in hidden_units: + features = layers.Dense(units)(features) + features = layers.BatchNormalization()(features) + features = layers.ReLU()(features) + features = layers.Dropout(dropout_rate)(features) + + outputs = layers.Dense(units=NUM_CLASSES, activation="softmax")(features) + model = keras.Model(inputs=inputs, outputs=outputs) + return model + + +baseline_model = create_baseline_model() +keras.utils.plot_model(baseline_model, show_shapes=True, rankdir="LR") + +""" +Let's run it: +""" + +run_experiment(baseline_model) + +""" +The baseline linear model achieves ~76% test accuracy. +""" + +""" +## Experiment 2: Wide & Deep model + +In the second experiment, we create a Wide & Deep model. The wide part of the model +a linear model, while the deep part of the model is a multi-layer feed-forward network. + +Use the sparse representation of the input features in the wide part of the model and the +dense representation of the input features for the deep part of the model. + +Note that every input features contributes to both parts of the model with different +representations. +""" + + +def create_wide_and_deep_model(): + inputs = create_model_inputs() + wide = encode_inputs(inputs) + wide = layers.BatchNormalization()(wide) + + deep = encode_inputs(inputs, use_embedding=True) + for units in hidden_units: + deep = layers.Dense(units)(deep) + deep = layers.BatchNormalization()(deep) + deep = layers.ReLU()(deep) + deep = layers.Dropout(dropout_rate)(deep) + + merged = layers.concatenate([wide, deep]) + outputs = layers.Dense(units=NUM_CLASSES, activation="softmax")(merged) + model = keras.Model(inputs=inputs, outputs=outputs) + return model + + +wide_and_deep_model = create_wide_and_deep_model() +keras.utils.plot_model(wide_and_deep_model, show_shapes=True, rankdir="LR") + +""" +Let's run it: +""" + +run_experiment(wide_and_deep_model) + +""" +The wide and deep model achieves ~79% test accuracy. +""" + +""" +## Experiment 3: Deep & Cross model + +In the third experiment, we create a Deep & Cross model. The deep part of this model +is the same as the deep part created in the previous experiment. The key idea of +the cross part is to apply explicit feature crossing in an efficient way, +where the degree of cross features grows with layer depth. +""" + + +def create_deep_and_cross_model(): + inputs = create_model_inputs() + x0 = encode_inputs(inputs, use_embedding=True) + + cross = x0 + for _ in hidden_units: + units = cross.shape[-1] + x = layers.Dense(units)(cross) + cross = x0 * x + cross + cross = layers.BatchNormalization()(cross) + + deep = x0 + for units in hidden_units: + deep = layers.Dense(units)(deep) + deep = layers.BatchNormalization()(deep) + deep = layers.ReLU()(deep) + deep = layers.Dropout(dropout_rate)(deep) + + merged = layers.concatenate([cross, deep]) + outputs = layers.Dense(units=NUM_CLASSES, activation="softmax")(merged) + model = keras.Model(inputs=inputs, outputs=outputs) + return model + + +deep_and_cross_model = create_deep_and_cross_model() +keras.utils.plot_model(deep_and_cross_model, show_shapes=True, rankdir="LR") + +""" +Let's run it: +""" + +run_experiment(deep_and_cross_model) + +""" +The deep and cross model achieves ~81% test accuracy. +""" + +""" +## Conclusion + +You can use Keras Preprocessing Layers to easily handle categorical features +with different encoding mechanisms, including one-hot encoding and feature embedding. 
+In addition, different model architectures โ€” like wide, deep, and cross networks +โ€” have different advantages, with respect to different dataset properties. +You can explore using them independently or combining them to achieve the best result +for your dataset. +""" diff --git a/knowledge_base/timeseries/eeg_bci_ssvepformer.py b/knowledge_base/timeseries/eeg_bci_ssvepformer.py new file mode 100644 index 0000000000000000000000000000000000000000..fe4d0ec4e4c854410d63731d4be76df8970a59d0 --- /dev/null +++ b/knowledge_base/timeseries/eeg_bci_ssvepformer.py @@ -0,0 +1,655 @@ +""" +Title: Electroencephalogram Signal Classification for Brain-Computer Interface +Author: [Okba Bekhelifi](https://github.com/okbalefthanded) +Date created: 2025/01/08 +Last modified: 2025/01/08 +Description: A Transformer based classification for EEG signal for BCI. +Accelerator: GPU +""" + +""" +# Introduction + +This tutorial will explain how to build a Transformer based Neural Network to classify +Brain-Computer Interface (BCI) Electroencephalograpy (EEG) data recorded in a +Steady-State Visual Evoked Potentials (SSVEPs) experiment for the application of a +brain-controlled speller. + +The tutorial reproduces an experiment from the SSVEPFormer study [1] +( [arXiv preprint](https://arxiv.org/abs/2210.04172) / +[Peer-Reviewed paper](https://www.sciencedirect.com/science/article/abs/pii/S0893608023002319) ). +This model was the first Transformer based model to be introduced for SSVEP data classification, +we will test it on the Nakanishi et al. [2] public dataset as dataset 1 from the paper. + +The process follows an inter-subject classification experiment. Given N subject data in +the dataset, the training data partition contains data from N-1 subject and the remaining +single subject data is used for testing. the training set does not contain any sample from +the testing subject. This way we construct a true subject-independent model. We keep the +same parameters and settings as the original paper in all processing operations from +preprocessing to training. + + +The tutorial begins with a quick BCI and dataset description then, we go through the +technicalities following these sections: +- Setup, and imports. +- Dataset download and extraction. +- Data preprocessing: EEG data filtering, segmentation and visualization of raw and +filtered data, and frequency response for a well performing participant. +- Layers and model creation. +- Evaluation: a single participant data classification as an example then the total +participants data classification. +- Visulization: we show the results of training and inference times comparison among +the Keras 3 available backends (JAX, Tensorflow, and PyTorch) on three different GPUs. +- Conclusion: final discussion and remarks. + +""" + +""" +# Dataset description + +## BCI and SSVEP: +A BCI offers the ability to communicate using only brain activity, this can be achieved +through exogenous stimuli that generate specific responses indicating the intent of the +subject. the responses are elicited when the user focuses their attention on the target +stimulus. We can use visual stimuli by presenting the subject with a set of options +typically on a monitor as a grid to select one command at a time. Each stimulus will +flicker following a fixed frequency and phase, the resulting EEG recorded at occipital +and occipito-parietal areas of the cortex (visual cortex) will have higher power in the +associated frequency with the stimulus where the subject was looking at. 
This type of
+BCI paradigm is called Steady-State Visual Evoked Potentials (SSVEPs) and has become
+widely used for many applications due to its reliability, high classification
+performance, and rapidity: a 1-second EEG segment is sufficient to issue a command. Other
+types of brain responses that do not require external stimulation exist; however, they
+are less reliable.
+[Demo video](https://www.youtube.com/watch?v=VtA6jsEMIug)
+
+This tutorial uses the public 12-command (12-class) SSVEP dataset [2], with the following
+interface emulating dialing numbers on a phone.
+![dataset](/img/eeg_bci_ssvepformer/eeg_ssvepformer_dataset1_interface.jpg)
+
+The dataset was recorded with 10 participants, each facing the above 12 SSVEP stimuli (A).
+The stimulation frequencies ranged from 9.25 Hz to 14.75 Hz with a 0.5 Hz step, and phases
+ranged from 0 to 1.5π with a 0.5π step for each row (B). The EEG signal was acquired
+with 8 electrodes (channels) (PO7, PO3, POz,
+PO4, PO8, O1, Oz, O2) at a sampling frequency of 2048 Hz; the stored data were then
+downsampled to 256 Hz. The subjects completed 15 blocks of recordings, each consisting
+of 12 randomly ordered stimulations (1 per class) of 4 seconds each. In total,
+each subject conducted 180 trials.
+
+
+"""
+
+"""
+# Setup
+"""
+
+"""
+## Select JAX backend
+
+"""
+
+import os
+
+os.environ["KERAS_BACKEND"] = "jax"
+
+"""
+## Install dependencies
+
+"""
+
+"""shell
+pip install -q numpy
+pip install -q scipy
+pip install -q matplotlib
+"""
+
+"""
+# Imports
+
+
+"""
+
+# deep learning libraries
+from keras import backend as K
+from keras import layers
+import keras
+
+# visualization and signal processing imports
+import matplotlib.pyplot as plt
+import tensorflow as tf
+import numpy as np
+from scipy.signal import butter, filtfilt
+from scipy.io import loadmat
+
+# setting the backend, seed and Keras channel format
+K.set_image_data_format("channels_first")
+keras.utils.set_random_seed(42)
+
+"""
+# Download and extract dataset
+
+
+"""
+
+"""
+## Nakanishi et al. 2015 [DataSet Repo](https://github.com/mnakanishi/12JFPM_SSVEP)
+"""
+
+"""shell
+curl -O https://sccn.ucsd.edu/download/cca_ssvep.zip
+unzip cca_ssvep.zip
+"""
+
+"""
+# Pre-Processing
+
+The preprocessing steps are: first, read the EEG data for each subject; then, filter the
+raw data in a frequency interval where most useful information lies; then select a fixed
+duration of signal starting from the stimulation onset (to account for the latency of the
+visual system, we add 135 milliseconds to the stimulation onset). Lastly, all subjects'
+data are concatenated in a single tensor of shape [subjects x samples x channels x trials].
+The data labels are also concatenated following the order of the trials in the experiments
+and form a matrix of shape [subjects x trials]
+(here, by channels we mean electrodes; we use this notation throughout the tutorial).
+"""
+
+
+def raw_signal(folder, fs=256, duration=1.0, onset=0.135):
+    """Select a 1-second segment of the raw EEG signal for
+    subject 1.
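+
+    Arguments (as used below): `folder` is the path to the extracted `cca_ssvep`
+    data, `fs` the sampling frequency in Hz, `duration` the segment length in
+    seconds, and `onset` the visual-latency delay in seconds added to the
+    stimulation onset.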
+ """ + onset = 38 + int(onset * fs) + end = int(duration * fs) + data = loadmat(f"{folder}/s1.mat") + # samples, channels, trials, targets + eeg = data["eeg"].transpose((2, 1, 3, 0)) + # segment data + eeg = eeg[onset : onset + end, :, :, :] + return eeg + + +def segment_eeg( + folder, elecs=None, fs=256, duration=1.0, band=[5.0, 45.0], order=4, onset=0.135 +): + """Filtering and segmenting EEG signals for all subjects.""" + n_subejects = 10 + onset = 38 + int(onset * fs) + end = int(duration * fs) + X, Y = [], [] # empty data and labels + + for subj in range(1, n_subejects + 1): + data = loadmat(f"{data_folder}/s{subj}.mat") + # samples, channels, trials, targets + eeg = data["eeg"].transpose((2, 1, 3, 0)) + # filter data + eeg = filter_eeg(eeg, fs=fs, band=band, order=order) + # segment data + eeg = eeg[onset : onset + end, :, :, :] + # reshape labels + samples, channels, blocks, targets = eeg.shape + y = np.tile(np.arange(1, targets + 1), (blocks, 1)) + y = y.reshape((1, blocks * targets), order="F") + + X.append(eeg.reshape((samples, channels, blocks * targets), order="F")) + Y.append(y) + + X = np.array(X, dtype=np.float32, order="F") + Y = np.array(Y, dtype=np.float32).squeeze() + + return X, Y + + +def filter_eeg(data, fs=256, band=[5.0, 45.0], order=4): + """Filter EEG signal using a zero-phase IIR filter""" + B, A = butter(order, np.array(band) / (fs / 2), btype="bandpass") + return filtfilt(B, A, data, axis=0) + + +""" +## Segment data into epochs +""" + +data_folder = os.path.abspath("./cca_ssvep") +band = [8, 64] # low-frequency / high-frequency cutoffS +order = 4 # filter order +fs = 256 # sampling frequency +duration = 1.0 # 1 second + +# raw signal +X_raw = raw_signal(data_folder, fs=fs, duration=duration) +print( + f"A single subject raw EEG (X_raw) shape: {X_raw.shape} [Samples x Channels x Blocks x Targets]" +) + +# segmented signal +X, Y = segment_eeg(data_folder, band=band, order=order, fs=fs, duration=duration) +print( + f"Full training data (X) shape: {X.shape} [Subject x Samples x Channels x Trials]" +) +print(f"data labels (Y) shape: {Y.shape} [Subject x Trials]") + +samples = X.shape[1] +time = np.linspace(0.0, samples / fs, samples) * 1000 + +""" +## Visualize EEG signal +""" + +""" +## EEG in time + +Raw EEG vs Filtered EEG +The same 1-second recording for subject s1 at Oz (central electrode in the visual cortex, +back of the head) is illustrated. left is the raw EEG as recorded and in the right is +the filtered EEG on the [8, 64] Hz frequency band. we see less noise and +normalized amplitude values in a natural EEG range. +""" + + +elec = 6 # Oz channel + +x_label = "Time (ms)" +y_label = "Voltage (uV)" +# Create subplots +fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(10, 4)) + +# Plot data on the first subplot +ax1.plot(time, X_raw[:, elec, 0, 0], "r-") +ax1.set_xlabel(x_label) +ax1.set_ylabel(y_label) +ax1.set_title("Raw EEG : 1 second at Oz ") + +# Plot data on the second subplot +ax2.plot(time, X[0, :, elec, 0], "b-") +ax2.set_xlabel(x_label) +ax2.set_ylabel(y_label) +ax2.set_title("Filtered EEG between 8-64 Hz: 1 second at Oz") + +# Adjust spacing between subplots +plt.tight_layout() + +# Show the plot +plt.show() + +""" +## EEG frequency representation + +Using the welch method, we visualize the frequency power for a well performing subject +for the entire 4 seconds EEG recording at Oz electrode for each stimuli. the red peaks +indicate the stimuli fundamental frequency and the 2nd harmonics (double the fundamental +frequency). 
we see clear peaks showing the high responses from that subject which means +that this subject is a good candidate for SSVEP BCI control. In many cases the peaks +are weak or absent, meaning that subject do not achieve the task correctly. + +![eeg_frequency](/img/eeg_bci_ssvepformer/eeg_ssvepformer_frequencypowers.png) +""" + + +""" +# Create Layers and model + +Create Layers in a cross-framework custom component fashion. +In the SSVEPFormer, the data is first transformed to the frequency domain through +Fast-Fourier transform (FFT), to construct a complex spectrum presentation consisting of +the concatenation of frequency and phase information in a fixed frequency band. To keep +the model in an end-to-end format, we implement the complex spectrum transformation as +non-trainable layer. + +![model](/img/eeg_bci_ssvepformer/eeg_ssvepformer_model.jpg) +The SSVEPFormer unlike the Transformer architecture does not contain positional encoding/embedding +layers which replaced a channel combination block that has a layer of Conv1D layer of 1 +kernel size with double input channels (double the count of electrodes) number of filters, +and LayerNorm, Gelu activation and dropout. +Another difference with Transformers is the absence of multi-head attention layers with +attention mechanism. +The model encoder contains two identical and successive blocks. Each block has two +sub-blocks of CNN module and MLP module. the CNN module consists of a LayerNorm, Conv1D +with the same number of filters as channel combination, LayerNorm, Gelu, Dropout and an +residual connection. The MLP module consists of a LayerNorm, Dense layer, Gelu, droput +and residual connection. the Dense layer is applied on each channel separately. +The last block of the model is MLP head with Flatten layer, Dropout, Dense, LayerNorm, +Gelu, Dropout and Dense layer with softmax acitvation. +All trainable weights are initialized by a normal distribution with 0 mean and 0.01 +standard deviation as state in the original paper. 
+""" + + +class ComplexSpectrum(keras.layers.Layer): + def __init__(self, nfft=512, fft_start=8, fft_end=64): + super().__init__() + self.nfft = nfft + self.fft_start = fft_start + self.fft_end = fft_end + + def call(self, x): + samples = x.shape[-1] + x = keras.ops.rfft(x, fft_length=self.nfft) + real = x[0] / samples + imag = x[1] / samples + real = real[:, :, self.fft_start : self.fft_end] + imag = imag[:, :, self.fft_start : self.fft_end] + x = keras.ops.concatenate((real, imag), axis=-1) + return x + + +class ChannelComb(keras.layers.Layer): + def __init__(self, n_channels, drop_rate=0.5): + super().__init__() + self.conv = layers.Conv1D( + 2 * n_channels, + 1, + padding="same", + kernel_initializer=keras.initializers.RandomNormal( + mean=0.0, stddev=0.01, seed=None + ), + ) + self.normalization = layers.LayerNormalization() + self.activation = layers.Activation(activation="gelu") + self.drop = layers.Dropout(drop_rate) + + def call(self, x): + x = self.conv(x) + x = self.normalization(x) + x = self.activation(x) + x = self.drop(x) + return x + + +class ConvAttention(keras.layers.Layer): + def __init__(self, n_channels, drop_rate=0.5): + super().__init__() + self.norm = layers.LayerNormalization() + self.conv = layers.Conv1D( + 2 * n_channels, + 31, + padding="same", + kernel_initializer=keras.initializers.RandomNormal( + mean=0.0, stddev=0.01, seed=None + ), + ) + self.activation = layers.Activation(activation="gelu") + self.drop = layers.Dropout(drop_rate) + + def call(self, x): + input = x + x = self.norm(x) + x = self.conv(x) + x = self.activation(x) + x = self.drop(x) + x = x + input + return x + + +class ChannelMLP(keras.layers.Layer): + def __init__(self, n_features, drop_rate=0.5): + super().__init__() + self.norm = layers.LayerNormalization() + self.mlp = layers.Dense( + 2 * n_features, + kernel_initializer=keras.initializers.RandomNormal( + mean=0.0, stddev=0.01, seed=None + ), + ) + self.activation = layers.Activation(activation="gelu") + self.drop = layers.Dropout(drop_rate) + self.cat = layers.Concatenate(axis=1) + + def call(self, x): + input = x + channels = x.shape[1] # x shape : NCF + x = self.norm(x) + output_channels = [] + for i in range(channels): + c = self.mlp(x[:, :, i]) + c = layers.Reshape([1, -1])(c) + output_channels.append(c) + x = self.cat(output_channels) + x = self.activation(x) + x = self.drop(x) + x = x + input + return x + + +class Encoder(keras.layers.Layer): + def __init__(self, n_channels, n_features, drop_rate=0.5): + super().__init__() + self.attention1 = ConvAttention(n_channels, drop_rate=drop_rate) + self.mlp1 = ChannelMLP(n_features, drop_rate=drop_rate) + self.attention2 = ConvAttention(n_channels, drop_rate=drop_rate) + self.mlp2 = ChannelMLP(n_features, drop_rate=drop_rate) + + def call(self, x): + x = self.attention1(x) + x = self.mlp1(x) + x = self.attention2(x) + x = self.mlp2(x) + return x + + +class MlpHead(keras.layers.Layer): + def __init__(self, n_classes, drop_rate=0.5): + super().__init__() + self.flatten = layers.Flatten() + self.drop = layers.Dropout(drop_rate) + self.linear1 = layers.Dense( + 6 * n_classes, + kernel_initializer=keras.initializers.RandomNormal( + mean=0.0, stddev=0.01, seed=None + ), + ) + self.norm = layers.LayerNormalization() + self.activation = layers.Activation(activation="gelu") + self.drop2 = layers.Dropout(drop_rate) + self.linear2 = layers.Dense( + n_classes, + kernel_initializer=keras.initializers.RandomNormal( + mean=0.0, stddev=0.01, seed=None + ), + ) + + def call(self, x): + x = self.flatten(x) + 
x = self.drop(x) + x = self.linear1(x) + x = self.norm(x) + x = self.activation(x) + x = self.drop2(x) + x = self.linear2(x) + return x + + +""" +### Create a sequential model with the layers above +""" + + +def create_ssvepformer( + input_shape, fs, resolution, fq_band, n_channels, n_classes, drop_rate +): + nfft = round(fs / resolution) + fft_start = int(fq_band[0] / resolution) + fft_end = int(fq_band[1] / resolution) + 1 + n_features = fft_end - fft_start + + model = keras.Sequential( + [ + keras.Input(shape=input_shape), + ComplexSpectrum(nfft, fft_start, fft_end), + ChannelComb(n_channels=n_channels, drop_rate=drop_rate), + Encoder(n_channels=n_channels, n_features=n_features, drop_rate=drop_rate), + Encoder(n_channels=n_channels, n_features=n_features, drop_rate=drop_rate), + MlpHead(n_classes=n_classes, drop_rate=drop_rate), + layers.Activation(activation="softmax"), + ] + ) + + return model + + +""" +# Evaluation +""" + +# Training settings same as the original paper +BATCH_SIZE = 128 +EPOCHS = 100 +LR = 0.001 # learning rate +WD = 0.001 # weight decay +MOMENTUM = 0.9 +DROP_RATE = 0.5 + +resolution = 0.25 + +""" +From the entire dataset we select folds for each subject evaluation. +construct a tf dataset object for train and testing data and create the model and launch +the training using SGD optimizer. +""" + + +def concatenate_subjects(x, y, fold): + X = np.concatenate([x[idx] for idx in fold], axis=-1) + Y = np.concatenate([y[idx] for idx in fold], axis=-1) + X = X.transpose((2, 1, 0)) # trials x channels x samples + return X, Y - 1 # transform labels to values from 0...11 + + +def evaluate_subject( + x_train, + y_train, + x_val, + y_val, + input_shape, + fs=256, + resolution=0.25, + band=[8, 64], + channels=8, + n_classes=12, + drop_rate=DROP_RATE, +): + + train_dataset = ( + tf.data.Dataset.from_tensor_slices((x_train, y_train)) + .batch(BATCH_SIZE) + .prefetch(tf.data.AUTOTUNE) + ) + + test_dataset = ( + tf.data.Dataset.from_tensor_slices((x_val, y_val)) + .batch(BATCH_SIZE) + .prefetch(tf.data.AUTOTUNE) + ) + + model = create_ssvepformer( + input_shape, fs, resolution, band, channels, n_classes, drop_rate + ) + sgd = keras.optimizers.SGD(learning_rate=LR, momentum=MOMENTUM, weight_decay=WD) + + model.compile( + loss="sparse_categorical_crossentropy", + optimizer=sgd, + metrics=["accuracy"], + jit_compile=True, + ) + + history = model.fit( + train_dataset, + batch_size=BATCH_SIZE, + epochs=EPOCHS, + validation_data=test_dataset, + verbose=0, + ) + loss, acc = model.evaluate(test_dataset) + return acc * 100 + + +""" +## Run evaluation +""" + +channels = X.shape[2] +samples = X.shape[1] +input_shape = (channels, samples) +n_classes = 12 + +model = create_ssvepformer( + input_shape, fs, resolution, band, channels, n_classes, DROP_RATE +) +model.summary() + +""" +## Evaluation on all subjects following a leave-one-subject out data repartition scheme +""" + +accs = np.zeros(10) + +for subject in range(10): + print(f"Testing subject: {subject+ 1}") + + # create train / test folds + folds = np.delete(np.arange(10), subject) + train_index = folds + test_index = [subject] + + # create data split for each subject + x_train, y_train = concatenate_subjects(X, Y, train_index) + x_val, y_val = concatenate_subjects(X, Y, test_index) + + # train and evaluate a fold and compute the time it takes + acc = evaluate_subject(x_train, y_train, x_val, y_val, input_shape) + + accs[subject] = acc + +print(f"\nAccuracy Across Subjects: {accs.mean()} % std: {np.std(accs)}") + +""" +and that's it! 
+We see how some subjects, despite having no data in the training set, can still achieve
+almost 100% correct command classification, while others show poor performance of around 50%.
+In the original paper, using PyTorch, the average accuracy was 84.04% with a standard
+deviation of 17.37. We reach comparable values, given the stochastic nature of deep learning.
+"""

+"""
+# Visualizations

+Comparison of training and inference times between the different backends (JAX, TensorFlow
+and PyTorch) on the three GPUs available with Colab Free/Pro/Pro+: T4, L4, A100.
+"""

+"""
+## Training Time

+![training_time](/img/eeg_bci_ssvepformer/eeg_ssvepformer_keras_training_time.png)
+"""

+"""
+## Inference Time

+![inference_time](/img/eeg_bci_ssvepformer/eeg_ssvepformer_keras_inference_time.png)
+"""

+"""
+The JAX backend was the fastest for both training and inference on all GPUs. PyTorch was
+extremely slow because the JIT compilation option had to be disabled: the complex data
+type produced by the FFT is not supported by the PyTorch JIT compiler.
+"""

+"""
+# Acknowledgment

+I thank Chris Perry [X](https://x.com/thechrisperry) @GoogleColab for supporting this
+work with GPU compute.
+"""

+"""
+# References

+[1] Chen, J. et al. (2023) 'A transformer-based deep neural network model for SSVEP
+classification', Neural Networks, 164, pp. 521–534. Available at: https://doi.org/10.1016/j.neunet.2023.04.045.

+[2] Nakanishi, M. et al. (2015) 'A Comparison Study of Canonical Correlation Analysis
+Based Methods for Detecting Steady-State Visual Evoked Potentials', PLoS One, 10(10), p.
+e0140703. Available at: https://doi.org/10.1371/journal.pone.0140703
+"""
diff --git a/knowledge_base/timeseries/eeg_signal_classification.py b/knowledge_base/timeseries/eeg_signal_classification.py
new file mode 100644
index 0000000000000000000000000000000000000000..7cc86437d0cbd0ee09a4893fb588a3455b64316c
--- /dev/null
+++ b/knowledge_base/timeseries/eeg_signal_classification.py
@@ -0,0 +1,560 @@
+"""
+Title: Electroencephalogram Signal Classification for action identification
+Author: [Suvaditya Mukherjee](https://github.com/suvadityamuk)
+Date created: 2022/11/03
+Last modified: 2022/11/05
+Description: Training a Convolutional model to classify EEG signals produced by exposure to certain stimuli.
+Accelerator: GPU
+"""

+"""
+## Introduction

+The following example explores how we can build a Convolution-based Neural Network to
+perform classification on Electroencephalogram signals captured when subjects were
+exposed to different stimuli.
+We train a model from scratch since such signal-classification models are fairly scarce
+in pre-trained format.
+The data we use is sourced from the UC Berkeley-Biosense Lab, where it was collected
+from 15 subjects at the same time.
+Our process is as follows:

+- Load the [UC Berkeley-Biosense Synchronized Brainwave Dataset](https://www.kaggle.com/datasets/berkeley-biosense/synchronized-brainwave-dataset)
+- Visualize random samples from the data
+- Pre-process, collate and scale the data to finally make a `tf.data.Dataset`
+- Prepare class weights in order to tackle major imbalances
+- Create a Conv1D and Dense-based model to perform classification
+- Define callbacks and hyperparameters
+- Train the model
+- Plot metrics from History and perform evaluation

+This example needs the following external dependencies (Gdown, Scikit-learn, Pandas,
+Numpy, Matplotlib). You can install them via the following commands.

+Gdown is an external package used to download large files from Google Drive.
To know +more, you can refer to its [PyPi page here](https://pypi.org/project/gdown) +""" + + +""" +## Setup and Data Downloads + +First, lets install our dependencies: +""" + +"""shell +pip install gdown -q +pip install scikit-learn -q +pip install pandas -q +pip install numpy -q +pip install matplotlib -q +""" + +""" +Next, lets download our dataset. +The gdown package makes it easy to download the data from Google Drive: +""" + +"""shell +gdown 1V5B7Bt6aJm0UHbR7cRKBEK8jx7lYPVuX +# gdown will download eeg-data.csv onto the local drive for use. Total size of +# eeg-data.csv is 105.7 MB +""" + +import pandas as pd +import matplotlib.pyplot as plt +import json +import numpy as np +import keras +from keras import layers +import tensorflow as tf +from sklearn import preprocessing, model_selection +import random + +QUALITY_THRESHOLD = 128 +BATCH_SIZE = 64 +SHUFFLE_BUFFER_SIZE = BATCH_SIZE * 2 + +""" +## Read data from `eeg-data.csv` + +We use the Pandas library to read the `eeg-data.csv` file and display the first 5 rows +using the `.head()` command +""" + +eeg = pd.read_csv("eeg-data.csv") + +""" +We remove unlabeled samples from our dataset as they do not contribute to the model. We +also perform a `.drop()` operation on the columns that are not required for training data +preparation +""" + +unlabeled_eeg = eeg[eeg["label"] == "unlabeled"] +eeg = eeg.loc[eeg["label"] != "unlabeled"] +eeg = eeg.loc[eeg["label"] != "everyone paired"] + +eeg.drop( + [ + "indra_time", + "Unnamed: 0", + "browser_latency", + "reading_time", + "attention_esense", + "meditation_esense", + "updatedAt", + "createdAt", + ], + axis=1, + inplace=True, +) + +eeg.reset_index(drop=True, inplace=True) +eeg.head() + +""" +In the data, the samples recorded are given a score from 0 to 128 based on how +well-calibrated the sensor was (0 being best, 200 being worst). We filter the values +based on an arbitrary cutoff limit of 128. +""" + + +def convert_string_data_to_values(value_string): + str_list = json.loads(value_string) + return str_list + + +eeg["raw_values"] = eeg["raw_values"].apply(convert_string_data_to_values) + +eeg = eeg.loc[eeg["signal_quality"] < QUALITY_THRESHOLD] +eeg.head() + +""" +## Visualize one random sample from the data +""" + +""" +We visualize one sample from the data to understand how the stimulus-induced signal looks +like +""" + + +def view_eeg_plot(idx): + data = eeg.loc[idx, "raw_values"] + plt.plot(data) + plt.title(f"Sample random plot") + plt.show() + + +view_eeg_plot(7) + +""" +## Pre-process and collate data +""" + +""" +There are a total of 67 different labels present in the data, where there are numbered +sub-labels. We collate them under a single label as per their numbering and replace them +in the data itself. Following this process, we perform simple Label encoding to get them +in an integer format. 
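+
+As a quick illustration of what the label-encoding step below does, here is the kind of
+mapping `sklearn.preprocessing.LabelEncoder` produces (toy labels, not taken from the
+dataset):
+
+```python
+from sklearn import preprocessing
+
+demo_encoder = preprocessing.LabelEncoder()
+demo_encoder.fit(["blink", "math", "blink", "relax"])
+print(demo_encoder.transform(["math", "blink"]))  # [1 0]
+print(demo_encoder.inverse_transform([1]))  # ['math']
+```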
+""" + +print("Before replacing labels") +print(eeg["label"].unique(), "\n") +print(len(eeg["label"].unique()), "\n") + + +eeg.replace( + { + "label": { + "blink1": "blink", + "blink2": "blink", + "blink3": "blink", + "blink4": "blink", + "blink5": "blink", + "math1": "math", + "math2": "math", + "math3": "math", + "math4": "math", + "math5": "math", + "math6": "math", + "math7": "math", + "math8": "math", + "math9": "math", + "math10": "math", + "math11": "math", + "math12": "math", + "thinkOfItems-ver1": "thinkOfItems", + "thinkOfItems-ver2": "thinkOfItems", + "video-ver1": "video", + "video-ver2": "video", + "thinkOfItemsInstruction-ver1": "thinkOfItemsInstruction", + "thinkOfItemsInstruction-ver2": "thinkOfItemsInstruction", + "colorRound1-1": "colorRound1", + "colorRound1-2": "colorRound1", + "colorRound1-3": "colorRound1", + "colorRound1-4": "colorRound1", + "colorRound1-5": "colorRound1", + "colorRound1-6": "colorRound1", + "colorRound2-1": "colorRound2", + "colorRound2-2": "colorRound2", + "colorRound2-3": "colorRound2", + "colorRound2-4": "colorRound2", + "colorRound2-5": "colorRound2", + "colorRound2-6": "colorRound2", + "colorRound3-1": "colorRound3", + "colorRound3-2": "colorRound3", + "colorRound3-3": "colorRound3", + "colorRound3-4": "colorRound3", + "colorRound3-5": "colorRound3", + "colorRound3-6": "colorRound3", + "colorRound4-1": "colorRound4", + "colorRound4-2": "colorRound4", + "colorRound4-3": "colorRound4", + "colorRound4-4": "colorRound4", + "colorRound4-5": "colorRound4", + "colorRound4-6": "colorRound4", + "colorRound5-1": "colorRound5", + "colorRound5-2": "colorRound5", + "colorRound5-3": "colorRound5", + "colorRound5-4": "colorRound5", + "colorRound5-5": "colorRound5", + "colorRound5-6": "colorRound5", + "colorInstruction1": "colorInstruction", + "colorInstruction2": "colorInstruction", + "readyRound1": "readyRound", + "readyRound2": "readyRound", + "readyRound3": "readyRound", + "readyRound4": "readyRound", + "readyRound5": "readyRound", + "colorRound1": "colorRound", + "colorRound2": "colorRound", + "colorRound3": "colorRound", + "colorRound4": "colorRound", + "colorRound5": "colorRound", + } + }, + inplace=True, +) + +print("After replacing labels") +print(eeg["label"].unique()) +print(len(eeg["label"].unique())) + +le = preprocessing.LabelEncoder() # Generates a look-up table +le.fit(eeg["label"]) +eeg["label"] = le.transform(eeg["label"]) + +""" +We extract the number of unique classes present in the data +""" + +num_classes = len(eeg["label"].unique()) +print(num_classes) + +""" +We now visualize the number of samples present in each class using a Bar plot. +""" + +plt.bar(range(num_classes), eeg["label"].value_counts()) +plt.title("Number of samples per class") +plt.show() + +""" +## Scale and split data +""" + +""" +We perform a simple Min-Max scaling to bring the value-range between 0 and 1. We do not +use Standard Scaling as the data does not follow a Gaussian distribution. +""" + +scaler = preprocessing.MinMaxScaler() +series_list = [ + scaler.fit_transform(np.asarray(i).reshape(-1, 1)) for i in eeg["raw_values"] +] + +labels_list = [i for i in eeg["label"]] + +""" +We now create a Train-test split with a 15% holdout set. Following this, we reshape the +data to create a sequence of length 512. We also convert the labels from their current +label-encoded form to a one-hot encoding to enable use of several different +`keras.metrics` functions. 
+""" + +x_train, x_test, y_train, y_test = model_selection.train_test_split( + series_list, labels_list, test_size=0.15, random_state=42, shuffle=True +) + +print( + f"Length of x_train : {len(x_train)}\nLength of x_test : {len(x_test)}\nLength of y_train : {len(y_train)}\nLength of y_test : {len(y_test)}" +) + +x_train = np.asarray(x_train).astype(np.float32).reshape(-1, 512, 1) +y_train = np.asarray(y_train).astype(np.float32).reshape(-1, 1) +y_train = keras.utils.to_categorical(y_train) + +x_test = np.asarray(x_test).astype(np.float32).reshape(-1, 512, 1) +y_test = np.asarray(y_test).astype(np.float32).reshape(-1, 1) +y_test = keras.utils.to_categorical(y_test) + +""" +## Prepare `tf.data.Dataset` +""" + +""" +We now create a `tf.data.Dataset` from this data to prepare it for training. We also +shuffle and batch the data for use later. +""" + +train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train)) +test_dataset = tf.data.Dataset.from_tensor_slices((x_test, y_test)) + +train_dataset = train_dataset.shuffle(SHUFFLE_BUFFER_SIZE).batch(BATCH_SIZE) +test_dataset = test_dataset.batch(BATCH_SIZE) + +""" +## Make Class Weights using Naive method +""" + +""" +As we can see from the plot of number of samples per class, the dataset is imbalanced. +Hence, we **calculate weights for each class** to make sure that the model is trained in +a fair manner without preference to any specific class due to greater number of samples. + +We use a naive method to calculate these weights, finding an **inverse proportion** of +each class and using that as the weight. +""" + +vals_dict = {} +for i in eeg["label"]: + if i in vals_dict.keys(): + vals_dict[i] += 1 + else: + vals_dict[i] = 1 +total = sum(vals_dict.values()) + +# Formula used - Naive method where +# weight = 1 - (no. of samples present / total no. 
of samples) +# So more the samples, lower the weight + +weight_dict = {k: (1 - (v / total)) for k, v in vals_dict.items()} +print(weight_dict) + +""" +## Define simple function to plot all the metrics present in a `keras.callbacks.History` +object +""" + + +def plot_history_metrics(history: keras.callbacks.History): + total_plots = len(history.history) + cols = total_plots // 2 + + rows = total_plots // cols + + if total_plots % cols != 0: + rows += 1 + + pos = range(1, total_plots + 1) + plt.figure(figsize=(15, 10)) + for i, (key, value) in enumerate(history.history.items()): + plt.subplot(rows, cols, pos[i]) + plt.plot(range(len(value)), value) + plt.title(str(key)) + plt.show() + + +""" +## Define function to generate Convolutional model +""" + + +def create_model(): + input_layer = keras.Input(shape=(512, 1)) + + x = layers.Conv1D( + filters=32, kernel_size=3, strides=2, activation="relu", padding="same" + )(input_layer) + x = layers.BatchNormalization()(x) + + x = layers.Conv1D( + filters=64, kernel_size=3, strides=2, activation="relu", padding="same" + )(x) + x = layers.BatchNormalization()(x) + + x = layers.Conv1D( + filters=128, kernel_size=5, strides=2, activation="relu", padding="same" + )(x) + x = layers.BatchNormalization()(x) + + x = layers.Conv1D( + filters=256, kernel_size=5, strides=2, activation="relu", padding="same" + )(x) + x = layers.BatchNormalization()(x) + + x = layers.Conv1D( + filters=512, kernel_size=7, strides=2, activation="relu", padding="same" + )(x) + x = layers.BatchNormalization()(x) + + x = layers.Conv1D( + filters=1024, + kernel_size=7, + strides=2, + activation="relu", + padding="same", + )(x) + x = layers.BatchNormalization()(x) + + x = layers.Dropout(0.2)(x) + + x = layers.Flatten()(x) + + x = layers.Dense(4096, activation="relu")(x) + x = layers.Dropout(0.2)(x) + + x = layers.Dense( + 2048, activation="relu", kernel_regularizer=keras.regularizers.L2() + )(x) + x = layers.Dropout(0.2)(x) + + x = layers.Dense( + 1024, activation="relu", kernel_regularizer=keras.regularizers.L2() + )(x) + x = layers.Dropout(0.2)(x) + x = layers.Dense( + 128, activation="relu", kernel_regularizer=keras.regularizers.L2() + )(x) + output_layer = layers.Dense(num_classes, activation="softmax")(x) + + return keras.Model(inputs=input_layer, outputs=output_layer) + + +""" +## Get Model summary +""" + +conv_model = create_model() +conv_model.summary() + +""" +## Define callbacks, optimizer, loss and metrics +""" + +""" +We set the number of epochs at 30 after performing extensive experimentation. It was seen +that this was the optimal number, after performing Early-Stopping analysis as well. +We define a Model Checkpoint callback to make sure that we only get the best model +weights. +We also define a ReduceLROnPlateau as there were several cases found during +experimentation where the loss stagnated after a certain point. On the other hand, a +direct LRScheduler was found to be too aggressive in its decay. 
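+
+If you want to repeat the early-stopping analysis mentioned above, a callback along the
+following lines could be appended to the callback list defined below (the patience value
+here is purely illustrative, not the one used in the original experiments):
+
+```python
+early_stopping = keras.callbacks.EarlyStopping(
+    monitor="val_loss", patience=5, restore_best_weights=True
+)
+# callbacks.append(early_stopping)
+```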
+""" + +epochs = 30 + +callbacks = [ + keras.callbacks.ModelCheckpoint( + "best_model.keras", save_best_only=True, monitor="loss" + ), + keras.callbacks.ReduceLROnPlateau( + monitor="val_top_k_categorical_accuracy", + factor=0.2, + patience=2, + min_lr=0.000001, + ), +] + +optimizer = keras.optimizers.Adam(amsgrad=True, learning_rate=0.001) +loss = keras.losses.CategoricalCrossentropy() + +""" +## Compile model and call `model.fit()` +""" + +""" +We use the `Adam` optimizer since it is commonly considered the best choice for +preliminary training, and was found to be the best optimizer. +We use `CategoricalCrossentropy` as the loss as our labels are in a one-hot-encoded form. + +We define the `TopKCategoricalAccuracy(k=3)`, `AUC`, `Precision` and `Recall` metrics to +further aid in understanding the model better. +""" + +conv_model.compile( + optimizer=optimizer, + loss=loss, + metrics=[ + keras.metrics.TopKCategoricalAccuracy(k=3), + keras.metrics.AUC(), + keras.metrics.Precision(), + keras.metrics.Recall(), + ], +) + +conv_model_history = conv_model.fit( + train_dataset, + epochs=epochs, + callbacks=callbacks, + validation_data=test_dataset, + class_weight=weight_dict, +) + +""" +## Visualize model metrics during training +""" + +""" +We use the function defined above to see model metrics during training. +""" + +plot_history_metrics(conv_model_history) + +""" +## Evaluate model on test data +""" + +loss, accuracy, auc, precision, recall = conv_model.evaluate(test_dataset) +print(f"Loss : {loss}") +print(f"Top 3 Categorical Accuracy : {accuracy}") +print(f"Area under the Curve (ROC) : {auc}") +print(f"Precision : {precision}") +print(f"Recall : {recall}") + + +def view_evaluated_eeg_plots(model): + start_index = random.randint(10, len(eeg)) + end_index = start_index + 11 + data = eeg.loc[start_index:end_index, "raw_values"] + data_array = [scaler.fit_transform(np.asarray(i).reshape(-1, 1)) for i in data] + data_array = [np.asarray(data_array).astype(np.float32).reshape(-1, 512, 1)] + original_labels = eeg.loc[start_index:end_index, "label"] + predicted_labels = np.argmax(model.predict(data_array, verbose=0), axis=1) + original_labels = [ + le.inverse_transform(np.array(label).reshape(-1))[0] + for label in original_labels + ] + predicted_labels = [ + le.inverse_transform(np.array(label).reshape(-1))[0] + for label in predicted_labels + ] + total_plots = 12 + cols = total_plots // 3 + rows = total_plots // cols + if total_plots % cols != 0: + rows += 1 + pos = range(1, total_plots + 1) + fig = plt.figure(figsize=(20, 10)) + for i, (plot_data, og_label, pred_label) in enumerate( + zip(data, original_labels, predicted_labels) + ): + plt.subplot(rows, cols, pos[i]) + plt.plot(plot_data) + plt.title(f"Actual Label : {og_label}\nPredicted Label : {pred_label}") + fig.subplots_adjust(hspace=0.5) + plt.show() + + +view_evaluated_eeg_plots(conv_model) diff --git a/knowledge_base/timeseries/event_classification_for_payment_card_fraud_detection.py b/knowledge_base/timeseries/event_classification_for_payment_card_fraud_detection.py new file mode 100644 index 0000000000000000000000000000000000000000..65632d9a80c4b80f72c3bbab59343375d2804771 --- /dev/null +++ b/knowledge_base/timeseries/event_classification_for_payment_card_fraud_detection.py @@ -0,0 +1,541 @@ +""" +Title: Event classification for payment card fraud detection +Author: [achoum](https://github.com/achoum/) +Date created: 2024/02/01 +Last modified: 2024/02/01 +Description: Detection of fraudulent payment card transactions using Temporian 
and a feed-forward neural network. +Accelerator: GPU +""" + +""" +This notebook depends on Keras 3, Temporian, and a few other libraries. You can +install them as follow: + +```shell +pip install temporian keras pandas tf-nightly scikit-learn -U +``` +""" + +import keras # To train the Machine Learning model +import temporian as tp # To convert transactions into tabular data + +import numpy as np +import os +import pandas as pd +import datetime +import math +import tensorflow as tf +from sklearn.metrics import RocCurveDisplay + +""" +## Introduction + +Payment fraud detection is critical for banks, businesses, and consumers. In +Europe alone, fraudulent transactions were estimated at +[โ‚ฌ1.89 billion in 2019](https://www.ecb.europa.eu/pub/pdf/cardfraud/ecb.cardfraudreport202110~cac4c418e8.en.pdf). +Worldwide, approximately +[3.6%](https://www.cybersource.com/content/dam/documents/campaign/fraud-report/global-fraud-report-2022.pdf) +of commerce revenue is lost to fraud. In this notebook, we train and evaluate a +model to detect fraudulent transactions using the synthetic dataset attached to +the book +[Reproducible Machine Learning for Credit Card Fraud Detection](https://fraud-detection-handbook.github.io/fraud-detection-handbook/Foreword.html) +by Le Borgne et al. + +Fraudulent transactions often cannot be detected by looking at transactions in +isolation. Instead, fraudulent transactions are detected by looking at patterns +across multiple transactions from the same user, to the same merchant, or with +other types of relationships. To express these relationships in a way that is +understandable by a machine learning model, and to augment features with feature + engineering, we We use the + [Temporian](https://temporian.readthedocs.io/en/latest) preprocessing library. + +We preprocess a transaction dataset into a tabular dataset and use a +feed-forward neural network to learn the patterns of fraud and make predictions. + +## Loading the dataset + +The dataset contains payment transactions sampled between April 1, 2018 and +September 30, 2018. The transactions are stored in CSV files, one for each day. + +**Note:** Downloading the dataset takes ~1 minute. +""" + +start_date = datetime.date(2018, 4, 1) +end_date = datetime.date(2018, 9, 30) + +# Load the dataset as a Pandas dataframe. +cache_path = "fraud_detection_cache.csv" +if not os.path.exists(cache_path): + print("Download dataset") + dataframes = [] + num_files = (end_date - start_date).days + counter = 0 + while start_date <= end_date: + if counter % (num_files // 10) == 0: + print(f"[{100 * (counter+1) // num_files}%]", end="", flush=True) + print(".", end="", flush=True) + url = f"https://github.com/Fraud-Detection-Handbook/simulated-data-raw/raw/6e67dbd0a3bfe0d7ec33abc4bce5f37cd4ff0d6a/data/{start_date}.pkl" + dataframes.append(pd.read_pickle(url)) + start_date += datetime.timedelta(days=1) + counter += 1 + print("done", flush=True) + transactions_dataframe = pd.concat(dataframes) + transactions_dataframe.to_csv(cache_path, index=False) +else: + print("Load dataset from cache") + transactions_dataframe = pd.read_csv( + cache_path, dtype={"CUSTOMER_ID": bytes, "TERMINAL_ID": bytes} + ) + +print(f"Found {len(transactions_dataframe)} transactions") + +""" +Each transaction is represented by a single row, with the following columns of +interest: + +- **TX_DATETIME**: The date and time of the transaction. +- **CUSTOMER_ID**: The unique identifier of the customer. 
+- **TERMINAL_ID**: The identifier of the terminal where the transaction was + made. +- **TX_AMOUNT**: The amount of the transaction. +- **TX_FRAUD**: Whether the transaction is fraudulent (1) or not (0). +""" + +transactions_dataframe = transactions_dataframe[ + ["TX_DATETIME", "CUSTOMER_ID", "TERMINAL_ID", "TX_AMOUNT", "TX_FRAUD"] +] + +transactions_dataframe.head(4) + +""" +The dataset is highly imbalanced, with the majority of transactions being +legitimate. +""" + +fraudulent_rate = transactions_dataframe["TX_FRAUD"].mean() +print("Rate of fraudulent transactions:", fraudulent_rate) + +""" +The +[pandas dataframe](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html) +is converted into a +[Temporian EventSet](https://temporian.readthedocs.io/en/latest/reference/temporian/EventSet/), +which is better suited for the data exploration and feature preprocessing of the + next steps. +""" + +transactions_evset = tp.from_pandas(transactions_dataframe, timestamps="TX_DATETIME") + +transactions_evset + +""" +It is possible to plot the entire dataset, but the resulting plot will be +difficult to read. Instead, we can group the transactions per client. +""" + +transactions_evset.add_index("CUSTOMER_ID").plot(indexes="3774") + +""" +Note the few fraudulent transactions for this client. + +## Preparing the training data + +Fraudulent transactions in isolation cannot be detected. Instead, we need to +connect related transactions. For each transaction, we compute the sum and count +of transactions for the same terminal in the last `n` days. Because we don't +know the correct value for `n`, we use multiple values for `n` and compute a +set of features for each of them. +""" + +# Group the transactions per terminal +transactions_per_terminal = transactions_evset.add_index("TERMINAL_ID") + +# Moving statistics per terminal +tmp_features = [] +for n in [7, 14, 28]: + tmp_features.append( + transactions_per_terminal["TX_AMOUNT"] + .moving_sum(tp.duration.days(n)) + .rename(f"sum_transactions_{n}_days") + ) + + tmp_features.append( + transactions_per_terminal.moving_count(tp.duration.days(n)).rename( + f"count_transactions_{n}_days" + ) + ) + +feature_set_1 = tp.glue(*tmp_features) + +feature_set_1 + +""" +Let's look at the features of terminal "3774". +""" + +feature_set_1.plot(indexes="3774") + +""" +A transaction's fraudulent status is not known at the time of the transaction +(otherwise, there would be no problem). However, the banks knows if a +transacation is fraudulent one week after it is made. We create a set of +features that indicate the number and ratio of fraudulent transactions in the +last N days. +""" + +# Lag the transactions by one week. +lagged_transactions = transactions_per_terminal.lag(tp.duration.weeks(1)) + +# Moving statistics per customer +tmp_features = [] +for n in [7, 14, 28]: + tmp_features.append( + lagged_transactions["TX_FRAUD"] + .moving_sum(tp.duration.days(n), sampling=transactions_per_terminal) + .rename(f"count_fraud_transactions_{n}_days") + ) + + tmp_features.append( + lagged_transactions["TX_FRAUD"] + .cast(tp.float32) + .simple_moving_average(tp.duration.days(n), sampling=transactions_per_terminal) + .rename(f"rate_fraud_transactions_{n}_days") + ) + +feature_set_2 = tp.glue(*tmp_features) + +""" +Transaction date and time can be correlated with fraud. While each transaction +has a timestamp, a machine learning model might struggle to consume them +directly. 
+Instead, we extract various informative calendar features from the
+timestamps, such as hour, day of the week (e.g., Monday, Tuesday), and day of
+the month (1-31).
+"""

+feature_set_3 = tp.glue(
+    transactions_per_terminal.calendar_hour(),
+    transactions_per_terminal.calendar_day_of_week(),
+)

+"""
+Finally, we group together all the features and the label.
+"""

+all_data = tp.glue(
+    transactions_per_terminal, feature_set_1, feature_set_2, feature_set_3
+).drop_index()

+print("All the available features:")
+all_data.schema.feature_names()

+"""
+We extract the name of the input features.
+"""

+input_feature_names = [k for k in all_data.schema.feature_names() if k.islower()]

+print("The model's input features:")
+input_feature_names

+"""
+For neural networks to work correctly, numerical inputs must be normalized. A
+common approach is to apply z-normalization, which involves subtracting the mean
+from each value and dividing by the standard deviation, both estimated on the
+training data. In forecasting, such z-normalization is not recommended as it
+would lead to future leakage. Specifically, to classify a transaction at time t,
+we cannot rely on data after time t since, at serving time when making a
+prediction at time t, no subsequent data is available yet. In short, at time t,
+we are limited to using data that precedes or is concurrent with time t.

+The solution is therefore to apply z-normalization **over time**, which means
+that we normalize each transaction using the mean and standard deviation
+computed from the past data **for that transaction**.

+Future leakage is pernicious. Luckily, Temporian is here to help: the only
+operator that can cause future leakage is `EventSet.leak()`. If you are not
+using `EventSet.leak()`, your preprocessing is **guaranteed** not to create
+future leakage.

+**Note:** For advanced pipelines, you can also check programmatically that a
+feature does not depend on an `EventSet.leak()` operation.
+"""

+# Cast all values (e.g. ints) to floats.
+values = all_data[input_feature_names].cast(tp.float32)

+# Apply z-normalization over time.
+normalized_features = (
+    values - values.simple_moving_average(math.inf)
+) / values.moving_standard_deviation(math.inf)

+# Restore the original name of the features.
+normalized_features = normalized_features.rename(values.schema.feature_names())

+print(normalized_features)

+"""
+The first transactions will be normalized using poor estimates of the mean and
+standard deviation since there are only a few transactions before them. To
+mitigate this issue, we remove the first week of data from the training dataset.

+Notice that the first values contain NaN. In Temporian, NaN represents missing
+values, and all operators handle them accordingly. For instance, when
+calculating a moving average, NaN values are not included in the calculation
+and do not generate a NaN result.

+However, neural networks cannot natively handle NaN values. So, we replace them
+with zeros.
+"""

+normalized_features = normalized_features.fillna(0.0)

+"""
+Finally, we group together the features and the labels.
+"""

+normalized_all_data = tp.glue(normalized_features, all_data["TX_FRAUD"])

+"""
+## Split dataset into a train, validation and test set

+To evaluate the quality of our machine learning model, we need training,
+validation and test sets.
Since the system is dynamic (new fraud patterns are +being created all the time), it is important for the training set to come before +the validation set, and the validation set come before the testing set: + +- **Training:** April 8, 2018 to July 31, 2018 +- **Validation:** August 1, 2018 to August 31, 2018 +- **Testing:** September 1, 2018 to September 30, 2018 + +For the example to run faster, we will effectively reduce the size of the +training set to: +- **Training:** July 1, 2018 to July 31, 2018 +""" + +# begin_train = datetime.datetime(2018, 4, 8).timestamp() # Full training dataset +begin_train = datetime.datetime(2018, 7, 1).timestamp() # Reduced training dataset +begin_valid = datetime.datetime(2018, 8, 1).timestamp() +begin_test = datetime.datetime(2018, 9, 1).timestamp() + +is_train = (normalized_all_data.timestamps() >= begin_train) & ( + normalized_all_data.timestamps() < begin_valid +) +is_valid = (normalized_all_data.timestamps() >= begin_valid) & ( + normalized_all_data.timestamps() < begin_test +) +is_test = normalized_all_data.timestamps() >= begin_test + +""" +`is_train`, `is_valid` and `is_test` are boolean features overtime that indicate +the limit of the tree folds. Let's plot them. +""" + +tp.plot( + [ + is_train.rename("is_train"), + is_valid.rename("is_valid"), + is_test.rename("is_test"), + ] +) + +""" +We filter the input features and label in each fold. +""" + +train_ds_evset = normalized_all_data.filter(is_train) +valid_ds_evset = normalized_all_data.filter(is_valid) +test_ds_evset = normalized_all_data.filter(is_test) + +print(f"Training examples: {train_ds_evset.num_events()}") +print(f"Validation examples: {valid_ds_evset.num_events()}") +print(f"Testing examples: {test_ds_evset.num_events()}") + +""" +It is important to split the dataset **after** the features have been computed +because some of the features for the training dataset are computed from +transactions during the training window. + +## Create TensorFlow datasets + +We convert the datasets from EventSets to TensorFlow Datasets as Keras consumes +them natively. +""" + +non_batched_train_ds = tp.to_tensorflow_dataset(train_ds_evset) +non_batched_valid_ds = tp.to_tensorflow_dataset(valid_ds_evset) +non_batched_test_ds = tp.to_tensorflow_dataset(test_ds_evset) + +""" +The following processing steps are applied using TensorFlow datasets: + +1. The features and labels are separated using `extract_features_and_label` in + the format that Keras expects. +1. The dataset is batched, which means that the examples are grouped into + mini-batches. +1. The training examples are shuffled to improve the quality of mini-batch + training. + +As we noted before, the dataset is imbalanced in the direction of legitimate +transactions. While we want to evaluate our model on this original distribution, +neural networks often train poorly on strongly imbalanced datasets. Therefore, +we resample the training dataset to a ratio of 80% legitimate / 20% fraudulent +using `rejection_resample`. +""" + + +def extract_features_and_label(example): + features = {k: example[k] for k in input_feature_names} + labels = tf.cast(example["TX_FRAUD"], tf.int32) + return features, labels + + +# Target ratio of fraudulent transactions in the training dataset. +target_rate = 0.2 + +# Number of examples in a mini-batch. 
+batch_size = 32

+train_ds = (
+    non_batched_train_ds.shuffle(10000)
+    .rejection_resample(
+        class_func=lambda x: tf.cast(x["TX_FRAUD"], tf.int32),
+        target_dist=[1 - target_rate, target_rate],
+        initial_dist=[1 - fraudulent_rate, fraudulent_rate],
+    )
+    .map(lambda _, x: x)  # Remove the label copy added by "rejection_resample".
+    .batch(batch_size)
+    .map(extract_features_and_label)
+    .prefetch(tf.data.AUTOTUNE)
+)

+# The test and validation datasets do not need resampling or shuffling.
+valid_ds = (
+    non_batched_valid_ds.batch(batch_size)
+    .map(extract_features_and_label)
+    .prefetch(tf.data.AUTOTUNE)
+)
+test_ds = (
+    non_batched_test_ds.batch(batch_size)
+    .map(extract_features_and_label)
+    .prefetch(tf.data.AUTOTUNE)
+)

+"""
+We print the first four examples of the training dataset. This is a simple way
+to identify some of the errors that could have been made above.
+"""

+for features, labels in train_ds.take(1):
+    print("features")
+    for feature_name, feature_value in features.items():
+        print(f"\t{feature_name}: {feature_value[:4]}")
+    print(f"labels: {labels[:4]}")

+"""
+## Train the model

+The original dataset is transactional, but the processed data is tabular and
+only contains normalized numerical values. Therefore, we train a feed-forward
+neural network.
+"""

+inputs = [keras.Input(shape=(1,), name=name) for name in input_feature_names]
+x = keras.layers.concatenate(inputs)
+x = keras.layers.Dense(32, activation="sigmoid")(x)
+x = keras.layers.Dense(16, activation="sigmoid")(x)
+x = keras.layers.Dense(1, activation="sigmoid")(x)
+model = keras.Model(inputs=inputs, outputs=x)

+"""
+Our goal is to differentiate between the fraudulent and legitimate transactions,
+so we use a binary classification objective. Because the dataset is imbalanced,
+accuracy is not an informative metric. Instead, we evaluate the model using the
+[area under the curve](https://en.wikipedia.org/wiki/Receiver_operating_characteristic#Area_under_the_curve)
+(AUC).
+"""

+model.compile(
+    optimizer=keras.optimizers.Adam(0.01),
+    loss=keras.losses.BinaryCrossentropy(),
+    metrics=[keras.metrics.Accuracy(), keras.metrics.AUC()],
+)
+model.fit(train_ds, validation_data=valid_ds)

+"""
+We evaluate the model on the test dataset.
+"""

+model.evaluate(test_ds)

+"""
+With an AUC of ~83%, our simple fraud detector is showing encouraging
+results.

+Plotting the ROC curve is a good way to understand and select the operating
+point of the model, i.e., the threshold applied to the model output to
+differentiate between fraudulent and legitimate transactions.

+Compute the test predictions:
+"""

+predictions = model.predict(test_ds)
+predictions = np.nan_to_num(predictions, nan=0)

+"""
+Extract the labels from the test set:
+"""

+labels = np.concatenate([label for _, label in test_ds])

+"""
+Finally, we plot the ROC curve.
+"""

+_ = RocCurveDisplay.from_predictions(labels, predictions)


+"""
+The Keras model is ready to be used on transactions with an unknown fraud
+status, a.k.a. serving. We save the model on disk for future use.

+**Note:** The model does not include the data preparation and preprocessing steps
+done in Pandas and Temporian. They have to be applied manually to the data fed
+into the model. While not demonstrated here, Temporian preprocessing can also be
+saved to disk with
+[tp.save](https://temporian.readthedocs.io/en/latest/reference/temporian/serialization/save/).
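+
+Serving also requires choosing the operating threshold discussed above with the ROC
+curve. A minimal sketch using scikit-learn's `roc_curve` (the 1% false-positive budget
+is an arbitrary example, not a recommendation from this tutorial):
+
+```python
+from sklearn.metrics import roc_curve
+
+fpr, tpr, thresholds = roc_curve(labels, predictions.ravel())
+# Lowest threshold (hence highest recall) that still keeps the false positive
+# rate below the illustrative 1% budget.
+operating_threshold = thresholds[fpr <= 0.01][-1]
+print("Operating threshold:", operating_threshold)
+```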
+""" + +model.save("fraud_detection_model.keras") + +""" +The model can be later reloaded with: +""" + +loaded_model = keras.saving.load_model("fraud_detection_model.keras") + +# Generate predictions with the loaded model on 5 test examples. +loaded_model.predict(test_ds.rebatch(5).take(1)) + +""" +## Conclusion + +We trained a feed-forward neural network to identify fraudulent transactions. To +feed them into the model, the transactions were preprocessed and transformed +into a tabular dataset using +[Temporian](https://temporian.readthedocs.io/en/latest/). Now, a question to the +reader: What could be done to further improve the model's performance? + +Here are some ideas: + +- Train the model on the entire dataset instead of a single month of data. +- Train the model for more epochs and use early stopping to ensure that the + model is fully trained without overfitting. +- Make the feed-forward network more powerful by increasing the number of layers + while ensuring that the model is regularized. +- Compute additional preprocessing features. For example, in addition to + aggregating transactions by terminal, aggregate transactions by client. +- Use the Keras Tuner to perform hyperparameter tuning on the model. Note that + the parameters of the preprocessing (e.g., the number of days of + aggregations) are also hyperparameters that can be tuned. +""" diff --git a/knowledge_base/timeseries/timeseries_anomaly_detection.py b/knowledge_base/timeseries/timeseries_anomaly_detection.py new file mode 100644 index 0000000000000000000000000000000000000000..452e4d761af537d444a75133a60f8db47ccf3ecf --- /dev/null +++ b/knowledge_base/timeseries/timeseries_anomaly_detection.py @@ -0,0 +1,306 @@ +""" +Title: Timeseries anomaly detection using an Autoencoder +Author: [pavithrasv](https://github.com/pavithrasv) +Date created: 2020/05/31 +Last modified: 2020/05/31 +Description: Detect anomalies in a timeseries using an Autoencoder. +Accelerator: GPU +""" + +""" +## Introduction + +This script demonstrates how you can use a reconstruction convolutional +autoencoder model to detect anomalies in timeseries data. +""" + +""" +## Setup +""" + +import numpy as np +import pandas as pd +import keras +from keras import layers +from matplotlib import pyplot as plt + +""" +## Load the data + +We will use the [Numenta Anomaly Benchmark(NAB)]( +https://www.kaggle.com/boltzmannbrain/nab) dataset. It provides artificial +timeseries data containing labeled anomalous periods of behavior. Data are +ordered, timestamped, single-valued metrics. + +We will use the `art_daily_small_noise.csv` file for training and the +`art_daily_jumpsup.csv` file for testing. The simplicity of this dataset +allows us to demonstrate anomaly detection effectively. 
+""" + +master_url_root = "https://raw.githubusercontent.com/numenta/NAB/master/data/" + +df_small_noise_url_suffix = "artificialNoAnomaly/art_daily_small_noise.csv" +df_small_noise_url = master_url_root + df_small_noise_url_suffix +df_small_noise = pd.read_csv( + df_small_noise_url, parse_dates=True, index_col="timestamp" +) + +df_daily_jumpsup_url_suffix = "artificialWithAnomaly/art_daily_jumpsup.csv" +df_daily_jumpsup_url = master_url_root + df_daily_jumpsup_url_suffix +df_daily_jumpsup = pd.read_csv( + df_daily_jumpsup_url, parse_dates=True, index_col="timestamp" +) + +""" +## Quick look at the data +""" + +print(df_small_noise.head()) + +print(df_daily_jumpsup.head()) + +""" +## Visualize the data +### Timeseries data without anomalies + +We will use the following data for training. +""" +fig, ax = plt.subplots() +df_small_noise.plot(legend=False, ax=ax) +plt.show() + +""" +### Timeseries data with anomalies + +We will use the following data for testing and see if the sudden jump up in the +data is detected as an anomaly. +""" +fig, ax = plt.subplots() +df_daily_jumpsup.plot(legend=False, ax=ax) +plt.show() + +""" +## Prepare training data + +Get data values from the training timeseries data file and normalize the +`value` data. We have a `value` for every 5 mins for 14 days. + +- 24 * 60 / 5 = **288 timesteps per day** +- 288 * 14 = **4032 data points** in total +""" + + +# Normalize and save the mean and std we get, +# for normalizing test data. +training_mean = df_small_noise.mean() +training_std = df_small_noise.std() +df_training_value = (df_small_noise - training_mean) / training_std +print("Number of training samples:", len(df_training_value)) + +""" +### Create sequences +Create sequences combining `TIME_STEPS` contiguous data values from the +training data. +""" + +TIME_STEPS = 288 + + +# Generated training sequences for use in the model. +def create_sequences(values, time_steps=TIME_STEPS): + output = [] + for i in range(len(values) - time_steps + 1): + output.append(values[i : (i + time_steps)]) + return np.stack(output) + + +x_train = create_sequences(df_training_value.values) +print("Training input shape: ", x_train.shape) + +""" +## Build a model + +We will build a convolutional reconstruction autoencoder model. The model will +take input of shape `(batch_size, sequence_length, num_features)` and return +output of the same shape. In this case, `sequence_length` is 288 and +`num_features` is 1. +""" + +model = keras.Sequential( + [ + layers.Input(shape=(x_train.shape[1], x_train.shape[2])), + layers.Conv1D( + filters=32, + kernel_size=7, + padding="same", + strides=2, + activation="relu", + ), + layers.Dropout(rate=0.2), + layers.Conv1D( + filters=16, + kernel_size=7, + padding="same", + strides=2, + activation="relu", + ), + layers.Conv1DTranspose( + filters=16, + kernel_size=7, + padding="same", + strides=2, + activation="relu", + ), + layers.Dropout(rate=0.2), + layers.Conv1DTranspose( + filters=32, + kernel_size=7, + padding="same", + strides=2, + activation="relu", + ), + layers.Conv1DTranspose(filters=1, kernel_size=7, padding="same"), + ] +) +model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001), loss="mse") +model.summary() + +""" +## Train the model + +Please note that we are using `x_train` as both the input and the target +since this is a reconstruction model. 
+""" + +history = model.fit( + x_train, + x_train, + epochs=50, + batch_size=128, + validation_split=0.1, + callbacks=[ + keras.callbacks.EarlyStopping(monitor="val_loss", patience=5, mode="min") + ], +) + +""" +Let's plot training and validation loss to see how the training went. +""" + +plt.plot(history.history["loss"], label="Training Loss") +plt.plot(history.history["val_loss"], label="Validation Loss") +plt.legend() +plt.show() + +""" +## Detecting anomalies + +We will detect anomalies by determining how well our model can reconstruct +the input data. + + +1. Find MAE loss on training samples. +2. Find max MAE loss value. This is the worst our model has performed trying +to reconstruct a sample. We will make this the `threshold` for anomaly +detection. +3. If the reconstruction loss for a sample is greater than this `threshold` +value then we can infer that the model is seeing a pattern that it isn't +familiar with. We will label this sample as an `anomaly`. + + +""" + +# Get train MAE loss. +x_train_pred = model.predict(x_train) +train_mae_loss = np.mean(np.abs(x_train_pred - x_train), axis=1) + +plt.hist(train_mae_loss, bins=50) +plt.xlabel("Train MAE loss") +plt.ylabel("No of samples") +plt.show() + +# Get reconstruction loss threshold. +threshold = np.max(train_mae_loss) +print("Reconstruction error threshold: ", threshold) + +""" +### Compare recontruction + +Just for fun, let's see how our model has recontructed the first sample. +This is the 288 timesteps from day 1 of our training dataset. +""" + +# Checking how the first sequence is learnt +plt.plot(x_train[0]) +plt.plot(x_train_pred[0]) +plt.show() + +""" +### Prepare test data +""" + + +df_test_value = (df_daily_jumpsup - training_mean) / training_std +fig, ax = plt.subplots() +df_test_value.plot(legend=False, ax=ax) +plt.show() + +# Create sequences from test values. +x_test = create_sequences(df_test_value.values) +print("Test input shape: ", x_test.shape) + +# Get test MAE loss. +x_test_pred = model.predict(x_test) +test_mae_loss = np.mean(np.abs(x_test_pred - x_test), axis=1) +test_mae_loss = test_mae_loss.reshape((-1)) + +plt.hist(test_mae_loss, bins=50) +plt.xlabel("test MAE loss") +plt.ylabel("No of samples") +plt.show() + +# Detect all the samples which are anomalies. +anomalies = test_mae_loss > threshold +print("Number of anomaly samples: ", np.sum(anomalies)) +print("Indices of anomaly samples: ", np.where(anomalies)) + +""" +## Plot anomalies + +We now know the samples of the data which are anomalies. With this, we will +find the corresponding `timestamps` from the original test data. We will be +using the following method to do that: + +Let's say time_steps = 3 and we have 10 training values. Our `x_train` will +look like this: + +- 0, 1, 2 +- 1, 2, 3 +- 2, 3, 4 +- 3, 4, 5 +- 4, 5, 6 +- 5, 6, 7 +- 6, 7, 8 +- 7, 8, 9 + +All except the initial and the final time_steps-1 data values, will appear in +`time_steps` number of samples. So, if we know that the samples +[(3, 4, 5), (4, 5, 6), (5, 6, 7)] are anomalies, we can say that the data point +5 is an anomaly. +""" + +# data i is an anomaly if samples [(i - timesteps + 1) to (i)] are anomalies +anomalous_data_indices = [] +for data_idx in range(TIME_STEPS - 1, len(df_test_value) - TIME_STEPS + 1): + if np.all(anomalies[data_idx - TIME_STEPS + 1 : data_idx]): + anomalous_data_indices.append(data_idx) + +""" +Let's overlay the anomalies on the original test data plot. 
+""" + +df_subset = df_daily_jumpsup.iloc[anomalous_data_indices] +fig, ax = plt.subplots() +df_daily_jumpsup.plot(legend=False, ax=ax) +df_subset.plot(legend=False, ax=ax, color="r") +plt.show() diff --git a/knowledge_base/timeseries/timeseries_classification_from_scratch.py b/knowledge_base/timeseries/timeseries_classification_from_scratch.py new file mode 100755 index 0000000000000000000000000000000000000000..b04ebe8de19c9168d82aa04729ebd3f1b7ba4e81 --- /dev/null +++ b/knowledge_base/timeseries/timeseries_classification_from_scratch.py @@ -0,0 +1,227 @@ +""" +Title: Timeseries classification from scratch +Author: [hfawaz](https://github.com/hfawaz/) +Date created: 2020/07/21 +Last modified: 2023/11/10 +Description: Training a timeseries classifier from scratch on the FordA dataset from the UCR/UEA archive. +Accelerator: GPU +""" + +""" +## Introduction + +This example shows how to do timeseries classification from scratch, starting from raw +CSV timeseries files on disk. We demonstrate the workflow on the FordA dataset from the +[UCR/UEA archive](https://www.cs.ucr.edu/%7Eeamonn/time_series_data_2018/). + +""" + +""" +## Setup + +""" +import keras +import numpy as np +import matplotlib.pyplot as plt + +""" +## Load the data: the FordA dataset + +### Dataset description + +The dataset we are using here is called FordA. +The data comes from the UCR archive. +The dataset contains 3601 training instances and another 1320 testing instances. +Each timeseries corresponds to a measurement of engine noise captured by a motor sensor. +For this task, the goal is to automatically detect the presence of a specific issue with +the engine. The problem is a balanced binary classification task. The full description of +this dataset can be found [here](http://www.j-wichard.de/publications/FordPaper.pdf). + +### Read the TSV data + +We will use the `FordA_TRAIN` file for training and the +`FordA_TEST` file for testing. The simplicity of this dataset +allows us to demonstrate effectively how to use ConvNets for timeseries classification. +In this file, the first column corresponds to the label. +""" + + +def readucr(filename): + data = np.loadtxt(filename, delimiter="\t") + y = data[:, 0] + x = data[:, 1:] + return x, y.astype(int) + + +root_url = "https://raw.githubusercontent.com/hfawaz/cd-diagram/master/FordA/" + +x_train, y_train = readucr(root_url + "FordA_TRAIN.tsv") +x_test, y_test = readucr(root_url + "FordA_TEST.tsv") + +""" +## Visualize the data + +Here we visualize one timeseries example for each class in the dataset. + +""" + +classes = np.unique(np.concatenate((y_train, y_test), axis=0)) + +plt.figure() +for c in classes: + c_x_train = x_train[y_train == c] + plt.plot(c_x_train[0], label="class " + str(c)) +plt.legend(loc="best") +plt.show() +plt.close() + +""" +## Standardize the data + +Our timeseries are already in a single length (500). However, their values are +usually in various ranges. This is not ideal for a neural network; +in general we should seek to make the input values normalized. +For this specific dataset, the data is already z-normalized: each timeseries sample +has a mean equal to zero and a standard deviation equal to one. This type of +normalization is very common for timeseries classification problems, see +[Bagnall et al. (2016)](https://link.springer.com/article/10.1007/s10618-016-0483-9). + +Note that the timeseries data used here are univariate, meaning we only have one channel +per timeseries example. 
+We will therefore transform the timeseries into a multivariate one with one channel +using a simple reshaping via numpy. +This will allow us to construct a model that is easily applicable to multivariate time +series. +""" + +x_train = x_train.reshape((x_train.shape[0], x_train.shape[1], 1)) +x_test = x_test.reshape((x_test.shape[0], x_test.shape[1], 1)) + +""" +Finally, in order to use `sparse_categorical_crossentropy`, we will have to count +the number of classes beforehand. +""" + +num_classes = len(np.unique(y_train)) + +""" +Now we shuffle the training set because we will be using the `validation_split` option +later when training. +""" + +idx = np.random.permutation(len(x_train)) +x_train = x_train[idx] +y_train = y_train[idx] + +""" +Standardize the labels to positive integers. +The expected labels will then be 0 and 1. +""" + +y_train[y_train == -1] = 0 +y_test[y_test == -1] = 0 + +""" +## Build a model + +We build a Fully Convolutional Neural Network originally proposed in +[this paper](https://arxiv.org/abs/1611.06455). +The implementation is based on the TF 2 version provided +[here](https://github.com/hfawaz/dl-4-tsc/). +The following hyperparameters (kernel_size, filters, the usage of BatchNorm) were found +via random search using [KerasTuner](https://github.com/keras-team/keras-tuner). + +""" + + +def make_model(input_shape): + input_layer = keras.layers.Input(input_shape) + + conv1 = keras.layers.Conv1D(filters=64, kernel_size=3, padding="same")(input_layer) + conv1 = keras.layers.BatchNormalization()(conv1) + conv1 = keras.layers.ReLU()(conv1) + + conv2 = keras.layers.Conv1D(filters=64, kernel_size=3, padding="same")(conv1) + conv2 = keras.layers.BatchNormalization()(conv2) + conv2 = keras.layers.ReLU()(conv2) + + conv3 = keras.layers.Conv1D(filters=64, kernel_size=3, padding="same")(conv2) + conv3 = keras.layers.BatchNormalization()(conv3) + conv3 = keras.layers.ReLU()(conv3) + + gap = keras.layers.GlobalAveragePooling1D()(conv3) + + output_layer = keras.layers.Dense(num_classes, activation="softmax")(gap) + + return keras.models.Model(inputs=input_layer, outputs=output_layer) + + +model = make_model(input_shape=x_train.shape[1:]) +keras.utils.plot_model(model, show_shapes=True) + +""" +## Train the model + +""" + +epochs = 500 +batch_size = 32 + +callbacks = [ + keras.callbacks.ModelCheckpoint( + "best_model.keras", save_best_only=True, monitor="val_loss" + ), + keras.callbacks.ReduceLROnPlateau( + monitor="val_loss", factor=0.5, patience=20, min_lr=0.0001 + ), + keras.callbacks.EarlyStopping(monitor="val_loss", patience=50, verbose=1), +] +model.compile( + optimizer="adam", + loss="sparse_categorical_crossentropy", + metrics=["sparse_categorical_accuracy"], +) +history = model.fit( + x_train, + y_train, + batch_size=batch_size, + epochs=epochs, + callbacks=callbacks, + validation_split=0.2, + verbose=1, +) + +""" +## Evaluate model on test data +""" + +model = keras.models.load_model("best_model.keras") + +test_loss, test_acc = model.evaluate(x_test, y_test) + +print("Test accuracy", test_acc) +print("Test loss", test_loss) + +""" +## Plot the model's training and validation loss +""" + +metric = "sparse_categorical_accuracy" +plt.figure() +plt.plot(history.history[metric]) +plt.plot(history.history["val_" + metric]) +plt.title("model " + metric) +plt.ylabel(metric, fontsize="large") +plt.xlabel("epoch", fontsize="large") +plt.legend(["train", "val"], loc="best") +plt.show() +plt.close() + +""" +We can see how the training accuracy reaches almost 0.95 after 100 epochs. 
+However, by observing the validation accuracy we can see how the network still needs +training until it reaches almost 0.97 for both the validation and the training accuracy +after 200 epochs. Beyond the 200th epoch, if we continue on training, the validation +accuracy will start decreasing while the training accuracy will continue on increasing: +the model starts overfitting. +""" diff --git a/knowledge_base/timeseries/timeseries_classification_transformer.py b/knowledge_base/timeseries/timeseries_classification_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..7bfc11d1da4ae4535c3e7f1cd3028393a199d8f2 --- /dev/null +++ b/knowledge_base/timeseries/timeseries_classification_transformer.py @@ -0,0 +1,174 @@ +""" +Title: Timeseries classification with a Transformer model +Author: [Theodoros Ntakouris](https://github.com/ntakouris) +Date created: 2021/06/25 +Last modified: 2021/08/05 +Description: This notebook demonstrates how to do timeseries classification using a Transformer model. +Accelerator: GPU +""" + +""" +## Introduction + +This is the Transformer architecture from +[Attention Is All You Need](https://arxiv.org/abs/1706.03762), +applied to timeseries instead of natural language. + +This example requires TensorFlow 2.4 or higher. + +## Load the dataset + +We are going to use the same dataset and preprocessing as the +[TimeSeries Classification from Scratch](https://keras.io/examples/timeseries/timeseries_classification_from_scratch) +example. +""" + +import numpy as np +import keras +from keras import layers + + +def readucr(filename): + data = np.loadtxt(filename, delimiter="\t") + y = data[:, 0] + x = data[:, 1:] + return x, y.astype(int) + + +root_url = "https://raw.githubusercontent.com/hfawaz/cd-diagram/master/FordA/" + +x_train, y_train = readucr(root_url + "FordA_TRAIN.tsv") +x_test, y_test = readucr(root_url + "FordA_TEST.tsv") + +x_train = x_train.reshape((x_train.shape[0], x_train.shape[1], 1)) +x_test = x_test.reshape((x_test.shape[0], x_test.shape[1], 1)) + +n_classes = len(np.unique(y_train)) + +idx = np.random.permutation(len(x_train)) +x_train = x_train[idx] +y_train = y_train[idx] + +y_train[y_train == -1] = 0 +y_test[y_test == -1] = 0 + +""" +## Build the model + +Our model processes a tensor of shape `(batch size, sequence length, features)`, +where `sequence length` is the number of time steps and `features` is each input +timeseries. + +You can replace your classification RNN layers with this one: the +inputs are fully compatible! + +We include residual connections, layer normalization, and dropout. +The resulting layer can be stacked multiple times. + +The projection layers are implemented through `keras.layers.Conv1D`. +""" + +# This implementation applies Layer Normalization before the residual connection +# to improve training stability by producing better-behaved gradients and often +# eliminating the need for learning rate warm-up. 
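+# Concretely, each encoder block below applies multi-head self-attention, dropout and
+# layer normalization with a residual connection, followed by a two-layer pointwise
+# feed-forward network (Conv1D with kernel_size=1) that is also normalized and wrapped
+# in a residual connection.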
+ + +def transformer_encoder(inputs, head_size, num_heads, ff_dim, dropout=0): + # Attention and Normalization + x = layers.MultiHeadAttention( + key_dim=head_size, num_heads=num_heads, dropout=dropout + )(inputs, inputs) + x = layers.Dropout(dropout)(x) + x = layers.LayerNormalization(epsilon=1e-6)(x) + res = x + inputs + + # Feed Forward Part + x = layers.Conv1D(filters=ff_dim, kernel_size=1, activation="relu")(res) + x = layers.Dropout(dropout)(x) + x = layers.Conv1D(filters=inputs.shape[-1], kernel_size=1)(x) + x = layers.LayerNormalization(epsilon=1e-6)(x) + return x + res + + +""" +The main part of our model is now complete. We can stack multiple of those +`transformer_encoder` blocks and we can also proceed to add the final +Multi-Layer Perceptron classification head. Apart from a stack of `Dense` +layers, we need to reduce the output tensor of the `TransformerEncoder` part of +our model down to a vector of features for each data point in the current +batch. A common way to achieve this is to use a pooling layer. For +this example, a `GlobalAveragePooling1D` layer is sufficient. +""" + + +def build_model( + input_shape, + head_size, + num_heads, + ff_dim, + num_transformer_blocks, + mlp_units, + dropout=0, + mlp_dropout=0, +): + inputs = keras.Input(shape=input_shape) + x = inputs + for _ in range(num_transformer_blocks): + x = transformer_encoder(x, head_size, num_heads, ff_dim, dropout) + + x = layers.GlobalAveragePooling1D(data_format="channels_last")(x) + for dim in mlp_units: + x = layers.Dense(dim, activation="relu")(x) + x = layers.Dropout(mlp_dropout)(x) + outputs = layers.Dense(n_classes, activation="softmax")(x) + return keras.Model(inputs, outputs) + + +""" +## Train and evaluate +""" + +input_shape = x_train.shape[1:] + +model = build_model( + input_shape, + head_size=256, + num_heads=4, + ff_dim=4, + num_transformer_blocks=4, + mlp_units=[128], + mlp_dropout=0.4, + dropout=0.25, +) + +model.compile( + loss="sparse_categorical_crossentropy", + optimizer=keras.optimizers.Adam(learning_rate=1e-4), + metrics=["sparse_categorical_accuracy"], +) +model.summary() + +callbacks = [keras.callbacks.EarlyStopping(patience=10, restore_best_weights=True)] + +model.fit( + x_train, + y_train, + validation_split=0.2, + epochs=150, + batch_size=64, + callbacks=callbacks, +) + +model.evaluate(x_test, y_test, verbose=1) + +""" +## Conclusions + +In about 110-120 epochs (25s each on Colab), the model reaches a training +accuracy of ~0.95, validation accuracy of ~84 and a testing +accuracy of ~85, without hyperparameter tuning. And that is for a model +with less than 100k parameters. Of course, parameter count and accuracy could be +improved by a hyperparameter search and a more sophisticated learning rate +schedule, or a different optimizer. + +""" diff --git a/knowledge_base/timeseries/timeseries_traffic_forecasting.py b/knowledge_base/timeseries/timeseries_traffic_forecasting.py new file mode 100644 index 0000000000000000000000000000000000000000..ea66ee2131e54cbfbfe2a9fa6ab3e13e0a10dc82 --- /dev/null +++ b/knowledge_base/timeseries/timeseries_traffic_forecasting.py @@ -0,0 +1,660 @@ +""" +Title: Traffic forecasting using graph neural networks and LSTM +Author: [Arash Khodadadi](https://www.linkedin.com/in/arash-khodadadi-08a02490/) +Date created: 2021/12/28 +Last modified: 2023/11/22 +Description: This example demonstrates how to do timeseries forecasting over graphs. 
+Accelerator: GPU +""" + +""" +## Introduction + +This example shows how to forecast traffic condition using graph neural networks and LSTM. +Specifically, we are interested in predicting the future values of the traffic speed given +a history of the traffic speed for a collection of road segments. + +One popular method to +solve this problem is to consider each road segment's traffic speed as a separate +timeseries and predict the future values of each timeseries +using the past values of the same timeseries. + +This method, however, ignores the dependency of the traffic speed of one road segment on +the neighboring segments. To be able to take into account the complex interactions between +the traffic speed on a collection of neighboring roads, we can define the traffic network +as a graph and consider the traffic speed as a signal on this graph. In this example, +we implement a neural network architecture which can process timeseries data over a graph. +We first show how to process the data and create a +[tf.data.Dataset](https://www.tensorflow.org/api_docs/python/tf/data/Dataset) for +forecasting over graphs. Then, we implement a model which uses graph convolution and +LSTM layers to perform forecasting over a graph. + +The data processing and the model architecture are inspired by this paper: + +Yu, Bing, Haoteng Yin, and Zhanxing Zhu. "Spatio-temporal graph convolutional networks: +a deep learning framework for traffic forecasting." Proceedings of the 27th International +Joint Conference on Artificial Intelligence, 2018. +([github](https://github.com/VeritasYin/STGCN_IJCAI-18)) +""" + +""" +## Setup +""" + +import os + +os.environ["KERAS_BACKEND"] = "tensorflow" + +import pandas as pd +import numpy as np +import typing +import matplotlib.pyplot as plt + +import tensorflow as tf +import keras +from keras import layers +from keras import ops + +""" +## Data preparation +""" + +""" +### Data description + +We use a real-world traffic speed dataset named `PeMSD7`. We use the version +collected and prepared by [Yu et al., 2018](https://arxiv.org/abs/1709.04875) +and available +[here](https://github.com/VeritasYin/STGCN_IJCAI-18/tree/master/dataset). + +The data consists of two files: + +- `PeMSD7_W_228.csv` contains the distances between 228 +stations across the District 7 of California. +- `PeMSD7_V_228.csv` contains traffic +speed collected for those stations in the weekdays of May and June of 2012. + +The full description of the dataset can be found in +[Yu et al., 2018](https://arxiv.org/abs/1709.04875). +""" + +""" +### Loading data +""" + +url = "https://github.com/VeritasYin/STGCN_IJCAI-18/raw/master/dataset/PeMSD7_Full.zip" +data_dir = keras.utils.get_file(origin=url, extract=True, archive_format="zip") +data_dir = data_dir.rstrip("PeMSD7_Full.zip") + +route_distances = pd.read_csv( + os.path.join(data_dir, "PeMSD7_W_228.csv"), header=None +).to_numpy() +speeds_array = pd.read_csv( + os.path.join(data_dir, "PeMSD7_V_228.csv"), header=None +).to_numpy() + +print(f"route_distances shape={route_distances.shape}") +print(f"speeds_array shape={speeds_array.shape}") + +""" +### sub-sampling roads + +To reduce the problem size and make the training faster, we will only +work with a sample of 26 roads out of the 228 roads in the dataset. +We have chosen the roads by starting from road 0, choosing the 5 closest +roads to it, and continuing this process until we get 25 roads. You can choose +any other subset of the roads. 
We chose the roads in this way to increase the likelihood +of having roads with correlated speed timeseries. +`sample_routes` contains the IDs of the selected roads. +""" + +sample_routes = [ + 0, + 1, + 4, + 7, + 8, + 11, + 15, + 108, + 109, + 114, + 115, + 118, + 120, + 123, + 124, + 126, + 127, + 129, + 130, + 132, + 133, + 136, + 139, + 144, + 147, + 216, +] +route_distances = route_distances[np.ix_(sample_routes, sample_routes)] +speeds_array = speeds_array[:, sample_routes] + +print(f"route_distances shape={route_distances.shape}") +print(f"speeds_array shape={speeds_array.shape}") + +""" +### Data visualization + +Here are the timeseries of the traffic speed for two of the routes: +""" + +plt.figure(figsize=(18, 6)) +plt.plot(speeds_array[:, [0, -1]]) +plt.legend(["route_0", "route_25"]) + +""" +We can also visualize the correlation between the timeseries in different routes. +""" + +plt.figure(figsize=(8, 8)) +plt.matshow(np.corrcoef(speeds_array.T), 0) +plt.xlabel("road number") +plt.ylabel("road number") + +""" +Using this correlation heatmap, we can see that for example the speed in +routes 4, 5, 6 are highly correlated. +""" + +""" +### Splitting and normalizing data + +Next, we split the speed values array into train/validation/test sets, +and normalize the resulting arrays: +""" + +train_size, val_size = 0.5, 0.2 + + +def preprocess(data_array: np.ndarray, train_size: float, val_size: float): + """Splits data into train/val/test sets and normalizes the data. + + Args: + data_array: ndarray of shape `(num_time_steps, num_routes)` + train_size: A float value between 0.0 and 1.0 that represent the proportion of the dataset + to include in the train split. + val_size: A float value between 0.0 and 1.0 that represent the proportion of the dataset + to include in the validation split. + + Returns: + `train_array`, `val_array`, `test_array` + """ + + num_time_steps = data_array.shape[0] + num_train, num_val = ( + int(num_time_steps * train_size), + int(num_time_steps * val_size), + ) + train_array = data_array[:num_train] + mean, std = train_array.mean(axis=0), train_array.std(axis=0) + + train_array = (train_array - mean) / std + val_array = (data_array[num_train : (num_train + num_val)] - mean) / std + test_array = (data_array[(num_train + num_val) :] - mean) / std + + return train_array, val_array, test_array + + +train_array, val_array, test_array = preprocess(speeds_array, train_size, val_size) + +print(f"train set size: {train_array.shape}") +print(f"validation set size: {val_array.shape}") +print(f"test set size: {test_array.shape}") + +""" +### Creating TensorFlow Datasets + +Next, we create the datasets for our forecasting problem. The forecasting problem +can be stated as follows: given a sequence of the +road speed values at times `t+1, t+2, ..., t+T`, we want to predict the future values of +the roads speed for times `t+T+1, ..., t+T+h`. So for each time `t` the inputs to our +model are `T` vectors each of size `N` and the targets are `h` vectors each of size `N`, +where `N` is the number of roads. +""" + +""" +We use the Keras built-in function +`keras.utils.timeseries_dataset_from_array`. +The function `create_tf_dataset()` below takes as input a `numpy.ndarray` and returns a +`tf.data.Dataset`. In this function `input_sequence_length=T` and `forecast_horizon=h`. + +The argument `multi_horizon` needs more explanation. Assume `forecast_horizon=3`. +If `multi_horizon=True` then the model will make a forecast for time steps +`t+T+1, t+T+2, t+T+3`. 
So the target will have shape `(T,3)`. But if +`multi_horizon=False`, the model will make a forecast only for time step `t+T+3` and +so the target will have shape `(T, 1)`. + +You may notice that the input tensor in each batch has shape +`(batch_size, input_sequence_length, num_routes, 1)`. The last dimension is added to +make the model more general: at each time step, the input features for each raod may +contain multiple timeseries. For instance, one might want to use temperature timeseries +in addition to historical values of the speed as input features. In this example, +however, the last dimension of the input is always 1. + +We use the last 12 values of the speed in each road to forecast the speed for 3 time +steps ahead: +""" + +batch_size = 64 +input_sequence_length = 12 +forecast_horizon = 3 +multi_horizon = False + + +def create_tf_dataset( + data_array: np.ndarray, + input_sequence_length: int, + forecast_horizon: int, + batch_size: int = 128, + shuffle=True, + multi_horizon=True, +): + """Creates tensorflow dataset from numpy array. + + This function creates a dataset where each element is a tuple `(inputs, targets)`. + `inputs` is a Tensor + of shape `(batch_size, input_sequence_length, num_routes, 1)` containing + the `input_sequence_length` past values of the timeseries for each node. + `targets` is a Tensor of shape `(batch_size, forecast_horizon, num_routes)` + containing the `forecast_horizon` + future values of the timeseries for each node. + + Args: + data_array: np.ndarray with shape `(num_time_steps, num_routes)` + input_sequence_length: Length of the input sequence (in number of timesteps). + forecast_horizon: If `multi_horizon=True`, the target will be the values of the timeseries for 1 to + `forecast_horizon` timesteps ahead. If `multi_horizon=False`, the target will be the value of the + timeseries `forecast_horizon` steps ahead (only one value). + batch_size: Number of timeseries samples in each batch. + shuffle: Whether to shuffle output samples, or instead draw them in chronological order. + multi_horizon: See `forecast_horizon`. + + Returns: + A tf.data.Dataset instance. + """ + + inputs = keras.utils.timeseries_dataset_from_array( + np.expand_dims(data_array[:-forecast_horizon], axis=-1), + None, + sequence_length=input_sequence_length, + shuffle=False, + batch_size=batch_size, + ) + + target_offset = ( + input_sequence_length + if multi_horizon + else input_sequence_length + forecast_horizon - 1 + ) + target_seq_length = forecast_horizon if multi_horizon else 1 + targets = keras.utils.timeseries_dataset_from_array( + data_array[target_offset:], + None, + sequence_length=target_seq_length, + shuffle=False, + batch_size=batch_size, + ) + + dataset = tf.data.Dataset.zip((inputs, targets)) + if shuffle: + dataset = dataset.shuffle(100) + + return dataset.prefetch(16).cache() + + +train_dataset, val_dataset = ( + create_tf_dataset(data_array, input_sequence_length, forecast_horizon, batch_size) + for data_array in [train_array, val_array] +) + +test_dataset = create_tf_dataset( + test_array, + input_sequence_length, + forecast_horizon, + batch_size=test_array.shape[0], + shuffle=False, + multi_horizon=multi_horizon, +) + + +""" +### Roads Graph + +As mentioned before, we assume that the road segments form a graph. +The `PeMSD7` dataset has the road segments distance. The next step +is to create the graph adjacency matrix from these distances. 
Following +[Yu et al., 2018](https://arxiv.org/abs/1709.04875) (equation 10) we assume there +is an edge between two nodes in the graph if the distance between the corresponding roads +is less than a threshold. +""" + + +def compute_adjacency_matrix( + route_distances: np.ndarray, sigma2: float, epsilon: float +): + """Computes the adjacency matrix from distances matrix. + + It uses the formula in https://github.com/VeritasYin/STGCN_IJCAI-18#data-preprocessing to + compute an adjacency matrix from the distance matrix. + The implementation follows that paper. + + Args: + route_distances: np.ndarray of shape `(num_routes, num_routes)`. Entry `i,j` of this array is the + distance between roads `i,j`. + sigma2: Determines the width of the Gaussian kernel applied to the square distances matrix. + epsilon: A threshold specifying if there is an edge between two nodes. Specifically, `A[i,j]=1` + if `np.exp(-w2[i,j] / sigma2) >= epsilon` and `A[i,j]=0` otherwise, where `A` is the adjacency + matrix and `w2=route_distances * route_distances` + + Returns: + A boolean graph adjacency matrix. + """ + num_routes = route_distances.shape[0] + route_distances = route_distances / 10000.0 + w2, w_mask = ( + route_distances * route_distances, + np.ones([num_routes, num_routes]) - np.identity(num_routes), + ) + return (np.exp(-w2 / sigma2) >= epsilon) * w_mask + + +""" +The function `compute_adjacency_matrix()` returns a boolean adjacency matrix +where 1 means there is an edge between two nodes. We use the following class +to store the information about the graph. +""" + + +class GraphInfo: + def __init__(self, edges: typing.Tuple[list, list], num_nodes: int): + self.edges = edges + self.num_nodes = num_nodes + + +sigma2 = 0.1 +epsilon = 0.5 +adjacency_matrix = compute_adjacency_matrix(route_distances, sigma2, epsilon) +node_indices, neighbor_indices = np.where(adjacency_matrix == 1) +graph = GraphInfo( + edges=(node_indices.tolist(), neighbor_indices.tolist()), + num_nodes=adjacency_matrix.shape[0], +) +print(f"number of nodes: {graph.num_nodes}, number of edges: {len(graph.edges[0])}") + +""" +## Network architecture + +Our model for forecasting over the graph consists of a graph convolution +layer and a LSTM layer. +""" + +""" +### Graph convolution layer + +Our implementation of the graph convolution layer resembles the implementation +in [this Keras example](https://keras.io/examples/graph/gnn_citations/). Note that +in that example input to the layer is a 2D tensor of shape `(num_nodes,in_feat)` +but in our example the input to the layer is a 4D tensor of shape +`(num_nodes, batch_size, input_seq_length, in_feat)`. 
The graph convolution layer +performs the following steps: + +- The nodes' representations are computed in `self.compute_nodes_representation()` +by multiplying the input features by `self.weight` +- The aggregated neighbors' messages are computed in `self.compute_aggregated_messages()` +by first aggregating the neighbors' representations and then multiplying the results by +`self.weight` +- The final output of the layer is computed in `self.update()` by combining the nodes +representations and the neighbors' aggregated messages +""" + + +class GraphConv(layers.Layer): + def __init__( + self, + in_feat, + out_feat, + graph_info: GraphInfo, + aggregation_type="mean", + combination_type="concat", + activation: typing.Optional[str] = None, + **kwargs, + ): + super().__init__(**kwargs) + self.in_feat = in_feat + self.out_feat = out_feat + self.graph_info = graph_info + self.aggregation_type = aggregation_type + self.combination_type = combination_type + self.weight = self.add_weight( + initializer=keras.initializers.GlorotUniform(), + shape=(in_feat, out_feat), + dtype="float32", + trainable=True, + ) + self.activation = layers.Activation(activation) + + def aggregate(self, neighbour_representations): + aggregation_func = { + "sum": tf.math.unsorted_segment_sum, + "mean": tf.math.unsorted_segment_mean, + "max": tf.math.unsorted_segment_max, + }.get(self.aggregation_type) + + if aggregation_func: + return aggregation_func( + neighbour_representations, + self.graph_info.edges[0], + num_segments=self.graph_info.num_nodes, + ) + + raise ValueError(f"Invalid aggregation type: {self.aggregation_type}") + + def compute_nodes_representation(self, features): + """Computes each node's representation. + + The nodes' representations are obtained by multiplying the features tensor with + `self.weight`. Note that + `self.weight` has shape `(in_feat, out_feat)`. + + Args: + features: Tensor of shape `(num_nodes, batch_size, input_seq_len, in_feat)` + + Returns: + A tensor of shape `(num_nodes, batch_size, input_seq_len, out_feat)` + """ + return ops.matmul(features, self.weight) + + def compute_aggregated_messages(self, features): + neighbour_representations = tf.gather(features, self.graph_info.edges[1]) + aggregated_messages = self.aggregate(neighbour_representations) + return ops.matmul(aggregated_messages, self.weight) + + def update(self, nodes_representation, aggregated_messages): + if self.combination_type == "concat": + h = ops.concatenate([nodes_representation, aggregated_messages], axis=-1) + elif self.combination_type == "add": + h = nodes_representation + aggregated_messages + else: + raise ValueError(f"Invalid combination type: {self.combination_type}.") + return self.activation(h) + + def call(self, features): + """Forward pass. + + Args: + features: tensor of shape `(num_nodes, batch_size, input_seq_len, in_feat)` + + Returns: + A tensor of shape `(num_nodes, batch_size, input_seq_len, out_feat)` + """ + nodes_representation = self.compute_nodes_representation(features) + aggregated_messages = self.compute_aggregated_messages(features) + return self.update(nodes_representation, aggregated_messages) + + +""" +### LSTM plus graph convolution + +By applying the graph convolution layer to the input tensor, we get another tensor +containing the nodes' representations over time (another 4D tensor). For each time +step, a node's representation is informed by the information from its neighbors. 
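+
+As a quick sanity check (purely illustrative, using the `graph` and
+`input_sequence_length` variables defined earlier), we can pass a random tensor
+through a `GraphConv` instance and inspect the output shape; with
+`combination_type="concat"`, the feature dimension grows to `2 * out_feat`:
+"""
+
+# Illustrative shape check on the sampled road graph (26 nodes); not used by
+# the rest of the example.
+dummy_features = tf.random.normal((graph.num_nodes, 8, input_sequence_length, 1))
+demo_graph_conv = GraphConv(
+    in_feat=1, out_feat=4, graph_info=graph, combination_type="concat"
+)
+# Expected output shape: (26, 8, 12, 8), i.e. 2 * out_feat features per node.
+print(demo_graph_conv(dummy_features).shape)
+
+"""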
+ +To make good forecasts, however, we need not only information from the neighbors +but also we need to process the information over time. To this end, we can pass each +node's tensor through a recurrent layer. The `LSTMGC` layer below, first applies +a graph convolution layer to the inputs and then passes the results through a +`LSTM` layer. +""" + + +class LSTMGC(layers.Layer): + """Layer comprising a convolution layer followed by LSTM and dense layers.""" + + def __init__( + self, + in_feat, + out_feat, + lstm_units: int, + input_seq_len: int, + output_seq_len: int, + graph_info: GraphInfo, + graph_conv_params: typing.Optional[dict] = None, + **kwargs, + ): + super().__init__(**kwargs) + + # graph conv layer + if graph_conv_params is None: + graph_conv_params = { + "aggregation_type": "mean", + "combination_type": "concat", + "activation": None, + } + self.graph_conv = GraphConv(in_feat, out_feat, graph_info, **graph_conv_params) + + self.lstm = layers.LSTM(lstm_units, activation="relu") + self.dense = layers.Dense(output_seq_len) + + self.input_seq_len, self.output_seq_len = input_seq_len, output_seq_len + + def call(self, inputs): + """Forward pass. + + Args: + inputs: tensor of shape `(batch_size, input_seq_len, num_nodes, in_feat)` + + Returns: + A tensor of shape `(batch_size, output_seq_len, num_nodes)`. + """ + + # convert shape to (num_nodes, batch_size, input_seq_len, in_feat) + inputs = ops.transpose(inputs, [2, 0, 1, 3]) + + gcn_out = self.graph_conv( + inputs + ) # gcn_out has shape: (num_nodes, batch_size, input_seq_len, out_feat) + shape = ops.shape(gcn_out) + num_nodes, batch_size, input_seq_len, out_feat = ( + shape[0], + shape[1], + shape[2], + shape[3], + ) + + # LSTM takes only 3D tensors as input + gcn_out = ops.reshape( + gcn_out, (batch_size * num_nodes, input_seq_len, out_feat) + ) + lstm_out = self.lstm( + gcn_out + ) # lstm_out has shape: (batch_size * num_nodes, lstm_units) + + dense_output = self.dense( + lstm_out + ) # dense_output has shape: (batch_size * num_nodes, output_seq_len) + output = ops.reshape(dense_output, (num_nodes, batch_size, self.output_seq_len)) + return ops.transpose( + output, [1, 2, 0] + ) # returns Tensor of shape (batch_size, output_seq_len, num_nodes) + + +""" +## Model training +""" + +in_feat = 1 +batch_size = 64 +epochs = 20 +input_sequence_length = 12 +forecast_horizon = 3 +multi_horizon = False +out_feat = 10 +lstm_units = 64 +graph_conv_params = { + "aggregation_type": "mean", + "combination_type": "concat", + "activation": None, +} + +st_gcn = LSTMGC( + in_feat, + out_feat, + lstm_units, + input_sequence_length, + forecast_horizon, + graph, + graph_conv_params, +) +inputs = layers.Input((input_sequence_length, graph.num_nodes, in_feat)) +outputs = st_gcn(inputs) + +model = keras.models.Model(inputs, outputs) +model.compile( + optimizer=keras.optimizers.RMSprop(learning_rate=0.0002), + loss=keras.losses.MeanSquaredError(), +) +model.fit( + train_dataset, + validation_data=val_dataset, + epochs=epochs, + callbacks=[keras.callbacks.EarlyStopping(patience=10)], +) + +""" +## Making forecasts on test set + +Now we can use the trained model to make forecasts for the test set. Below, we +compute the MAE of the model and compare it to the MAE of naive forecasts. +The naive forecasts are the last value of the speed for each node. 
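+
+Before running that comparison, here is a minimal sketch (illustrative only) of
+what the naive baseline means on the normalized `test_array`: each value is
+"predicted" by the observation `forecast_horizon` steps earlier, and we measure
+the mean absolute error of doing so.
+"""
+
+# Naive baseline sketch (illustrative): predict each value with the observation
+# `forecast_horizon` steps before it and compute the mean absolute error.
+naive_pred = test_array[:-forecast_horizon]
+naive_true = test_array[forecast_horizon:]
+print("naive baseline MAE (sketch):", np.abs(naive_true - naive_pred).mean())
+
+"""
+The cell below makes the same kind of comparison using the trained model on the
+actual test batches.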
+""" + +x_test, y = next(test_dataset.as_numpy_iterator()) +y_pred = model.predict(x_test) +plt.figure(figsize=(18, 6)) +plt.plot(y[:, 0, 0]) +plt.plot(y_pred[:, 0, 0]) +plt.legend(["actual", "forecast"]) + +naive_mse, model_mse = ( + np.square(x_test[:, -1, :, 0] - y[:, 0, :]).mean(), + np.square(y_pred[:, 0, :] - y[:, 0, :]).mean(), +) +print(f"naive MAE: {naive_mse}, model MAE: {model_mse}") + +""" +Of course, the goal here is to demonstrate the method, +not to achieve the best performance. To improve the +model's accuracy, all model hyperparameters should be tuned carefully. In addition, +several of the `LSTMGC` blocks can be stacked to increase the representation power +of the model. +""" diff --git a/knowledge_base/timeseries/timeseries_weather_forecasting.py b/knowledge_base/timeseries/timeseries_weather_forecasting.py new file mode 100644 index 0000000000000000000000000000000000000000..0b30771948734df88e7ccb7d1ffd620a66521692 --- /dev/null +++ b/knowledge_base/timeseries/timeseries_weather_forecasting.py @@ -0,0 +1,361 @@ +""" +Title: Timeseries forecasting for weather prediction +Authors: [Prabhanshu Attri](https://prabhanshu.com/github), [Yashika Sharma](https://github.com/yashika51), [Kristi Takach](https://github.com/ktakattack), [Falak Shah](https://github.com/falaktheoptimist) +Date created: 2020/06/23 +Last modified: 2023/11/22 +Description: This notebook demonstrates how to do timeseries forecasting using a LSTM model. +Accelerator: GPU +""" + +""" +## Setup +""" + +import pandas as pd +import matplotlib.pyplot as plt +import keras + +""" +## Climate Data Time-Series + +We will be using Jena Climate dataset recorded by the +[Max Planck Institute for Biogeochemistry](https://www.bgc-jena.mpg.de/wetter/). +The dataset consists of 14 features such as temperature, pressure, humidity etc, recorded once per +10 minutes. + +**Location**: Weather Station, Max Planck Institute for Biogeochemistry +in Jena, Germany + +**Time-frame Considered**: Jan 10, 2009 - December 31, 2016 + + +The table below shows the column names, their value formats, and their description. + +Index| Features |Format |Description +-----|---------------|-------------------|----------------------- +1 |Date Time |01.01.2009 00:10:00|Date-time reference +2 |p (mbar) |996.52 |The pascal SI derived unit of pressure used to quantify internal pressure. Meteorological reports typically state atmospheric pressure in millibars. +3 |T (degC) |-8.02 |Temperature in Celsius +4 |Tpot (K) |265.4 |Temperature in Kelvin +5 |Tdew (degC) |-8.9 |Temperature in Celsius relative to humidity. Dew Point is a measure of the absolute amount of water in the air, the DP is the temperature at which the air cannot hold all the moisture in it and water condenses. +6 |rh (%) |93.3 |Relative Humidity is a measure of how saturated the air is with water vapor, the %RH determines the amount of water contained within collection objects. +7 |VPmax (mbar) |3.33 |Saturation vapor pressure +8 |VPact (mbar) |3.11 |Vapor pressure +9 |VPdef (mbar) |0.22 |Vapor pressure deficit +10 |sh (g/kg) |1.94 |Specific humidity +11 |H2OC (mmol/mol)|3.12 |Water vapor concentration +12 |rho (g/m ** 3) |1307.75 |Airtight +13 |wv (m/s) |1.03 |Wind speed +14 |max. 
wv (m/s) |1.75 |Maximum wind speed +15 |wd (deg) |152.3 |Wind direction in degrees +""" + +from zipfile import ZipFile + +uri = "https://storage.googleapis.com/tensorflow/tf-keras-datasets/jena_climate_2009_2016.csv.zip" +zip_path = keras.utils.get_file(origin=uri, fname="jena_climate_2009_2016.csv.zip") +zip_file = ZipFile(zip_path) +zip_file.extractall() +csv_path = "jena_climate_2009_2016.csv" + +df = pd.read_csv(csv_path) + +""" +## Raw Data Visualization + +To give us a sense of the data we are working with, each feature has been plotted below. +This shows the distinct pattern of each feature over the time period from 2009 to 2016. +It also shows where anomalies are present, which will be addressed during normalization. +""" + +titles = [ + "Pressure", + "Temperature", + "Temperature in Kelvin", + "Temperature (dew point)", + "Relative Humidity", + "Saturation vapor pressure", + "Vapor pressure", + "Vapor pressure deficit", + "Specific humidity", + "Water vapor concentration", + "Airtight", + "Wind speed", + "Maximum wind speed", + "Wind direction in degrees", +] + +feature_keys = [ + "p (mbar)", + "T (degC)", + "Tpot (K)", + "Tdew (degC)", + "rh (%)", + "VPmax (mbar)", + "VPact (mbar)", + "VPdef (mbar)", + "sh (g/kg)", + "H2OC (mmol/mol)", + "rho (g/m**3)", + "wv (m/s)", + "max. wv (m/s)", + "wd (deg)", +] + +colors = [ + "blue", + "orange", + "green", + "red", + "purple", + "brown", + "pink", + "gray", + "olive", + "cyan", +] + +date_time_key = "Date Time" + + +def show_raw_visualization(data): + time_data = data[date_time_key] + fig, axes = plt.subplots( + nrows=7, ncols=2, figsize=(15, 20), dpi=80, facecolor="w", edgecolor="k" + ) + for i in range(len(feature_keys)): + key = feature_keys[i] + c = colors[i % (len(colors))] + t_data = data[key] + t_data.index = time_data + t_data.head() + ax = t_data.plot( + ax=axes[i // 2, i % 2], + color=c, + title="{} - {}".format(titles[i], key), + rot=25, + ) + ax.legend([titles[i]]) + plt.tight_layout() + + +show_raw_visualization(df) + + +""" +## Data Preprocessing + +Here we are picking ~300,000 data points for training. Observation is recorded every +10 mins, that means 6 times per hour. We will resample one point per hour since no +drastic change is expected within 60 minutes. We do this via the `sampling_rate` +argument in `timeseries_dataset_from_array` utility. + +We are tracking data from past 720 timestamps (720/6=120 hours). This data will be +used to predict the temperature after 72 timestamps (72/6=12 hours). + +Since every feature has values with +varying ranges, we do normalization to confine feature values to a range of `[0, 1]` before +training a neural network. +We do this by subtracting the mean and dividing by the standard deviation of each feature. + +71.5 % of the data will be used to train the model, i.e. 300,693 rows. `split_fraction` can +be changed to alter this percentage. + +The model is shown data for first 5 days i.e. 720 observations, that are sampled every +hour. The temperature after 72 (12 hours * 6 observation per hour) observation will be +used as a label. 
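+
+As a quick, illustrative check of the arithmetic above (the constants match the
+ones defined in the next cell):
+"""
+
+# Illustrative windowing arithmetic: records arrive every 10 minutes,
+# i.e. 6 observations per hour.
+observations_per_hour = 6
+print(720 / observations_per_hour)  # 120.0 hours of history (5 days)
+print(72 / observations_per_hour)   # 12.0 hours ahead for the label
+print(int(0.715 * df.shape[0]))     # ~300,693 rows used for training
+
+"""
+The next cell encodes exactly these choices as `split_fraction`, `past`,
+`future` and `step`.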
+""" + +split_fraction = 0.715 +train_split = int(split_fraction * int(df.shape[0])) +step = 6 + +past = 720 +future = 72 +learning_rate = 0.001 +batch_size = 256 +epochs = 10 + + +def normalize(data, train_split): + data_mean = data[:train_split].mean(axis=0) + data_std = data[:train_split].std(axis=0) + return (data - data_mean) / data_std + + +""" +We can see from the correlation heatmap, few parameters like Relative Humidity and +Specific Humidity are redundant. Hence we will be using select features, not all. +""" + +print( + "The selected parameters are:", + ", ".join([titles[i] for i in [0, 1, 5, 7, 8, 10, 11]]), +) +selected_features = [feature_keys[i] for i in [0, 1, 5, 7, 8, 10, 11]] +features = df[selected_features] +features.index = df[date_time_key] +features.head() + +features = normalize(features.values, train_split) +features = pd.DataFrame(features) +features.head() + +train_data = features.loc[0 : train_split - 1] +val_data = features.loc[train_split:] + +""" +# Training dataset + +The training dataset labels starts from the 792nd observation (720 + 72). +""" + +start = past + future +end = start + train_split + +x_train = train_data[[i for i in range(7)]].values +y_train = features.iloc[start:end][[1]] + +sequence_length = int(past / step) + +""" +The `timeseries_dataset_from_array` function takes in a sequence of data-points gathered at +equal intervals, along with time series parameters such as length of the +sequences/windows, spacing between two sequence/windows, etc., to produce batches of +sub-timeseries inputs and targets sampled from the main timeseries. +""" + +dataset_train = keras.preprocessing.timeseries_dataset_from_array( + x_train, + y_train, + sequence_length=sequence_length, + sampling_rate=step, + batch_size=batch_size, +) + +""" +## Validation dataset + +The validation dataset must not contain the last 792 rows as we won't have label data for +those records, hence 792 must be subtracted from the end of the data. + +The validation label dataset must start from 792 after train_split, hence we must add +past + future (792) to label_start. +""" + +x_end = len(val_data) - past - future + +label_start = train_split + past + future + +x_val = val_data.iloc[:x_end][[i for i in range(7)]].values +y_val = features.iloc[label_start:][[1]] + +dataset_val = keras.preprocessing.timeseries_dataset_from_array( + x_val, + y_val, + sequence_length=sequence_length, + sampling_rate=step, + batch_size=batch_size, +) + + +for batch in dataset_train.take(1): + inputs, targets = batch + +print("Input shape:", inputs.numpy().shape) +print("Target shape:", targets.numpy().shape) + +""" +## Training +""" + +inputs = keras.layers.Input(shape=(inputs.shape[1], inputs.shape[2])) +lstm_out = keras.layers.LSTM(32)(inputs) +outputs = keras.layers.Dense(1)(lstm_out) + +model = keras.Model(inputs=inputs, outputs=outputs) +model.compile(optimizer=keras.optimizers.Adam(learning_rate=learning_rate), loss="mse") +model.summary() + +""" +We'll use the `ModelCheckpoint` callback to regularly save checkpoints, and +the `EarlyStopping` callback to interrupt training when the validation loss +is not longer improving. 
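+
+As an aside, a minimal alternative sketch (not used below): `EarlyStopping` can
+also roll back to the best weights by itself via `restore_best_weights=True`,
+which avoids managing a separate checkpoint file. Here we keep the explicit
+`ModelCheckpoint` so that the best weights are also persisted to disk.
+"""
+
+# Alternative sketch only -- the training cell below uses the two callbacks
+# described above instead.
+es_with_restore = keras.callbacks.EarlyStopping(
+    monitor="val_loss", patience=5, restore_best_weights=True
+)
+
+"""
+With either approach, training stops once the validation loss plateaus.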
+""" + +path_checkpoint = "model_checkpoint.weights.h5" +es_callback = keras.callbacks.EarlyStopping(monitor="val_loss", min_delta=0, patience=5) + +modelckpt_callback = keras.callbacks.ModelCheckpoint( + monitor="val_loss", + filepath=path_checkpoint, + verbose=1, + save_weights_only=True, + save_best_only=True, +) + +history = model.fit( + dataset_train, + epochs=epochs, + validation_data=dataset_val, + callbacks=[es_callback, modelckpt_callback], +) + +""" +We can visualize the loss with the function below. After one point, the loss stops +decreasing. +""" + + +def visualize_loss(history, title): + loss = history.history["loss"] + val_loss = history.history["val_loss"] + epochs = range(len(loss)) + plt.figure() + plt.plot(epochs, loss, "b", label="Training loss") + plt.plot(epochs, val_loss, "r", label="Validation loss") + plt.title(title) + plt.xlabel("Epochs") + plt.ylabel("Loss") + plt.legend() + plt.show() + + +visualize_loss(history, "Training and Validation Loss") + +""" +## Prediction + +The trained model above is now able to make predictions for 5 sets of values from +validation set. +""" + + +def show_plot(plot_data, delta, title): + labels = ["History", "True Future", "Model Prediction"] + marker = [".-", "rx", "go"] + time_steps = list(range(-(plot_data[0].shape[0]), 0)) + if delta: + future = delta + else: + future = 0 + + plt.title(title) + for i, val in enumerate(plot_data): + if i: + plt.plot(future, plot_data[i], marker[i], markersize=10, label=labels[i]) + else: + plt.plot(time_steps, plot_data[i].flatten(), marker[i], label=labels[i]) + plt.legend() + plt.xlim([time_steps[0], (future + 5) * 2]) + plt.xlabel("Time-Step") + plt.show() + return + + +for x, y in dataset_val.take(5): + show_plot( + [x[0][:, 1].numpy(), y[0].numpy(), model.predict(x)[0]], + 12, + "Single Step Prediction", + ) diff --git a/knowledge_base/vision/3D_image_classification.py b/knowledge_base/vision/3D_image_classification.py new file mode 100644 index 0000000000000000000000000000000000000000..9449a8efdb30f7c801743b8d0268fcdce9a0cea0 --- /dev/null +++ b/knowledge_base/vision/3D_image_classification.py @@ -0,0 +1,439 @@ +""" +Title: 3D image classification from CT scans +Author: [Hasib Zunair](https://twitter.com/hasibzunair) +Date created: 2020/09/23 +Last modified: 2024/01/11 +Description: Train a 3D convolutional neural network to predict presence of pneumonia. +Accelerator: GPU +""" + +""" +## Introduction + +This example will show the steps needed to build a 3D convolutional neural network (CNN) +to predict the presence of viral pneumonia in computer tomography (CT) scans. 2D CNNs are +commonly used to process RGB images (3 channels). A 3D CNN is simply the 3D +equivalent: it takes as input a 3D volume or a sequence of 2D frames (e.g. slices in a CT scan), +3D CNNs are a powerful model for learning representations for volumetric data. 
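+
+To make the difference in input rank concrete, here is a small, self-contained
+sketch (independent of the rest of this example; the layer arguments are
+arbitrary):
+"""
+
+# Illustration only: Conv2D consumes (height, width, channels) images, while
+# Conv3D consumes (height, width, depth, channels) volumes.
+import keras
+from keras import layers
+
+image = keras.Input((128, 128, 3))       # an RGB image
+volume = keras.Input((128, 128, 64, 1))  # a single-channel CT volume
+print(layers.Conv2D(8, 3)(image).shape)   # (None, 126, 126, 8)
+print(layers.Conv3D(8, 3)(volume).shape)  # (None, 126, 126, 62, 8)
+
+"""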
+ +## References + +- [A survey on Deep Learning Advances on Different 3D DataRepresentations](https://arxiv.org/abs/1808.01462) +- [VoxNet: A 3D Convolutional Neural Network for Real-Time Object Recognition](https://www.ri.cmu.edu/pub_files/2015/9/voxnet_maturana_scherer_iros15.pdf) +- [FusionNet: 3D Object Classification Using MultipleData Representations](https://arxiv.org/abs/1607.05695) +- [Uniformizing Techniques to Process CT scans with 3D CNNs for Tuberculosis Prediction](https://arxiv.org/abs/2007.13224) +""" +""" +## Setup +""" + +import os +import zipfile +import numpy as np +import tensorflow as tf # for data preprocessing + +import keras +from keras import layers + +""" +## Downloading the MosMedData: Chest CT Scans with COVID-19 Related Findings + +In this example, we use a subset of the +[MosMedData: Chest CT Scans with COVID-19 Related Findings](https://www.medrxiv.org/content/10.1101/2020.05.20.20100362v1). +This dataset consists of lung CT scans with COVID-19 related findings, as well as without such findings. + +We will be using the associated radiological findings of the CT scans as labels to build +a classifier to predict presence of viral pneumonia. +Hence, the task is a binary classification problem. +""" + +# Download url of normal CT scans. +url = "https://github.com/hasibzunair/3D-image-classification-tutorial/releases/download/v0.2/CT-0.zip" +filename = os.path.join(os.getcwd(), "CT-0.zip") +keras.utils.get_file(filename, url) + +# Download url of abnormal CT scans. +url = "https://github.com/hasibzunair/3D-image-classification-tutorial/releases/download/v0.2/CT-23.zip" +filename = os.path.join(os.getcwd(), "CT-23.zip") +keras.utils.get_file(filename, url) + +# Make a directory to store the data. +os.makedirs("MosMedData") + +# Unzip data in the newly created directory. +with zipfile.ZipFile("CT-0.zip", "r") as z_fp: + z_fp.extractall("./MosMedData/") + +with zipfile.ZipFile("CT-23.zip", "r") as z_fp: + z_fp.extractall("./MosMedData/") + +""" +## Loading data and preprocessing + +The files are provided in Nifti format with the extension .nii. To read the +scans, we use the `nibabel` package. +You can install the package via `pip install nibabel`. CT scans store raw voxel +intensity in Hounsfield units (HU). They range from -1024 to above 2000 in this dataset. +Above 400 are bones with different radiointensity, so this is used as a higher bound. A threshold +between -1000 and 400 is commonly used to normalize CT scans. + +To process the data, we do the following: + +* We first rotate the volumes by 90 degrees, so the orientation is fixed +* We scale the HU values to be between 0 and 1. +* We resize width, height and depth. + +Here we define several helper functions to process the data. These functions +will be used when building training and validation datasets. 
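+
+As a purely illustrative worked example of the clipping and rescaling described
+above, a voxel at 0 HU maps to roughly 0.71:
+"""
+
+# Illustrative worked example of the HU normalization: clip to [-1000, 400],
+# then rescale to [0, 1].
+hu_value = 0.0
+clipped = max(-1000.0, min(400.0, hu_value))
+print((clipped - (-1000.0)) / (400.0 - (-1000.0)))  # ~0.714
+
+"""
+The helper functions below implement this clipping and rescaling, together with
+the rotation and resizing steps.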
+""" + + +import nibabel as nib + +from scipy import ndimage + + +def read_nifti_file(filepath): + """Read and load volume""" + # Read file + scan = nib.load(filepath) + # Get raw data + scan = scan.get_fdata() + return scan + + +def normalize(volume): + """Normalize the volume""" + min = -1000 + max = 400 + volume[volume < min] = min + volume[volume > max] = max + volume = (volume - min) / (max - min) + volume = volume.astype("float32") + return volume + + +def resize_volume(img): + """Resize across z-axis""" + # Set the desired depth + desired_depth = 64 + desired_width = 128 + desired_height = 128 + # Get current depth + current_depth = img.shape[-1] + current_width = img.shape[0] + current_height = img.shape[1] + # Compute depth factor + depth = current_depth / desired_depth + width = current_width / desired_width + height = current_height / desired_height + depth_factor = 1 / depth + width_factor = 1 / width + height_factor = 1 / height + # Rotate + img = ndimage.rotate(img, 90, reshape=False) + # Resize across z-axis + img = ndimage.zoom(img, (width_factor, height_factor, depth_factor), order=1) + return img + + +def process_scan(path): + """Read and resize volume""" + # Read scan + volume = read_nifti_file(path) + # Normalize + volume = normalize(volume) + # Resize width, height and depth + volume = resize_volume(volume) + return volume + + +""" +Let's read the paths of the CT scans from the class directories. +""" + +# Folder "CT-0" consist of CT scans having normal lung tissue, +# no CT-signs of viral pneumonia. +normal_scan_paths = [ + os.path.join(os.getcwd(), "MosMedData/CT-0", x) + for x in os.listdir("MosMedData/CT-0") +] +# Folder "CT-23" consist of CT scans having several ground-glass opacifications, +# involvement of lung parenchyma. +abnormal_scan_paths = [ + os.path.join(os.getcwd(), "MosMedData/CT-23", x) + for x in os.listdir("MosMedData/CT-23") +] + +print("CT scans with normal lung tissue: " + str(len(normal_scan_paths))) +print("CT scans with abnormal lung tissue: " + str(len(abnormal_scan_paths))) + + +""" +## Build train and validation datasets +Read the scans from the class directories and assign labels. Downsample the scans to have +shape of 128x128x64. Rescale the raw HU values to the range 0 to 1. +Lastly, split the dataset into train and validation subsets. +""" + +# Read and process the scans. +# Each scan is resized across height, width, and depth and rescaled. +abnormal_scans = np.array([process_scan(path) for path in abnormal_scan_paths]) +normal_scans = np.array([process_scan(path) for path in normal_scan_paths]) + +# For the CT scans having presence of viral pneumonia +# assign 1, for the normal ones assign 0. +abnormal_labels = np.array([1 for _ in range(len(abnormal_scans))]) +normal_labels = np.array([0 for _ in range(len(normal_scans))]) + +# Split data in the ratio 70-30 for training and validation. +x_train = np.concatenate((abnormal_scans[:70], normal_scans[:70]), axis=0) +y_train = np.concatenate((abnormal_labels[:70], normal_labels[:70]), axis=0) +x_val = np.concatenate((abnormal_scans[70:], normal_scans[70:]), axis=0) +y_val = np.concatenate((abnormal_labels[70:], normal_labels[70:]), axis=0) +print( + "Number of samples in train and validation are %d and %d." + % (x_train.shape[0], x_val.shape[0]) +) + +""" +## Data augmentation + +The CT scans also augmented by rotating at random angles during training. 
Since +the data is stored in rank-3 tensors of shape `(samples, height, width, depth)`, +we add a dimension of size 1 at axis 4 to be able to perform 3D convolutions on +the data. The new shape is thus `(samples, height, width, depth, 1)`. There are +different kinds of preprocessing and augmentation techniques out there, +this example shows a few simple ones to get started. +""" + +import random + +from scipy import ndimage + + +def rotate(volume): + """Rotate the volume by a few degrees""" + + def scipy_rotate(volume): + # define some rotation angles + angles = [-20, -10, -5, 5, 10, 20] + # pick angles at random + angle = random.choice(angles) + # rotate volume + volume = ndimage.rotate(volume, angle, reshape=False) + volume[volume < 0] = 0 + volume[volume > 1] = 1 + return volume + + augmented_volume = tf.numpy_function(scipy_rotate, [volume], tf.float32) + return augmented_volume + + +def train_preprocessing(volume, label): + """Process training data by rotating and adding a channel.""" + # Rotate volume + volume = rotate(volume) + volume = tf.expand_dims(volume, axis=3) + return volume, label + + +def validation_preprocessing(volume, label): + """Process validation data by only adding a channel.""" + volume = tf.expand_dims(volume, axis=3) + return volume, label + + +""" +While defining the train and validation data loader, the training data is passed through +and augmentation function which randomly rotates volume at different angles. Note that both +training and validation data are already rescaled to have values between 0 and 1. +""" + +# Define data loaders. +train_loader = tf.data.Dataset.from_tensor_slices((x_train, y_train)) +validation_loader = tf.data.Dataset.from_tensor_slices((x_val, y_val)) + +batch_size = 2 +# Augment the on the fly during training. +train_dataset = ( + train_loader.shuffle(len(x_train)) + .map(train_preprocessing) + .batch(batch_size) + .prefetch(2) +) +# Only rescale. +validation_dataset = ( + validation_loader.shuffle(len(x_val)) + .map(validation_preprocessing) + .batch(batch_size) + .prefetch(2) +) + +""" +Visualize an augmented CT scan. +""" + +import matplotlib.pyplot as plt + +data = train_dataset.take(1) +images, labels = list(data)[0] +images = images.numpy() +image = images[0] +print("Dimension of the CT scan is:", image.shape) +plt.imshow(np.squeeze(image[:, :, 30]), cmap="gray") + + +""" +Since a CT scan has many slices, let's visualize a montage of the slices. +""" + + +def plot_slices(num_rows, num_columns, width, height, data): + """Plot a montage of 20 CT slices""" + data = np.rot90(np.array(data)) + data = np.transpose(data) + data = np.reshape(data, (num_rows, num_columns, width, height)) + rows_data, columns_data = data.shape[0], data.shape[1] + heights = [slc[0].shape[0] for slc in data] + widths = [slc.shape[1] for slc in data[0]] + fig_width = 12.0 + fig_height = fig_width * sum(heights) / sum(widths) + f, axarr = plt.subplots( + rows_data, + columns_data, + figsize=(fig_width, fig_height), + gridspec_kw={"height_ratios": heights}, + ) + for i in range(rows_data): + for j in range(columns_data): + axarr[i, j].imshow(data[i][j], cmap="gray") + axarr[i, j].axis("off") + plt.subplots_adjust(wspace=0, hspace=0, left=0, right=1, bottom=0, top=1) + plt.show() + + +# Visualize montage of slices. +# 4 rows and 10 columns for 100 slices of the CT scan. +plot_slices(4, 10, 128, 128, image[:, :, :40]) + +""" +## Define a 3D convolutional neural network + +To make the model easier to understand, we structure it into blocks. 
+The architecture of the 3D CNN used in this example +is based on [this paper](https://arxiv.org/abs/2007.13224). +""" + + +def get_model(width=128, height=128, depth=64): + """Build a 3D convolutional neural network model.""" + + inputs = keras.Input((width, height, depth, 1)) + + x = layers.Conv3D(filters=64, kernel_size=3, activation="relu")(inputs) + x = layers.MaxPool3D(pool_size=2)(x) + x = layers.BatchNormalization()(x) + + x = layers.Conv3D(filters=64, kernel_size=3, activation="relu")(x) + x = layers.MaxPool3D(pool_size=2)(x) + x = layers.BatchNormalization()(x) + + x = layers.Conv3D(filters=128, kernel_size=3, activation="relu")(x) + x = layers.MaxPool3D(pool_size=2)(x) + x = layers.BatchNormalization()(x) + + x = layers.Conv3D(filters=256, kernel_size=3, activation="relu")(x) + x = layers.MaxPool3D(pool_size=2)(x) + x = layers.BatchNormalization()(x) + + x = layers.GlobalAveragePooling3D()(x) + x = layers.Dense(units=512, activation="relu")(x) + x = layers.Dropout(0.3)(x) + + outputs = layers.Dense(units=1, activation="sigmoid")(x) + + # Define the model. + model = keras.Model(inputs, outputs, name="3dcnn") + return model + + +# Build model. +model = get_model(width=128, height=128, depth=64) +model.summary() + +""" +## Train model +""" + +# Compile model. +initial_learning_rate = 0.0001 +lr_schedule = keras.optimizers.schedules.ExponentialDecay( + initial_learning_rate, decay_steps=100000, decay_rate=0.96, staircase=True +) +model.compile( + loss="binary_crossentropy", + optimizer=keras.optimizers.Adam(learning_rate=lr_schedule), + metrics=["acc"], + run_eagerly=True, +) + +# Define callbacks. +checkpoint_cb = keras.callbacks.ModelCheckpoint( + "3d_image_classification.keras", save_best_only=True +) +early_stopping_cb = keras.callbacks.EarlyStopping(monitor="val_acc", patience=15) + +# Train the model, doing validation at the end of each epoch +epochs = 100 +model.fit( + train_dataset, + validation_data=validation_dataset, + epochs=epochs, + shuffle=True, + verbose=2, + callbacks=[checkpoint_cb, early_stopping_cb], +) + +""" +It is important to note that the number of samples is very small (only 200) and we don't +specify a random seed. As such, you can expect significant variance in the results. The full dataset +which consists of over 1000 CT scans can be found [here](https://www.medrxiv.org/content/10.1101/2020.05.20.20100362v1). Using the full +dataset, an accuracy of 83% was achieved. A variability of 6-7% in the classification +performance is observed in both cases. +""" + +""" +## Visualizing model performance + +Here the model accuracy and loss for the training and the validation sets are plotted. +Since the validation set is class-balanced, accuracy provides an unbiased representation +of the model's performance. +""" + +fig, ax = plt.subplots(1, 2, figsize=(20, 3)) +ax = ax.ravel() + +for i, metric in enumerate(["acc", "loss"]): + ax[i].plot(model.history.history[metric]) + ax[i].plot(model.history.history["val_" + metric]) + ax[i].set_title("Model {}".format(metric)) + ax[i].set_xlabel("epochs") + ax[i].set_ylabel(metric) + ax[i].legend(["train", "val"]) + +""" +## Make predictions on a single CT scan +""" + +# Load best weights. 
+model.load_weights("3d_image_classification.keras") +prediction = model.predict(np.expand_dims(x_val[0], axis=0))[0] +scores = [1 - prediction[0], prediction[0]] + +class_names = ["normal", "abnormal"] +for score, name in zip(scores, class_names): + print( + "This model is %.2f percent confident that CT scan is %s" + % ((100 * score), name) + ) diff --git a/knowledge_base/vision/adamatch.py b/knowledge_base/vision/adamatch.py new file mode 100644 index 0000000000000000000000000000000000000000..afd2365499594753432bcb71c7aca5d04ee1d422 --- /dev/null +++ b/knowledge_base/vision/adamatch.py @@ -0,0 +1,603 @@ +""" +Title: Semi-supervision and domain adaptation with AdaMatch +Author: [Sayak Paul](https://twitter.com/RisingSayak) +Date created: 2021/06/19 +Last modified: 2021/06/19 +Description: Unifying semi-supervised learning and unsupervised domain adaptation with AdaMatch. +Accelerator: GPU +""" + +""" +## Introduction + +In this example, we will implement the AdaMatch algorithm, proposed in +[AdaMatch: A Unified Approach to Semi-Supervised Learning and Domain Adaptation](https://arxiv.org/abs/2106.04732) +by Berthelot et al. It sets a new state-of-the-art in unsupervised domain adaptation (as of +June 2021). AdaMatch is particularly interesting because it +unifies semi-supervised learning (SSL) and unsupervised domain adaptation +(UDA) under one framework. It thereby provides a way to perform semi-supervised domain +adaptation (SSDA). + +This example requires TensorFlow 2.5 or higher, as well as TensorFlow Models, which can +be installed using the following command: +""" + +"""shell +pip install -q tf-models-official==2.9.2 +""" + +""" +Before we proceed, let's review a few preliminary concepts underlying this example. +""" + +""" +## Preliminaries + +In **semi-supervised learning (SSL)**, we use a small amount of labeled data to +train models on a bigger unlabeled dataset. Popular semi-supervised learning methods +for computer vision include [FixMatch](https://arxiv.org/abs/2001.07685), +[MixMatch](https://arxiv.org/abs/1905.02249), +[Noisy Student Training](https://arxiv.org/abs/1911.04252), etc. You can refer to +[this example](https://keras.io/examples/vision/consistency_training/) to get an idea +of what a standard SSL workflow looks like. + +In **unsupervised domain adaptation**, we have access to a source labeled dataset and +a target *unlabeled* dataset. Then the task is to learn a model that can generalize well +to the target dataset. The source and the target datasets vary in terms of distribution. +The following figure provides an illustration of this idea. In the present example, we use the +[MNIST dataset](http://yann.lecun.com/exdb/mnist/) as the source dataset, while the target dataset is +[SVHN](http://ufldl.stanford.edu/housenumbers/), which consists of images of house +numbers. Both datasets have various varying factors in terms of texture, viewpoint, +appearance, etc.: their domains, or distributions, are different from one +another. + +![](https://i.imgur.com/dJFSJuT.png) + +Popular domain adaptation algorithms in deep learning include +[Deep CORAL](https://arxiv.org/abs/1612.01939), +[Moment Matching](https://arxiv.org/abs/1812.01754), etc. 
+""" + +""" +## Setup +""" + +import tensorflow as tf + +tf.random.set_seed(42) + +import numpy as np + +from tensorflow import keras +from tensorflow.keras import layers +from tensorflow.keras import regularizers +from keras_cv.layers import RandAugment + +import tensorflow_datasets as tfds + +tfds.disable_progress_bar() + +""" +## Prepare the data +""" + +# MNIST +( + (mnist_x_train, mnist_y_train), + (mnist_x_test, mnist_y_test), +) = keras.datasets.mnist.load_data() + +# Add a channel dimension +mnist_x_train = tf.expand_dims(mnist_x_train, -1) +mnist_x_test = tf.expand_dims(mnist_x_test, -1) + +# Convert the labels to one-hot encoded vectors +mnist_y_train = tf.one_hot(mnist_y_train, 10).numpy() + +# SVHN +svhn_train, svhn_test = tfds.load( + "svhn_cropped", split=["train", "test"], as_supervised=True +) + +""" +## Define constants and hyperparameters +""" + +RESIZE_TO = 32 + +SOURCE_BATCH_SIZE = 64 +TARGET_BATCH_SIZE = 3 * SOURCE_BATCH_SIZE # Reference: Section 3.2 +EPOCHS = 10 +STEPS_PER_EPOCH = len(mnist_x_train) // SOURCE_BATCH_SIZE +TOTAL_STEPS = EPOCHS * STEPS_PER_EPOCH + +AUTO = tf.data.AUTOTUNE +LEARNING_RATE = 0.03 + +WEIGHT_DECAY = 0.0005 +INIT = "he_normal" +DEPTH = 28 +WIDTH_MULT = 2 + +""" +## Data augmentation utilities + +A standard element of SSL algorithms is to feed weakly and strongly augmented versions of +the same images to the learning model to make its predictions consistent. For strong +augmentation, [RandAugment](https://arxiv.org/abs/1909.13719) is a standard choice. For +weak augmentation, we will use horizontal flipping and random cropping. +""" + +# Initialize `RandAugment` object with 2 layers of +# augmentation transforms and strength of 5. +augmenter = RandAugment(value_range=(0, 255), augmentations_per_image=2, magnitude=0.5) + + +def weak_augment(image, source=True): + if image.dtype != tf.float32: + image = tf.cast(image, tf.float32) + + # MNIST images are grayscale, this is why we first convert them to + # RGB images. + if source: + image = tf.image.resize_with_pad(image, RESIZE_TO, RESIZE_TO) + image = tf.tile(image, [1, 1, 3]) + image = tf.image.random_flip_left_right(image) + image = tf.image.random_crop(image, (RESIZE_TO, RESIZE_TO, 3)) + return image + + +def strong_augment(image, source=True): + if image.dtype != tf.float32: + image = tf.cast(image, tf.float32) + + if source: + image = tf.image.resize_with_pad(image, RESIZE_TO, RESIZE_TO) + image = tf.tile(image, [1, 1, 3]) + image = augmenter(image) + return image + + +""" +## Data loading utilities +""" + + +def create_individual_ds(ds, aug_func, source=True): + if source: + batch_size = SOURCE_BATCH_SIZE + else: + # During training 3x more target unlabeled samples are shown + # to the model in AdaMatch (Section 3.2 of the paper). + batch_size = TARGET_BATCH_SIZE + ds = ds.shuffle(batch_size * 10, seed=42) + + if source: + ds = ds.map(lambda x, y: (aug_func(x), y), num_parallel_calls=AUTO) + else: + ds = ds.map(lambda x, y: (aug_func(x, False), y), num_parallel_calls=AUTO) + + ds = ds.batch(batch_size).prefetch(AUTO) + return ds + + +""" +`_w` and `_s` suffixes denote weak and strong respectively. 
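+
+As a small, optional check, we can run the weak augmentation on a single MNIST
+digit and confirm that it comes out as a 32x32 RGB image (the MNIST tensors
+were given a channel dimension earlier):
+"""
+
+# Illustrative only: one MNIST digit of shape (28, 28, 1) is padded to 32x32
+# and tiled to 3 channels by `weak_augment`.
+sample_digit = mnist_x_train[0]
+print(weak_augment(sample_digit).shape)  # (32, 32, 3)
+
+"""
+The strong version applies `RandAugment` on top of the same resize-and-tile
+step. The datasets created below feed both versions of every image to the model.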
+""" + +source_ds = tf.data.Dataset.from_tensor_slices((mnist_x_train, mnist_y_train)) +source_ds_w = create_individual_ds(source_ds, weak_augment) +source_ds_s = create_individual_ds(source_ds, strong_augment) +final_source_ds = tf.data.Dataset.zip((source_ds_w, source_ds_s)) + +target_ds_w = create_individual_ds(svhn_train, weak_augment, source=False) +target_ds_s = create_individual_ds(svhn_train, strong_augment, source=False) +final_target_ds = tf.data.Dataset.zip((target_ds_w, target_ds_s)) + +""" +Here's what a single image batch looks like: + +![](https://i.imgur.com/aver8cG.png) +""" + +""" +## Loss computation utilities +""" + + +def compute_loss_source(source_labels, logits_source_w, logits_source_s): + loss_func = keras.losses.CategoricalCrossentropy(from_logits=True) + # First compute the losses between original source labels and + # predictions made on the weakly and strongly augmented versions + # of the same images. + w_loss = loss_func(source_labels, logits_source_w) + s_loss = loss_func(source_labels, logits_source_s) + return w_loss + s_loss + + +def compute_loss_target(target_pseudo_labels_w, logits_target_s, mask): + loss_func = keras.losses.CategoricalCrossentropy(from_logits=True, reduction="none") + target_pseudo_labels_w = tf.stop_gradient(target_pseudo_labels_w) + # For calculating loss for the target samples, we treat the pseudo labels + # as the ground-truth. These are not considered during backpropagation + # which is a standard SSL practice. + target_loss = loss_func(target_pseudo_labels_w, logits_target_s) + + # More on `mask` later. + mask = tf.cast(mask, target_loss.dtype) + target_loss *= mask + return tf.reduce_mean(target_loss, 0) + + +""" +## Subclassed model for AdaMatch training + +The figure below presents the overall workflow of AdaMatch (taken from the +[original paper](https://arxiv.org/abs/2106.04732)): + +![](https://i.imgur.com/1QsEm2M.png) + +Here's a brief step-by-step breakdown of the workflow: + +1. We first retrieve the weakly and strongly augmented pairs of images from the source and +target datasets. +2. We prepare two concatenated copies: + i. One where both pairs are concatenated. + ii. One where only the source data image pair is concatenated. +3. We run two forward passes through the model: + i. The first forward pass uses the concatenated copy obtained from **2.i**. In +this forward pass, the [Batch Normalization](https://arxiv.org/abs/1502.03167) statistics +are updated. + ii. In the second forward pass, we only use the concatenated copy obtained from **2.ii**. + Batch Normalization layers are run in inference mode. +4. The respective logits are computed for both the forward passes. +5. The logits go through a series of transformations, introduced in the paper (which +we will discuss shortly). +6. We compute the loss and update the gradients of the underlying model. +""" + + +class AdaMatch(keras.Model): + def __init__(self, model, total_steps, tau=0.9): + super().__init__() + self.model = model + self.tau = tau # Denotes the confidence threshold + self.loss_tracker = tf.keras.metrics.Mean(name="loss") + self.total_steps = total_steps + self.current_step = tf.Variable(0, dtype="int64") + + @property + def metrics(self): + return [self.loss_tracker] + + # This is a warmup schedule to update the weight of the + # loss contributed by the target unlabeled samples. More + # on this in the text. 
+ def compute_mu(self): + pi = tf.constant(np.pi, dtype="float32") + step = tf.cast(self.current_step, dtype="float32") + return 0.5 - tf.cos(tf.math.minimum(pi, (2 * pi * step) / self.total_steps)) / 2 + + def train_step(self, data): + ## Unpack and organize the data ## + source_ds, target_ds = data + (source_w, source_labels), (source_s, _) = source_ds + ( + (target_w, _), + (target_s, _), + ) = target_ds # Notice that we are NOT using any labels here. + + combined_images = tf.concat([source_w, source_s, target_w, target_s], 0) + combined_source = tf.concat([source_w, source_s], 0) + + total_source = tf.shape(combined_source)[0] + total_target = tf.shape(tf.concat([target_w, target_s], 0))[0] + + with tf.GradientTape() as tape: + ## Forward passes ## + combined_logits = self.model(combined_images, training=True) + z_d_prime_source = self.model( + combined_source, training=False + ) # No BatchNorm update. + z_prime_source = combined_logits[:total_source] + + ## 1. Random logit interpolation for the source images ## + lambd = tf.random.uniform((total_source, 10), 0, 1) + final_source_logits = (lambd * z_prime_source) + ( + (1 - lambd) * z_d_prime_source + ) + + ## 2. Distribution alignment (only consider weakly augmented images) ## + # Compute softmax for logits of the WEAKLY augmented SOURCE images. + y_hat_source_w = tf.nn.softmax(final_source_logits[: tf.shape(source_w)[0]]) + + # Extract logits for the WEAKLY augmented TARGET images and compute softmax. + logits_target = combined_logits[total_source:] + logits_target_w = logits_target[: tf.shape(target_w)[0]] + y_hat_target_w = tf.nn.softmax(logits_target_w) + + # Align the target label distribution to that of the source. + expectation_ratio = tf.reduce_mean(y_hat_source_w) / tf.reduce_mean( + y_hat_target_w + ) + y_tilde_target_w = tf.math.l2_normalize( + y_hat_target_w * expectation_ratio, 1 + ) + + ## 3. Relative confidence thresholding ## + row_wise_max = tf.reduce_max(y_hat_source_w, axis=-1) + final_sum = tf.reduce_mean(row_wise_max, 0) + c_tau = self.tau * final_sum + mask = tf.reduce_max(y_tilde_target_w, axis=-1) >= c_tau + + ## Compute losses (pay attention to the indexing) ## + source_loss = compute_loss_source( + source_labels, + final_source_logits[: tf.shape(source_w)[0]], + final_source_logits[tf.shape(source_w)[0] :], + ) + target_loss = compute_loss_target( + y_tilde_target_w, logits_target[tf.shape(target_w)[0] :], mask + ) + + t = self.compute_mu() # Compute weight for the target loss + total_loss = source_loss + (t * target_loss) + self.current_step.assign_add( + 1 + ) # Update current training step for the scheduler + + gradients = tape.gradient(total_loss, self.model.trainable_variables) + self.optimizer.apply_gradients(zip(gradients, self.model.trainable_variables)) + + self.loss_tracker.update_state(total_loss) + return {"loss": self.loss_tracker.result()} + + +""" +The authors introduce three improvements in the paper: + +* In AdaMatch, we perform two forward passes, and only one of them is respsonsible for +updating the Batch Normalization statistics. This is done to account for distribution +shifts in the target dataset. In the other forward pass, we only use the source sample, +and the Batch Normalization layers are run in inference mode. Logits for the source +samples (weakly and strongly augmented versions) from these two passes are slightly +different from one another because of how Batch Normalization layers are run. 
Final +logits for the source samples are computed by linearly interpolating between these two +different pairs of logits. This induces a form of consistency regularization. This step +is referred to as **random logit interpolation**. +* **Distribution alignment** is used to align the source and target label distributions. +This further helps the underlying model learn *domain-invariant representations*. In case +of unsupervised domain adaptation, we don't have access to any labels of the target +dataset. This is why pseudo labels are generated from the underlying model. +* The underlying model generates pseudo-labels for the target samples. It's likely that +the model would make faulty predictions. Those can propagate back as we make progress in +the training, and hurt the overall performance. To compensate for that, we filter the +high-confidence predictions based on a threshold (hence the use of `mask` inside +`compute_loss_target()`). In AdaMatch, this threshold is relatively adjusted which is why +it is called **relative confidence thresholding**. + +For more details on these methods and to know how each of them contribute please refer to +[the paper](https://arxiv.org/abs/2106.04732). + +**About `compute_mu()`**: + +Rather than using a fixed scalar quantity, a varying scalar is used in AdaMatch. It +denotes the weight of the loss contibuted by the target samples. Visually, the weight +scheduler look like so: + +![](https://i.imgur.com/dG7i9uH.png) + +This scheduler increases the weight of the target domain loss from 0 to 1 for the first +half of the training. Then it keeps that weight at 1 for the second half of the training. +""" + +""" +## Instantiate a Wide-ResNet-28-2 + +The authors use a [WideResNet-28-2](https://arxiv.org/abs/1605.07146) for the dataset +pairs we are using in this example. Most of the following code has been referred from +[this script](https://github.com/asmith26/wide_resnets_keras/blob/master/main.py). Note +that the following model has a scaling layer inside it that scales the pixel values to +[0, 1]. +""" + + +def wide_basic(x, n_input_plane, n_output_plane, stride): + conv_params = [[3, 3, stride, "same"], [3, 3, (1, 1), "same"]] + + n_bottleneck_plane = n_output_plane + + # Residual block + for i, v in enumerate(conv_params): + if i == 0: + if n_input_plane != n_output_plane: + x = layers.BatchNormalization()(x) + x = layers.Activation("relu")(x) + convs = x + else: + convs = layers.BatchNormalization()(x) + convs = layers.Activation("relu")(convs) + convs = layers.Conv2D( + n_bottleneck_plane, + (v[0], v[1]), + strides=v[2], + padding=v[3], + kernel_initializer=INIT, + kernel_regularizer=regularizers.l2(WEIGHT_DECAY), + use_bias=False, + )(convs) + else: + convs = layers.BatchNormalization()(convs) + convs = layers.Activation("relu")(convs) + convs = layers.Conv2D( + n_bottleneck_plane, + (v[0], v[1]), + strides=v[2], + padding=v[3], + kernel_initializer=INIT, + kernel_regularizer=regularizers.l2(WEIGHT_DECAY), + use_bias=False, + )(convs) + + # Shortcut connection: identity function or 1x1 + # convolutional + # (depends on difference between input & output shape - this + # corresponds to whether we are using the first block in + # each + # group; see `block_series()`). 
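+    # In short: if the channel count changes, a strided 1x1 convolution projects
+    # the block input so its shape matches `convs`; otherwise the input is used
+    # as-is.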
+ if n_input_plane != n_output_plane: + shortcut = layers.Conv2D( + n_output_plane, + (1, 1), + strides=stride, + padding="same", + kernel_initializer=INIT, + kernel_regularizer=regularizers.l2(WEIGHT_DECAY), + use_bias=False, + )(x) + else: + shortcut = x + + return layers.Add()([convs, shortcut]) + + +# Stacking residual units on the same stage +def block_series(x, n_input_plane, n_output_plane, count, stride): + x = wide_basic(x, n_input_plane, n_output_plane, stride) + for i in range(2, int(count + 1)): + x = wide_basic(x, n_output_plane, n_output_plane, stride=1) + return x + + +def get_network(image_size=32, num_classes=10): + n = (DEPTH - 4) / 6 + n_stages = [16, 16 * WIDTH_MULT, 32 * WIDTH_MULT, 64 * WIDTH_MULT] + + inputs = keras.Input(shape=(image_size, image_size, 3)) + x = layers.Rescaling(scale=1.0 / 255)(inputs) + + conv1 = layers.Conv2D( + n_stages[0], + (3, 3), + strides=1, + padding="same", + kernel_initializer=INIT, + kernel_regularizer=regularizers.l2(WEIGHT_DECAY), + use_bias=False, + )(x) + + ## Add wide residual blocks ## + + conv2 = block_series( + conv1, + n_input_plane=n_stages[0], + n_output_plane=n_stages[1], + count=n, + stride=(1, 1), + ) # Stage 1 + + conv3 = block_series( + conv2, + n_input_plane=n_stages[1], + n_output_plane=n_stages[2], + count=n, + stride=(2, 2), + ) # Stage 2 + + conv4 = block_series( + conv3, + n_input_plane=n_stages[2], + n_output_plane=n_stages[3], + count=n, + stride=(2, 2), + ) # Stage 3 + + batch_norm = layers.BatchNormalization()(conv4) + relu = layers.Activation("relu")(batch_norm) + + # Classifier + trunk_outputs = layers.GlobalAveragePooling2D()(relu) + outputs = layers.Dense( + num_classes, kernel_regularizer=regularizers.l2(WEIGHT_DECAY) + )(trunk_outputs) + + return keras.Model(inputs, outputs) + + +""" +We can now instantiate a Wide ResNet model like so. Note that the purpose of using a +Wide ResNet here is to keep the implementation as close to the original one +as possible. +""" + +wrn_model = get_network() +print(f"Model has {wrn_model.count_params()/1e6} Million parameters.") + +""" +## Instantiate AdaMatch model and compile it +""" + +reduce_lr = keras.optimizers.schedules.CosineDecay(LEARNING_RATE, TOTAL_STEPS, 0.25) +optimizer = keras.optimizers.Adam(reduce_lr) + +adamatch_trainer = AdaMatch(model=wrn_model, total_steps=TOTAL_STEPS) +adamatch_trainer.compile(optimizer=optimizer) + +""" +## Model training +""" + +total_ds = tf.data.Dataset.zip((final_source_ds, final_target_ds)) +adamatch_trainer.fit(total_ds, epochs=EPOCHS) + +""" +## Evaluation on the target and source test sets +""" + +# Compile the AdaMatch model to yield accuracy. +adamatch_trained_model = adamatch_trainer.model +adamatch_trained_model.compile(metrics=keras.metrics.SparseCategoricalAccuracy()) + +# Score on the target test set. +svhn_test = svhn_test.batch(TARGET_BATCH_SIZE).prefetch(AUTO) +_, accuracy = adamatch_trained_model.evaluate(svhn_test) +print(f"Accuracy on target test set: {accuracy * 100:.2f}%") + +""" +With more training, this score improves. When this same network is trained with +standard classification objective, it yields an accuracy of **7.20%** which is +significantly lower than what we got with AdaMatch. You can check out +[this notebook](https://colab.research.google.com/github/sayakpaul/AdaMatch-TF/blob/main/Vanilla_WideResNet.ipynb) +to learn more about the hyperparameters and other experimental details. +""" + + +# Utility function for preprocessing the source test set. 
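+# MNIST test images are 28x28 grayscale, so they are resized (with padding) to the
+# training resolution and tiled to three channels to match the RGB inputs the
+# network expects.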
+def prepare_test_ds_source(image, label): + image = tf.image.resize_with_pad(image, RESIZE_TO, RESIZE_TO) + image = tf.tile(image, [1, 1, 3]) + return image, label + + +source_test_ds = tf.data.Dataset.from_tensor_slices((mnist_x_test, mnist_y_test)) +source_test_ds = ( + source_test_ds.map(prepare_test_ds_source, num_parallel_calls=AUTO) + .batch(TARGET_BATCH_SIZE) + .prefetch(AUTO) +) + +# Evaluation on the source test set. +_, accuracy = adamatch_trained_model.evaluate(source_test_ds) +print(f"Accuracy on source test set: {accuracy * 100:.2f}%") + +""" +You can reproduce the results by using these +[model weights](https://github.com/sayakpaul/AdaMatch-TF/releases/tag/v1.0.0). +""" + +""" +**Example available on HuggingFace** +| Trained Model | Demo | +| :--: | :--: | +| [![Generic badge](https://img.shields.io/badge/%F0%9F%A4%97%20Model-AdaMatch%20Domain%20Adaption-black.svg)](https://huggingface.co/keras-io/adamatch-domain-adaption) | [![Generic badge](https://img.shields.io/badge/%F0%9F%A4%97%20Spaces-AdaMatch%20Domain%20Adaption-black.svg)](https://huggingface.co/spaces/keras-io/adamatch-domain-adaption) | +""" diff --git a/knowledge_base/vision/attention_mil_classification.py b/knowledge_base/vision/attention_mil_classification.py new file mode 100644 index 0000000000000000000000000000000000000000..a83dab57bd1e0d2f5a626b4772fc544b080b6608 --- /dev/null +++ b/knowledge_base/vision/attention_mil_classification.py @@ -0,0 +1,565 @@ +""" +Title: Classification using Attention-based Deep Multiple Instance Learning (MIL). +Author: [Mohamad Jaber](https://www.linkedin.com/in/mohamadjaber1/) +Date created: 2021/08/16 +Last modified: 2021/11/25 +Description: MIL approach to classify bags of instances and get their individual instance score. +Accelerator: GPU +""" + +""" +## Introduction + +### What is Multiple Instance Learning (MIL)? + +Usually, with supervised learning algorithms, the learner receives labels for a set of +instances. In the case of MIL, the learner receives labels for a set of bags, each of which +contains a set of instances. The bag is labeled positive if it contains at least +one positive instance, and negative if it does not contain any. + +### Motivation + +It is often assumed in image classification tasks that each image clearly represents a +class label. In medical imaging (e.g. computational pathology, etc.) an *entire image* +is represented by a single class label (cancerous/non-cancerous) or a region of interest +could be given. However, one will be interested in knowing which patterns in the image +is actually causing it to belong to that class. In this context, the image(s) will be +divided and the subimages will form the bag of instances. + +Therefore, the goals are to: + +1. Learn a model to predict a class label for a bag of instances. +2. Find out which instances within the bag caused a position class label +prediction. + +### Implementation + +The following steps describe how the model works: + +1. The feature extractor layers extract feature embeddings. +2. The embeddings are fed into the MIL attention layer to get +the attention scores. The layer is designed as permutation-invariant. +3. Input features and their corresponding attention scores are multiplied together. +4. The resulting output is passed to a softmax function for classification. + +### References + +- [Attention-based Deep Multiple Instance Learning](https://arxiv.org/abs/1802.04712). 
+- Some of the attention operator code implementation was inspired from https://github.com/utayao/Atten_Deep_MIL. +- Imbalanced data [tutorial](https://www.tensorflow.org/tutorials/structured_data/imbalanced_data) +by TensorFlow. + +""" +""" +## Setup +""" + +import numpy as np +import keras +from keras import layers +from keras import ops +from tqdm import tqdm +from matplotlib import pyplot as plt + +plt.style.use("ggplot") + +""" +## Create dataset + +We will create a set of bags and assign their labels according to their contents. +If at least one positive instance +is available in a bag, the bag is considered as a positive bag. If it does not contain any +positive instance, the bag will be considered as negative. + +### Configuration parameters + +- `POSITIVE_CLASS`: The desired class to be kept in the positive bag. +- `BAG_COUNT`: The number of training bags. +- `VAL_BAG_COUNT`: The number of validation bags. +- `BAG_SIZE`: The number of instances in a bag. +- `PLOT_SIZE`: The number of bags to plot. +- `ENSEMBLE_AVG_COUNT`: The number of models to create and average together. (Optional: +often results in better performance - set to 1 for single model) +""" + +POSITIVE_CLASS = 1 +BAG_COUNT = 1000 +VAL_BAG_COUNT = 300 +BAG_SIZE = 3 +PLOT_SIZE = 3 +ENSEMBLE_AVG_COUNT = 1 + +""" +### Prepare bags + +Since the attention operator is a permutation-invariant operator, an instance with a +positive class label is randomly placed among the instances in the positive bag. +""" + + +def create_bags(input_data, input_labels, positive_class, bag_count, instance_count): + # Set up bags. + bags = [] + bag_labels = [] + + # Normalize input data. + input_data = np.divide(input_data, 255.0) + + # Count positive samples. + count = 0 + + for _ in range(bag_count): + # Pick a fixed size random subset of samples. + index = np.random.choice(input_data.shape[0], instance_count, replace=False) + instances_data = input_data[index] + instances_labels = input_labels[index] + + # By default, all bags are labeled as 0. + bag_label = 0 + + # Check if there is at least a positive class in the bag. + if positive_class in instances_labels: + # Positive bag will be labeled as 1. + bag_label = 1 + count += 1 + + bags.append(instances_data) + bag_labels.append(np.array([bag_label])) + + print(f"Positive bags: {count}") + print(f"Negative bags: {bag_count - count}") + + return (list(np.swapaxes(bags, 0, 1)), np.array(bag_labels)) + + +# Load the MNIST dataset. +(x_train, y_train), (x_val, y_val) = keras.datasets.mnist.load_data() + +# Create training data. +train_data, train_labels = create_bags( + x_train, y_train, POSITIVE_CLASS, BAG_COUNT, BAG_SIZE +) + +# Create validation data. +val_data, val_labels = create_bags( + x_val, y_val, POSITIVE_CLASS, VAL_BAG_COUNT, BAG_SIZE +) + +""" +## Create the model + +We will now build the attention layer, prepare some utilities, then build and train the +entire model. + +### Attention operator implementation + +The output size of this layer is decided by the size of a single bag. + +The attention mechanism uses a weighted average of instances in a bag, in which the sum +of the weights must equal to 1 (invariant of the bag size). + +The weight matrices (parameters) are **w** and **v**. To include positive and negative +values, hyperbolic tangent element-wise non-linearity is utilized. + +A **Gated attention mechanism** can be used to deal with complex relations. Another weight +matrix, **u**, is added to the computation. 
+A sigmoid non-linearity is used to overcome approximately linear behavior for *x* โˆˆ [โˆ’1, 1] +by hyperbolic tangent non-linearity. +""" + + +class MILAttentionLayer(layers.Layer): + """Implementation of the attention-based Deep MIL layer. + + Args: + weight_params_dim: Positive Integer. Dimension of the weight matrix. + kernel_initializer: Initializer for the `kernel` matrix. + kernel_regularizer: Regularizer function applied to the `kernel` matrix. + use_gated: Boolean, whether or not to use the gated mechanism. + + Returns: + List of 2D tensors with BAG_SIZE length. + The tensors are the attention scores after softmax with shape `(batch_size, 1)`. + """ + + def __init__( + self, + weight_params_dim, + kernel_initializer="glorot_uniform", + kernel_regularizer=None, + use_gated=False, + **kwargs, + ): + super().__init__(**kwargs) + + self.weight_params_dim = weight_params_dim + self.use_gated = use_gated + + self.kernel_initializer = keras.initializers.get(kernel_initializer) + self.kernel_regularizer = keras.regularizers.get(kernel_regularizer) + + self.v_init = self.kernel_initializer + self.w_init = self.kernel_initializer + self.u_init = self.kernel_initializer + + self.v_regularizer = self.kernel_regularizer + self.w_regularizer = self.kernel_regularizer + self.u_regularizer = self.kernel_regularizer + + def build(self, input_shape): + # Input shape. + # List of 2D tensors with shape: (batch_size, input_dim). + input_dim = input_shape[0][1] + + self.v_weight_params = self.add_weight( + shape=(input_dim, self.weight_params_dim), + initializer=self.v_init, + name="v", + regularizer=self.v_regularizer, + trainable=True, + ) + + self.w_weight_params = self.add_weight( + shape=(self.weight_params_dim, 1), + initializer=self.w_init, + name="w", + regularizer=self.w_regularizer, + trainable=True, + ) + + if self.use_gated: + self.u_weight_params = self.add_weight( + shape=(input_dim, self.weight_params_dim), + initializer=self.u_init, + name="u", + regularizer=self.u_regularizer, + trainable=True, + ) + else: + self.u_weight_params = None + + self.input_built = True + + def call(self, inputs): + # Assigning variables from the number of inputs. + instances = [self.compute_attention_scores(instance) for instance in inputs] + + # Stack instances into a single tensor. + instances = ops.stack(instances) + + # Apply softmax over instances such that the output summation is equal to 1. + alpha = ops.softmax(instances, axis=0) + + # Split to recreate the same array of tensors we had as inputs. + return [alpha[i] for i in range(alpha.shape[0])] + + def compute_attention_scores(self, instance): + # Reserve in-case "gated mechanism" used. + original_instance = instance + + # tanh(v*h_k^T) + instance = ops.tanh(ops.tensordot(instance, self.v_weight_params, axes=1)) + + # for learning non-linear relations efficiently. + if self.use_gated: + instance = instance * ops.sigmoid( + ops.tensordot(original_instance, self.u_weight_params, axes=1) + ) + + # w^T*(tanh(v*h_k^T)) / w^T*(tanh(v*h_k^T)*sigmoid(u*h_k^T)) + return ops.tensordot(instance, self.w_weight_params, axes=1) + + +""" +## Visualizer tool + +Plot the number of bags (given by `PLOT_SIZE`) with respect to the class. + +Moreover, if activated, the class label prediction with its associated instance score +for each bag (after the model has been trained) can be seen. +""" + + +def plot(data, labels, bag_class, predictions=None, attention_weights=None): + """ "Utility for plotting bags and attention weights. 
+
+    Args:
+      data: Input data that contains the bags of instances.
+      labels: The associated bag labels of the input data.
+      bag_class: String name of the desired bag class.
+        The options are: "positive" or "negative".
+      predictions: Class label predictions from the model.
+        If you don't specify anything, ground truth labels will be used.
+      attention_weights: Attention weights for each instance within the input data.
+        If you don't specify anything, the values won't be displayed.
+    """
+    labels = np.array(labels).reshape(-1)
+
+    if bag_class == "positive":
+        if predictions is not None:
+            labels = np.where(predictions.argmax(1) == 1)[0]
+            bags = np.array(data)[:, labels[0:PLOT_SIZE]]
+
+        else:
+            labels = np.where(labels == 1)[0]
+            bags = np.array(data)[:, labels[0:PLOT_SIZE]]
+
+    elif bag_class == "negative":
+        if predictions is not None:
+            labels = np.where(predictions.argmax(1) == 0)[0]
+            bags = np.array(data)[:, labels[0:PLOT_SIZE]]
+        else:
+            labels = np.where(labels == 0)[0]
+            bags = np.array(data)[:, labels[0:PLOT_SIZE]]
+
+    else:
+        print(f"There is no class {bag_class}")
+        return
+
+    print(f"The bag class label is {bag_class}")
+    for i in range(PLOT_SIZE):
+        figure = plt.figure(figsize=(8, 8))
+        print(f"Bag number: {labels[i]}")
+        for j in range(BAG_SIZE):
+            image = bags[j][i]
+            figure.add_subplot(1, BAG_SIZE, j + 1)
+            plt.grid(False)
+            if attention_weights is not None:
+                plt.title(np.around(attention_weights[labels[i]][j], 2))
+            plt.imshow(image)
+        plt.show()
+
+
+# Plot some of validation data bags per class.
+plot(val_data, val_labels, "positive")
+plot(val_data, val_labels, "negative")
+
+"""
+## Create model
+
+First, we will create some embeddings per instance, invoke the attention operator,
+and then use the softmax function to output the class probabilities.
+"""
+
+
+def create_model(instance_shape):
+    # Extract features from inputs.
+    inputs, embeddings = [], []
+    shared_dense_layer_1 = layers.Dense(128, activation="relu")
+    shared_dense_layer_2 = layers.Dense(64, activation="relu")
+    for _ in range(BAG_SIZE):
+        inp = layers.Input(instance_shape)
+        flatten = layers.Flatten()(inp)
+        dense_1 = shared_dense_layer_1(flatten)
+        dense_2 = shared_dense_layer_2(dense_1)
+        inputs.append(inp)
+        embeddings.append(dense_2)
+
+    # Invoke the attention layer.
+    alpha = MILAttentionLayer(
+        weight_params_dim=256,
+        kernel_regularizer=keras.regularizers.L2(0.01),
+        use_gated=True,
+        name="alpha",
+    )(embeddings)
+
+    # Multiply attention weights with the embeddings.
+    multiply_layers = [
+        layers.multiply([alpha[i], embeddings[i]]) for i in range(len(alpha))
+    ]
+
+    # Concatenate layers.
+    concat = layers.concatenate(multiply_layers, axis=1)
+
+    # Classification output node.
+    output = layers.Dense(2, activation="softmax")(concat)
+
+    return keras.Model(inputs, output)
+
+
+"""
+## Class weights
+
+Since this kind of problem can easily turn into an imbalanced data classification
+problem, class weighting should be considered.
+
+Let's say there are 1000 bags. There can often be cases where ~90% of the bags do not
+contain any positive label and ~10% do.
+Such data can be referred to as **imbalanced data**.
+
+Using class weights, the model will tend to give a higher weight to the rare class.
+"""
+
+
+def compute_class_weights(labels):
+    # Count number of positive and negative bags.
+    negative_count = len(np.where(labels == 0)[0])
+    positive_count = len(np.where(labels == 1)[0])
+    total_count = negative_count + positive_count
+
+    # Build class weight dictionary.
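+    # Each class is weighted inversely to its frequency, scaled by total_count / 2
+    # so that the average weight over all bags stays close to 1.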
+ return { + 0: (1 / negative_count) * (total_count / 2), + 1: (1 / positive_count) * (total_count / 2), + } + + +""" +## Build and train model + +The model is built and trained in this section. +""" + + +def train(train_data, train_labels, val_data, val_labels, model): + # Train model. + # Prepare callbacks. + # Path where to save best weights. + + # Take the file name from the wrapper. + file_path = "/tmp/best_model.weights.h5" + + # Initialize model checkpoint callback. + model_checkpoint = keras.callbacks.ModelCheckpoint( + file_path, + monitor="val_loss", + verbose=0, + mode="min", + save_best_only=True, + save_weights_only=True, + ) + + # Initialize early stopping callback. + # The model performance is monitored across the validation data and stops training + # when the generalization error cease to decrease. + early_stopping = keras.callbacks.EarlyStopping( + monitor="val_loss", patience=10, mode="min" + ) + + # Compile model. + model.compile( + optimizer="adam", + loss="sparse_categorical_crossentropy", + metrics=["accuracy"], + ) + + # Fit model. + model.fit( + train_data, + train_labels, + validation_data=(val_data, val_labels), + epochs=20, + class_weight=compute_class_weights(train_labels), + batch_size=1, + callbacks=[early_stopping, model_checkpoint], + verbose=0, + ) + + # Load best weights. + model.load_weights(file_path) + + return model + + +# Building model(s). +instance_shape = train_data[0][0].shape +models = [create_model(instance_shape) for _ in range(ENSEMBLE_AVG_COUNT)] + +# Show single model architecture. +print(models[0].summary()) + +# Training model(s). +trained_models = [ + train(train_data, train_labels, val_data, val_labels, model) + for model in tqdm(models) +] + +""" +## Model evaluation + +The models are now ready for evaluation. +With each model we also create an associated intermediate model to get the +weights from the attention layer. + +We will compute a prediction for each of our `ENSEMBLE_AVG_COUNT` models, and +average them together for our final prediction. +""" + + +def predict(data, labels, trained_models): + # Collect info per model. + models_predictions = [] + models_attention_weights = [] + models_losses = [] + models_accuracies = [] + + for model in trained_models: + # Predict output classes on data. + predictions = model.predict(data) + models_predictions.append(predictions) + + # Create intermediate model to get MIL attention layer weights. + intermediate_model = keras.Model(model.input, model.get_layer("alpha").output) + + # Predict MIL attention layer weights. + intermediate_predictions = intermediate_model.predict(data) + + attention_weights = np.squeeze(np.swapaxes(intermediate_predictions, 1, 0)) + models_attention_weights.append(attention_weights) + + loss, accuracy = model.evaluate(data, labels, verbose=0) + models_losses.append(loss) + models_accuracies.append(accuracy) + + print( + f"The average loss and accuracy are {np.sum(models_losses, axis=0) / ENSEMBLE_AVG_COUNT:.2f}" + f" and {100 * np.sum(models_accuracies, axis=0) / ENSEMBLE_AVG_COUNT:.2f} % resp." + ) + + return ( + np.sum(models_predictions, axis=0) / ENSEMBLE_AVG_COUNT, + np.sum(models_attention_weights, axis=0) / ENSEMBLE_AVG_COUNT, + ) + + +# Evaluate and predict classes and attention scores on validation data. +class_predictions, attention_params = predict(val_data, val_labels, trained_models) + +# Plot some results from our validation data. 
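+# The attention weight of each instance is shown above it, so the instance that
+# drives a positive bag prediction should stand out with a clearly larger weight.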
+plot( + val_data, + val_labels, + "positive", + predictions=class_predictions, + attention_weights=attention_params, +) +plot( + val_data, + val_labels, + "negative", + predictions=class_predictions, + attention_weights=attention_params, +) + +""" +## Conclusion + +From the above plot, you can notice that the weights always sum to 1. In a +positively predict bag, the instance which resulted in the positive labeling will have +a substantially higher attention score than the rest of the bag. However, in a negatively +predicted bag, there are two cases: + +* All instances will have approximately similar scores. +* An instance will have relatively higher score (but not as high as of a positive instance). +This is because the feature space of this instance is close to that of the positive instance. + +## Remarks + +- If the model is overfit, the weights will be equally distributed for all bags. Hence, +the regularization techniques are necessary. +- In the paper, the bag sizes can differ from one bag to another. For simplicity, the +bag sizes are fixed here. +- In order not to rely on the random initial weights of a single model, averaging ensemble +methods should be considered. +""" diff --git a/knowledge_base/vision/autoencoder.py b/knowledge_base/vision/autoencoder.py new file mode 100644 index 0000000000000000000000000000000000000000..a6d36cfb23f69853934663e806ca4c30578bc55a --- /dev/null +++ b/knowledge_base/vision/autoencoder.py @@ -0,0 +1,165 @@ +""" +Title: Convolutional autoencoder for image denoising +Author: [Santiago L. Valdarrama](https://twitter.com/svpino) +Date created: 2021/03/01 +Last modified: 2021/03/01 +Description: How to train a deep convolutional autoencoder for image denoising. +Accelerator: GPU +""" + +""" +## Introduction + +This example demonstrates how to implement a deep convolutional autoencoder +for image denoising, mapping noisy digits images from the MNIST dataset to +clean digits images. This implementation is based on an original blog post +titled [Building Autoencoders in Keras](https://blog.keras.io/building-autoencoders-in-keras.html) +by [Franรงois Chollet](https://twitter.com/fchollet). +""" + +""" +## Setup +""" + +import numpy as np +import matplotlib.pyplot as plt + +from keras import layers +from keras.datasets import mnist +from keras.models import Model + + +def preprocess(array): + """Normalizes the supplied array and reshapes it.""" + array = array.astype("float32") / 255.0 + array = np.reshape(array, (len(array), 28, 28, 1)) + return array + + +def noise(array): + """Adds random noise to each image in the supplied array.""" + noise_factor = 0.4 + noisy_array = array + noise_factor * np.random.normal( + loc=0.0, scale=1.0, size=array.shape + ) + + return np.clip(noisy_array, 0.0, 1.0) + + +def display(array1, array2): + """Displays ten random images from each array.""" + n = 10 + indices = np.random.randint(len(array1), size=n) + images1 = array1[indices, :] + images2 = array2[indices, :] + + plt.figure(figsize=(20, 4)) + for i, (image1, image2) in enumerate(zip(images1, images2)): + ax = plt.subplot(2, n, i + 1) + plt.imshow(image1.reshape(28, 28)) + plt.gray() + ax.get_xaxis().set_visible(False) + ax.get_yaxis().set_visible(False) + + ax = plt.subplot(2, n, i + 1 + n) + plt.imshow(image2.reshape(28, 28)) + plt.gray() + ax.get_xaxis().set_visible(False) + ax.get_yaxis().set_visible(False) + + plt.show() + + +""" +## Prepare the data +""" + +# Since we only need images from the dataset to encode and decode, we +# won't use the labels. 
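+# `preprocess()` scales the images to [0, 1] and adds a channel dimension;
+# `noise()` then produces the corrupted copies we will learn to denoise.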
+(train_data, _), (test_data, _) = mnist.load_data() + +# Normalize and reshape the data +train_data = preprocess(train_data) +test_data = preprocess(test_data) + +# Create a copy of the data with added noise +noisy_train_data = noise(train_data) +noisy_test_data = noise(test_data) + +# Display the train data and a version of it with added noise +display(train_data, noisy_train_data) + +""" +## Build the autoencoder + +We are going to use the Functional API to build our convolutional autoencoder. +""" + +input = layers.Input(shape=(28, 28, 1)) + +# Encoder +x = layers.Conv2D(32, (3, 3), activation="relu", padding="same")(input) +x = layers.MaxPooling2D((2, 2), padding="same")(x) +x = layers.Conv2D(32, (3, 3), activation="relu", padding="same")(x) +x = layers.MaxPooling2D((2, 2), padding="same")(x) + +# Decoder +x = layers.Conv2DTranspose(32, (3, 3), strides=2, activation="relu", padding="same")(x) +x = layers.Conv2DTranspose(32, (3, 3), strides=2, activation="relu", padding="same")(x) +x = layers.Conv2D(1, (3, 3), activation="sigmoid", padding="same")(x) + +# Autoencoder +autoencoder = Model(input, x) +autoencoder.compile(optimizer="adam", loss="binary_crossentropy") +autoencoder.summary() + +""" +Now we can train our autoencoder using `train_data` as both our input data +and target. Notice we are setting up the validation data using the same +format. +""" + +autoencoder.fit( + x=train_data, + y=train_data, + epochs=50, + batch_size=128, + shuffle=True, + validation_data=(test_data, test_data), +) + +""" +Let's predict on our test dataset and display the original image together with +the prediction from our autoencoder. + +Notice how the predictions are pretty close to the original images, although +not quite the same. +""" + +predictions = autoencoder.predict(test_data) +display(test_data, predictions) + +""" +Now that we know that our autoencoder works, let's retrain it using the noisy +data as our input and the clean data as our target. We want our autoencoder to +learn how to denoise the images. +""" + +autoencoder.fit( + x=noisy_train_data, + y=train_data, + epochs=100, + batch_size=128, + shuffle=True, + validation_data=(noisy_test_data, test_data), +) + +""" +Let's now predict on the noisy data and display the results of our autoencoder. + +Notice how the autoencoder does an amazing job at removing the noise from the +input images. +""" + +predictions = autoencoder.predict(noisy_test_data) +display(noisy_test_data, predictions) diff --git a/knowledge_base/vision/barlow_twins.py b/knowledge_base/vision/barlow_twins.py new file mode 100644 index 0000000000000000000000000000000000000000..aaea461b19802088f46890d2664693a3ea3b9c5d --- /dev/null +++ b/knowledge_base/vision/barlow_twins.py @@ -0,0 +1,1050 @@ +""" +Title: Barlow Twins for Contrastive SSL +Author: [Abhiraam Eranti](https://github.com/dewball345) +Date created: 11/4/21 +Last modified: 12/20/21 +Description: A keras implementation of Barlow Twins (constrastive SSL with redundancy reduction). +Accelerator: GPU +""" + +""" +## Introduction +""" + +""" +Self-supervised learning (SSL) is a relatively novel technique in which a model +learns from unlabeled data, and is often used when the data is corrupted or +if there is very little of it. A practical use for SSL is to create +intermediate embeddings that are learned from the data. These embeddings are +based on the dataset itself, with similar images having similar embeddings, and +vice versa. 
They are then attached to the rest of the model, which uses them as
+additional information to learn and make better predictions. Ideally, these
+embeddings should contain as much information and insight about the data as
+possible. However, a common problem that arises is that the model creates
+embeddings that are redundant. For example, if two images are similar, the model
+may create embeddings that are just a string of 1's, or some other value that
+contains repeating bits of information. This is no better than a one-hot
+encoding or just having one bit as the model's representations; it defeats the
+purpose of the embeddings, as they do not learn as much about the dataset as
+possible. Other approaches address this by carefully configuring the model so
+that it tries not to be redundant.
+
+Barlow Twins is a new approach to this problem; while other solutions mainly
+tackle the first goal of invariance (similar images have similar embeddings),
+the Barlow Twins method also prioritizes the goal of reducing redundancy.
+
+It also has the advantage of being much simpler than other methods, and its
+model architecture is symmetric, meaning that both twins in the model do the
+same thing. It is also near state-of-the-art on ImageNet, even exceeding
+methods like SimCLR.
+
+One disadvantage of Barlow Twins is that it is heavily dependent on
+augmentation, suffering major drops in accuracy without it.
+
+TL;DR: Barlow Twins creates representations that are:
+
+* Invariant.
+* Not redundant, and carry as much information about the dataset as possible.
+
+It is also simpler than other methods.
+
+This notebook can train a Barlow Twins model and reach up to
+64% validation accuracy on the CIFAR-10 dataset.
+"""
+
+"""
+![image](https://i.imgur.com/G6LnEPT.png)
+"""
+
+"""
+### High-Level Theory
+"""
+
+"""
+The model takes two versions of the same image (with different augmentations) as
+input. Then it takes a prediction of each of them, creating representations.
+They are then used to make a cross-correlation matrix.
+
+Cross-correlation matrix:
+```
+(pred_1.T @ pred_2) / batch_size
+```
+
+The cross-correlation matrix measures the correlation between the output
+neurons of the two representations the model produces for the two augmented
+versions of the data. Ideally, the cross-correlation matrix should look like an
+identity matrix if the two images are the same.
+
+When this happens, it means that the representations:
+
+1. Are invariant. The diagonal shows the correlation between each
+representation's neurons and its corresponding augmented one. Because the two
+versions come from the same image, the diagonal of the matrix should show a
+strong correlation between them. If the images were different, there would be
+no strong diagonal.
+2. Do not show signs of redundancy. If a neuron correlates with a non-diagonal
+neuron, it means that it is not correctly identifying similarities between the
+two augmented images, i.e. the representation is redundant.
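+
+As a small, hypothetical NumPy illustration (separate from the model code below),
+you can build this matrix for two nearly identical batches of representations and
+see that it is close to the identity: a strong diagonal (invariance) and small
+off-diagonal entries (little redundancy):
+
+```
+import numpy as np
+
+batch_size, n_units = 256, 3
+rng = np.random.default_rng(0)
+
+# Two "views" of the same batch; view 2 is view 1 plus a little noise.
+z_a = rng.normal(size=(batch_size, n_units))
+z_b = z_a + 0.05 * rng.normal(size=(batch_size, n_units))
+
+# Normalize each output unit (column) to zero mean and unit variance.
+z_a = (z_a - z_a.mean(axis=0)) / z_a.std(axis=0)
+z_b = (z_b - z_b.mean(axis=0)) / z_b.std(axis=0)
+
+# Cross-correlation matrix, shape (n_units, n_units).
+c = (z_a.T @ z_b) / batch_size
+print(np.round(c, 2))  # ~1 on the diagonal, ~0 elsewhere
+```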
+ +Here is a good way of understanding in pseudocode(information from the original +paper): + +``` +c[i][i] = 1 +c[i][j] = 0 + +where: + c is the cross-correlation matrix + i is the index of one representation's neuron + j is the index of the second representation's neuron +``` +""" + +""" +Taken from the original paper: [Barlow Twins: Self-Supervised Learning via Redundancy +Reduction](https://arxiv.org/abs/2103.03230) +""" + +""" +### References +""" + +""" +Paper: +[Barlow Twins: Self-Supervised Learning via Redundancy +Reduction](https://arxiv.org/abs/2103.03230) + +Original Implementation: + [facebookresearch/barlowtwins](https://github.com/facebookresearch/barlowtwins) + + +""" + +""" +## Setup +""" + +"""shell +pip install tensorflow-addons +""" + +import os + +# slightly faster improvements, on the first epoch 30 second decrease and a 1-2 second +# decrease in epoch time. Overall saves approx. 5 min of training time + +# Allocates two threads for a gpu private which allows more operations to be +# done faster +os.environ["TF_GPU_THREAD_MODE"] = "gpu_private" + +import tensorflow as tf # framework +from tensorflow import keras # for tf.keras +import tensorflow_addons as tfa # LAMB optimizer and gaussian_blur_2d function +import numpy as np # np.random.random +import matplotlib.pyplot as plt # graphs +import datetime # tensorboard logs naming + +# XLA optimization for faster performance(up to 10-15 minutes total time saved) +tf.config.optimizer.set_jit(True) + +""" +## Load the CIFAR-10 dataset +""" + +[ + (train_features, train_labels), + (test_features, test_labels), +] = keras.datasets.cifar10.load_data() + +train_features = train_features / 255.0 +test_features = test_features / 255.0 + +""" +## Necessary Hyperparameters +""" + +# Batch size of dataset +BATCH_SIZE = 512 +# Width and height of image +IMAGE_SIZE = 32 + +""" +## Augmentation Utilities +The Barlow twins algorithm is heavily reliant on +Augmentation. One unique feature of the method is that sometimes, augmentations +probabilistically occur. + +**Augmentations** + +* *RandomToGrayscale*: randomly applies grayscale to image 20% of the time +* *RandomColorJitter*: randomly applies color jitter 80% of the time +* *RandomFlip*: randomly flips image horizontally 50% of the time +* *RandomResizedCrop*: randomly crops an image to a random size then resizes. This +happens 100% of the time +* *RandomSolarize*: randomly applies solarization to an image 20% of the time +* *RandomBlur*: randomly blurs an image 20% of the time +""" + + +class Augmentation(keras.layers.Layer): + """Base augmentation class. + + Base augmentation class. Contains the random_execute method. + + Methods: + random_execute: method that returns true or false based + on a probability. Used to determine whether an augmentation + will be run. + """ + + def __init__(self): + super().__init__() + + @tf.function + def random_execute(self, prob: float) -> bool: + """random_execute function. + + Arguments: + prob: a float value from 0-1 that determines the + probability. + + Returns: + returns true or false based on the probability. + """ + + return tf.random.uniform([], minval=0, maxval=1) < prob + + +class RandomToGrayscale(Augmentation): + """RandomToGrayscale class. + + RandomToGrayscale class. Randomly makes an image + grayscaled based on the random_execute method. There + is a 20% chance that an image will be grayscaled. + + Methods: + call: method that grayscales an image 20% of + the time. 
+ """ + + @tf.function + def call(self, x: tf.Tensor) -> tf.Tensor: + """call function. + + Arguments: + x: a tf.Tensor representing the image. + + Returns: + returns a grayscaled version of the image 20% of the time + and the original image 80% of the time. + """ + + if self.random_execute(0.2): + x = tf.image.rgb_to_grayscale(x) + x = tf.tile(x, [1, 1, 3]) + return x + + +class RandomColorJitter(Augmentation): + """RandomColorJitter class. + + RandomColorJitter class. Randomly adds color jitter to an image. + Color jitter means to add random brightness, contrast, + saturation, and hue to an image. There is a 80% chance that an + image will be randomly color-jittered. + + Methods: + call: method that color-jitters an image 80% of + the time. + """ + + @tf.function + def call(self, x: tf.Tensor) -> tf.Tensor: + """call function. + + Adds color jitter to image, including: + Brightness change by a max-delta of 0.8 + Contrast change by a max-delta of 0.8 + Saturation change by a max-delta of 0.8 + Hue change by a max-delta of 0.2 + Originally, the same deltas of the original paper + were used, but a performance boost of almost 2% was found + when doubling them. + + Arguments: + x: a tf.Tensor representing the image. + + Returns: + returns a color-jittered version of the image 80% of the time + and the original image 20% of the time. + """ + + if self.random_execute(0.8): + x = tf.image.random_brightness(x, 0.8) + x = tf.image.random_contrast(x, 0.4, 1.6) + x = tf.image.random_saturation(x, 0.4, 1.6) + x = tf.image.random_hue(x, 0.2) + return x + + +class RandomFlip(Augmentation): + """RandomFlip class. + + RandomFlip class. Randomly flips image horizontally. There is a 50% + chance that an image will be randomly flipped. + + Methods: + call: method that flips an image 50% of + the time. + """ + + @tf.function + def call(self, x: tf.Tensor) -> tf.Tensor: + """call function. + + Randomly flips the image. + + Arguments: + x: a tf.Tensor representing the image. + + Returns: + returns a flipped version of the image 50% of the time + and the original image 50% of the time. + """ + + if self.random_execute(0.5): + x = tf.image.random_flip_left_right(x) + return x + + +class RandomResizedCrop(Augmentation): + """RandomResizedCrop class. + + RandomResizedCrop class. Randomly crop an image to a random size, + then resize the image back to the original size. + + Attributes: + image_size: The dimension of the image + + Methods: + __call__: method that does random resize crop to the image. + """ + + def __init__(self, image_size): + super().__init__() + self.image_size = image_size + + def call(self, x: tf.Tensor) -> tf.Tensor: + """call function. + + Does random resize crop by randomly cropping an image to a random + size 75% - 100% the size of the image. Then resizes it. + + Arguments: + x: a tf.Tensor representing the image. + + Returns: + returns a randomly cropped image. + """ + + rand_size = tf.random.uniform( + shape=[], + minval=int(0.75 * self.image_size), + maxval=1 * self.image_size, + dtype=tf.int32, + ) + + crop = tf.image.random_crop(x, (rand_size, rand_size, 3)) + crop_resize = tf.image.resize(crop, (self.image_size, self.image_size)) + return crop_resize + + +class RandomSolarize(Augmentation): + """RandomSolarize class. + + RandomSolarize class. Randomly solarizes an image. + Solarization is when pixels accidentally flip to an inverted state. + + Methods: + call: method that does random solarization 20% of the time. 
+ """ + + @tf.function + def call(self, x: tf.Tensor) -> tf.Tensor: + """call function. + + Randomly solarizes the image. + + Arguments: + x: a tf.Tensor representing the image. + + Returns: + returns a solarized version of the image 20% of the time + and the original image 80% of the time. + """ + + if self.random_execute(0.2): + # flips abnormally low pixels to abnormally high pixels + x = tf.where(x < 10, x, 255 - x) + return x + + +class RandomBlur(Augmentation): + """RandomBlur class. + + RandomBlur class. Randomly blurs an image. + + Methods: + call: method that does random blur 20% of the time. + """ + + @tf.function + def call(self, x: tf.Tensor) -> tf.Tensor: + """call function. + + Randomly solarizes the image. + + Arguments: + x: a tf.Tensor representing the image. + + Returns: + returns a blurred version of the image 20% of the time + and the original image 80% of the time. + """ + + if self.random_execute(0.2): + s = np.random.random() + return tfa.image.gaussian_filter2d(image=x, sigma=s) + return x + + +class RandomAugmentor(keras.Model): + """RandomAugmentor class. + + RandomAugmentor class. Chains all the augmentations into + one pipeline. + + Attributes: + image_size: An integer represing the width and height + of the image. Designed to be used for square images. + random_resized_crop: Instance variable representing the + RandomResizedCrop layer. + random_flip: Instance variable representing the + RandomFlip layer. + random_color_jitter: Instance variable representing the + RandomColorJitter layer. + random_blur: Instance variable representing the + RandomBlur layer + random_to_grayscale: Instance variable representing the + RandomToGrayscale layer + random_solarize: Instance variable representing the + RandomSolarize layer + + Methods: + call: chains layers in pipeline together + """ + + def __init__(self, image_size: int): + super().__init__() + + self.image_size = image_size + self.random_resized_crop = RandomResizedCrop(image_size) + self.random_flip = RandomFlip() + self.random_color_jitter = RandomColorJitter() + self.random_blur = RandomBlur() + self.random_to_grayscale = RandomToGrayscale() + self.random_solarize = RandomSolarize() + + def call(self, x: tf.Tensor) -> tf.Tensor: + x = self.random_resized_crop(x) + x = self.random_flip(x) + x = self.random_color_jitter(x) + x = self.random_blur(x) + x = self.random_to_grayscale(x) + x = self.random_solarize(x) + + x = tf.clip_by_value(x, 0, 1) + return x + + +bt_augmentor = RandomAugmentor(IMAGE_SIZE) + +""" +## Data Loading + +A class that creates the barlow twins' dataset. + +The dataset consists of two copies of each image, with each copy receiving different +augmentations. +""" + + +class BTDatasetCreator: + """Barlow twins dataset creator class. + + BTDatasetCreator class. Responsible for creating the + barlow twins' dataset. + + Attributes: + options: tf.data.Options needed to configure a setting + that may improve performance. + seed: random seed for shuffling. Used to synchronize two + augmented versions. + augmentor: augmentor used for augmentation. + + Methods: + __call__: creates barlow dataset. + augmented_version: creates 1 half of the dataset. 
+ """ + + def __init__(self, augmentor: RandomAugmentor, seed: int = 1024): + self.options = tf.data.Options() + self.options.threading.max_intra_op_parallelism = 1 + self.seed = seed + self.augmentor = augmentor + + def augmented_version(self, ds: list) -> tf.data.Dataset: + return ( + tf.data.Dataset.from_tensor_slices(ds) + .shuffle(1000, seed=self.seed) + .map(self.augmentor, num_parallel_calls=tf.data.AUTOTUNE) + .batch(BATCH_SIZE, drop_remainder=True) + .prefetch(tf.data.AUTOTUNE) + .with_options(self.options) + ) + + def __call__(self, ds: list) -> tf.data.Dataset: + a1 = self.augmented_version(ds) + a2 = self.augmented_version(ds) + + return tf.data.Dataset.zip((a1, a2)).with_options(self.options) + + +augment_versions = BTDatasetCreator(bt_augmentor)(train_features) + +""" +View examples of dataset. +""" + +sample_augment_versions = iter(augment_versions) + + +def plot_values(batch: tuple): + fig, axs = plt.subplots(3, 3) + fig1, axs1 = plt.subplots(3, 3) + + fig.suptitle("Augmentation 1") + fig1.suptitle("Augmentation 2") + + a1, a2 = batch + + # plots images on both tables + for i in range(3): + for j in range(3): + # CHANGE(add / 255) + axs[i][j].imshow(a1[3 * i + j]) + axs[i][j].axis("off") + axs1[i][j].imshow(a2[3 * i + j]) + axs1[i][j].axis("off") + + plt.show() + + +plot_values(next(sample_augment_versions)) + +""" +## Pseudocode of loss and model +The following sections follow the original author's pseudocode containing both model and +loss functions(see diagram below). Also contains a reference of variables used. +""" + +""" +![pseudocode](https://i.imgur.com/Tlrootj.png) +""" + +""" +Reference: + +``` +y_a: first augmented version of original image. +y_b: second augmented version of original image. +z_a: model representation(embeddings) of y_a. +z_b: model representation(embeddings) of y_b. +z_a_norm: normalized z_a. +z_b_norm: normalized z_b. +c: cross correlation matrix. +c_diff: diagonal portion of loss(invariance term). +off_diag: off-diagonal portion of loss(redundancy reduction term). +``` +""" + +""" +## BarlowLoss: barlow twins model's loss function + +Barlow Twins uses the cross correlation matrix for its loss. There are two parts to the +loss function: + +* ***The invariance term***(diagonal). This part is used to make the diagonals of the +matrix into 1s. When this is the case, the matrix shows that the images are +correlated(same). + * The loss function subtracts 1 from the diagonal and squares the values. +* ***The redundancy reduction term***(off-diagonal). Here, the barlow twins loss +function aims to make these values zero. As mentioned before, it is redundant if the +representation neurons are correlated with values that are not on the diagonal. + * Off diagonals are squared. + +After this the two parts are summed together. + + + + +""" + + +class BarlowLoss(keras.losses.Loss): + """BarlowLoss class. + + BarlowLoss class. Creates a loss function based on the cross-correlation + matrix. + + Attributes: + batch_size: the batch size of the dataset + lambda_amt: the value for lambda(used in cross_corr_matrix_loss) + + Methods: + __init__: gets instance variables + call: gets the loss based on the cross-correlation matrix + make_diag_zeros: Used in calculating off-diagonal section + of loss function; makes diagonals zeros. + cross_corr_matrix_loss: creates loss based on cross correlation + matrix. + """ + + def __init__(self, batch_size: int): + """__init__ method. 
+ + Gets the instance variables + + Arguments: + batch_size: An integer value representing the batch size of the + dataset. Used for cross correlation matrix calculation. + """ + + super().__init__() + self.lambda_amt = 5e-3 + self.batch_size = batch_size + + def get_off_diag(self, c: tf.Tensor) -> tf.Tensor: + """get_off_diag method. + + Makes the diagonals of the cross correlation matrix zeros. + This is used in the off-diagonal portion of the loss function, + where we take the squares of the off-diagonal values and sum them. + + Arguments: + c: A tf.tensor that represents the cross correlation + matrix + + Returns: + Returns a tf.tensor which represents the cross correlation + matrix with its diagonals as zeros. + """ + + zero_diag = tf.zeros(c.shape[-1]) + return tf.linalg.set_diag(c, zero_diag) + + def cross_corr_matrix_loss(self, c: tf.Tensor) -> tf.Tensor: + """cross_corr_matrix_loss method. + + Gets the loss based on the cross correlation matrix. + We want the diagonals to be 1's and everything else to be + zeros to show that the two augmented images are similar. + + Loss function procedure: + take the diagonal of the cross-correlation matrix, subtract by 1, + and square that value so no negatives. + + Take the off-diagonal of the cc-matrix(see get_off_diag()), + square those values to get rid of negatives and increase the value, + and multiply it by a lambda to weight it such that it is of equal + value to the optimizer as the diagonal(there are more values off-diag + then on-diag) + + Take the sum of the first and second parts and then sum them together. + + Arguments: + c: A tf.tensor that represents the cross correlation + matrix + + Returns: + Returns a tf.tensor which represents the cross correlation + matrix with its diagonals as zeros. + """ + + # subtracts diagonals by one and squares them(first part) + c_diff = tf.pow(tf.linalg.diag_part(c) - 1, 2) + + # takes off diagonal, squares it, multiplies with lambda(second part) + off_diag = tf.pow(self.get_off_diag(c), 2) * self.lambda_amt + + # sum first and second parts together + loss = tf.reduce_sum(c_diff) + tf.reduce_sum(off_diag) + + return loss + + def normalize(self, output: tf.Tensor) -> tf.Tensor: + """normalize method. + + Normalizes the model prediction. + + Arguments: + output: the model prediction. + + Returns: + Returns a normalized version of the model prediction. + """ + + return (output - tf.reduce_mean(output, axis=0)) / tf.math.reduce_std( + output, axis=0 + ) + + def cross_corr_matrix(self, z_a_norm: tf.Tensor, z_b_norm: tf.Tensor) -> tf.Tensor: + """cross_corr_matrix method. + + Creates a cross correlation matrix from the predictions. + It transposes the first prediction and multiplies this with + the second, creating a matrix with shape (n_dense_units, n_dense_units). + See build_twin() for more info. Then it divides this with the + batch size. + + Arguments: + z_a_norm: A normalized version of the first prediction. + z_b_norm: A normalized version of the second prediction. + + Returns: + Returns a cross correlation matrix. + """ + return (tf.transpose(z_a_norm) @ z_b_norm) / self.batch_size + + def call(self, z_a: tf.Tensor, z_b: tf.Tensor) -> tf.Tensor: + """call method. + + Makes the cross-correlation loss. Uses the CreateCrossCorr + class to make the cross corr matrix, then finds the loss and + returns it(see cross_corr_matrix_loss()). + + Arguments: + z_a: The prediction of the first set of augmented data. + z_b: the prediction of the second set of augmented data. 
+ + Returns: + Returns a (rank-0) tf.Tensor that represents the loss. + """ + + z_a_norm, z_b_norm = self.normalize(z_a), self.normalize(z_b) + c = self.cross_corr_matrix(z_a_norm, z_b_norm) + loss = self.cross_corr_matrix_loss(c) + return loss + + +""" +## Barlow Twins' Model Architecture +The model has two parts: + +* The encoder network, which is a resnet-34. +* The projector network, which creates the model embeddings. + * This consists of an MLP with 3 dense-batchnorm-relu layers. +""" + +""" +Resnet encoder network implementation: +""" + + +class ResNet34: + """Resnet34 class. + + Responsible for the Resnet 34 architecture. + Modified from + https://www.analyticsvidhya.com/blog/2021/08/how-to-code-your-resnet-from-scratch-in-tensorflow/#h2_2. + https://www.analyticsvidhya.com/blog/2021/08/how-to-code-your-resnet-from-scratch-in-tensorflow/#h2_2. + View their website for more information. + """ + + def identity_block(self, x, filter): + # copy tensor to variable called x_skip + x_skip = x + # Layer 1 + x = tf.keras.layers.Conv2D(filter, (3, 3), padding="same")(x) + x = tf.keras.layers.BatchNormalization(axis=3)(x) + x = tf.keras.layers.Activation("relu")(x) + # Layer 2 + x = tf.keras.layers.Conv2D(filter, (3, 3), padding="same")(x) + x = tf.keras.layers.BatchNormalization(axis=3)(x) + # Add Residue + x = tf.keras.layers.Add()([x, x_skip]) + x = tf.keras.layers.Activation("relu")(x) + return x + + def convolutional_block(self, x, filter): + # copy tensor to variable called x_skip + x_skip = x + # Layer 1 + x = tf.keras.layers.Conv2D(filter, (3, 3), padding="same", strides=(2, 2))(x) + x = tf.keras.layers.BatchNormalization(axis=3)(x) + x = tf.keras.layers.Activation("relu")(x) + # Layer 2 + x = tf.keras.layers.Conv2D(filter, (3, 3), padding="same")(x) + x = tf.keras.layers.BatchNormalization(axis=3)(x) + # Processing Residue with conv(1,1) + x_skip = tf.keras.layers.Conv2D(filter, (1, 1), strides=(2, 2))(x_skip) + # Add Residue + x = tf.keras.layers.Add()([x, x_skip]) + x = tf.keras.layers.Activation("relu")(x) + return x + + def __call__(self, shape=(32, 32, 3)): + # Step 1 (Setup Input Layer) + x_input = tf.keras.layers.Input(shape) + x = tf.keras.layers.ZeroPadding2D((3, 3))(x_input) + # Step 2 (Initial Conv layer along with maxPool) + x = tf.keras.layers.Conv2D(64, kernel_size=7, strides=2, padding="same")(x) + x = tf.keras.layers.BatchNormalization()(x) + x = tf.keras.layers.Activation("relu")(x) + x = tf.keras.layers.MaxPool2D(pool_size=3, strides=2, padding="same")(x) + # Define size of sub-blocks and initial filter size + block_layers = [3, 4, 6, 3] + filter_size = 64 + # Step 3 Add the Resnet Blocks + for i in range(4): + if i == 0: + # For sub-block 1 Residual/Convolutional block not needed + for j in range(block_layers[i]): + x = self.identity_block(x, filter_size) + else: + # One Residual/Convolutional Block followed by Identity blocks + # The filter size will go on increasing by a factor of 2 + filter_size = filter_size * 2 + x = self.convolutional_block(x, filter_size) + for j in range(block_layers[i] - 1): + x = self.identity_block(x, filter_size) + # Step 4 End Dense Network + x = tf.keras.layers.AveragePooling2D((2, 2), padding="same")(x) + x = tf.keras.layers.Flatten()(x) + model = tf.keras.models.Model(inputs=x_input, outputs=x, name="ResNet34") + return model + + +""" +Projector network: +""" + + +def build_twin() -> keras.Model: + """build_twin method. 
+ + Builds a barlow twins model consisting of an encoder(resnet-34) + and a projector, which generates embeddings for the images + + Returns: + returns a barlow twins model + """ + + # number of dense neurons in the projector + n_dense_neurons = 5000 + + # encoder network + resnet = ResNet34()() + last_layer = resnet.layers[-1].output + + # intermediate layers of the projector network + n_layers = 2 + for i in range(n_layers): + dense = tf.keras.layers.Dense(n_dense_neurons, name=f"projector_dense_{i}") + if i == 0: + x = dense(last_layer) + else: + x = dense(x) + x = tf.keras.layers.BatchNormalization(name=f"projector_bn_{i}")(x) + x = tf.keras.layers.ReLU(name=f"projector_relu_{i}")(x) + + x = tf.keras.layers.Dense(n_dense_neurons, name=f"projector_dense_{n_layers}")(x) + + model = keras.Model(resnet.input, x) + return model + + +""" +## Training Loop Model + +See pseudocode for reference. +""" + + +class BarlowModel(keras.Model): + """BarlowModel class. + + BarlowModel class. Responsible for making predictions and handling + gradient descent with the optimizer. + + Attributes: + model: the barlow model architecture. + loss_tracker: the loss metric. + + Methods: + train_step: one train step; do model predictions, loss, and + optimizer step. + metrics: Returns metrics. + """ + + def __init__(self): + super().__init__() + self.model = build_twin() + self.loss_tracker = keras.metrics.Mean(name="loss") + + @property + def metrics(self): + return [self.loss_tracker] + + def train_step(self, batch: tf.Tensor) -> tf.Tensor: + """train_step method. + + Do one train step. Make model predictions, find loss, pass loss to + optimizer, and make optimizer apply gradients. + + Arguments: + batch: one batch of data to be given to the loss function. + + Returns: + Returns a dictionary with the loss metric. + """ + + # get the two augmentations from the batch + y_a, y_b = batch + + with tf.GradientTape() as tape: + # get two versions of predictions + z_a, z_b = self.model(y_a, training=True), self.model(y_b, training=True) + loss = self.loss(z_a, z_b) + + grads_model = tape.gradient(loss, self.model.trainable_variables) + + self.optimizer.apply_gradients(zip(grads_model, self.model.trainable_variables)) + self.loss_tracker.update_state(loss) + + return {"loss": self.loss_tracker.result()} + + +""" +## Model Training + +* Used the LAMB optimizer, instead of ADAM or SGD. +* Similar to the LARS optimizer used in the paper, and lets the model converge much +faster than other methods. +* Expected training time: 1 hour 30 min. Go and eat a snack or take a nap or something. +""" + +# sets up model, optimizer, loss + +bm = BarlowModel() +# chose the LAMB optimizer due to high batch sizes. Converged MUCH faster +# than ADAM or SGD +optimizer = tfa.optimizers.LAMB() +loss = BarlowLoss(BATCH_SIZE) + +bm.compile(optimizer=optimizer, loss=loss) + +# Expected training time: 1 hours 30 min + +history = bm.fit(augment_versions, epochs=160) +plt.plot(history.history["loss"]) +plt.show() + +""" +## Evaluation + +**Linear evaluation:** to evaluate the model's performance, we add +a linear dense layer at the end and freeze the main model's weights, only letting the +dense layer to be tuned. If the model actually learned something, then the accuracy would +be significantly higher than random chance. + +**Accuracy on CIFAR-10** : 64% for this notebook. This is much better than the 10% we get +from random guessing. +""" + +# Approx: 64% accuracy with this barlow twins model. 
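+# Linear probe: the pretrained encoder + projector (`bm.model`) is frozen and only
+# the new softmax head (with L2 regularization) is trained on the labeled data.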
+ +xy_ds = ( + tf.data.Dataset.from_tensor_slices((train_features, train_labels)) + .shuffle(1000) + .batch(BATCH_SIZE, drop_remainder=True) + .prefetch(tf.data.AUTOTUNE) +) + +test_ds = ( + tf.data.Dataset.from_tensor_slices((test_features, test_labels)) + .shuffle(1000) + .batch(BATCH_SIZE, drop_remainder=True) + .prefetch(tf.data.AUTOTUNE) +) + +model = keras.models.Sequential( + [ + bm.model, + keras.layers.Dense( + 10, activation="softmax", kernel_regularizer=keras.regularizers.l2(0.02) + ), + ] +) + +model.layers[0].trainable = False + +linear_optimizer = tfa.optimizers.LAMB() +model.compile( + optimizer=linear_optimizer, + loss="sparse_categorical_crossentropy", + metrics=["accuracy"], +) + +model.fit(xy_ds, epochs=35, validation_data=test_ds) + +""" +## Conclusion + +* Barlow Twins is a simple and concise method for contrastive and self-supervised +learning. +* With this resnet-34 model architecture, we were able to reach 62-64% validation +accuracy. + +## Use-Cases of Barlow-Twins(and contrastive learning in General) + +* Semi-supervised learning: You can see that this model gave a 62-64% boost in accuracy +when it wasn't even trained with the labels. It can be used when you have little labeled +data but a lot of unlabeled data. +* You do barlow twins training on the unlabeled data, and then you do secondary training +with the labeled data. + +## Helpful links + +* [Paper](https://arxiv.org/abs/2103.03230) +* [Original Pytorch Implementation](https://github.com/facebookresearch/barlowtwins) +* [Sayak Paul's Implementation](https://colab.research.google.com/github/sayakpaul/Barlow-Twins-TF/blob/main/Barlow_Twins.ipynb#scrollTo=GlWepkM8_prl). +* Thanks to Sayak Paul for his implementation. It helped me with debugging and +comparisons of accuracy, loss. +* [resnet34 implementation](https://www.analyticsvidhya.com/blog/2021/08/how-to-code-your-resnet-from-scratch-in-tensorflow/#h2_2) + * Thanks to Yashowardhan Shinde for writing the article. + + + +""" diff --git a/knowledge_base/vision/basnet_segmentation.py b/knowledge_base/vision/basnet_segmentation.py new file mode 100644 index 0000000000000000000000000000000000000000..bad2ce373733c0a639a8249818d710403526edd3 --- /dev/null +++ b/knowledge_base/vision/basnet_segmentation.py @@ -0,0 +1,485 @@ +""" +Title: Highly accurate boundaries segmentation using BASNet +Author: [Hamid Ali](https://github.com/hamidriasat) +Date created: 2023/05/30 +Last modified: 2024/10/02 +Description: Boundaries aware segmentation model trained on the DUTS dataset. +Accelerator: GPU +""" + +""" +## Introduction + +Deep semantic segmentation algorithms have improved a lot recently, but still fails to correctly +predict pixels around object boundaries. In this example we implement +**Boundary-Aware Segmentation Network (BASNet)**, using two stage predict and refine +architecture, and a hybrid loss it can predict highly accurate boundaries and fine structures +for image segmentation. + +### References: + +- [Boundary-Aware Segmentation Network for Mobile and Web Applications](https://arxiv.org/abs/2101.04704) +- [BASNet Keras Implementation](https://github.com/hamidriasat/BASNet/tree/basnet_keras) +- [Learning to Detect Salient Objects with Image-level Supervision](https://openaccess.thecvf.com/content_cvpr_2017/html/Wang_Learning_to_Detect_CVPR_2017_paper.html) +""" + +""" +## Download the Data + +We will use the [DUTS-TE](http://saliencydetection.net/duts/) dataset for training. 
It has 5,019 +images but we will use 140 for training and validation to save notebook running time. DUTS is +relatively large salient object segmentation dataset. which contain diversified textures and +structures common to real-world images in both foreground and background. +""" + +import os + +# Because of the use of tf.image.ssim in the loss, +# this example requires TensorFlow. The rest of the code +# is backend-agnostic. +os.environ["KERAS_BACKEND"] = "tensorflow" + +import numpy as np +from glob import glob +import matplotlib.pyplot as plt + +import keras_hub +import tensorflow as tf +import keras +from keras import layers, ops + +keras.config.disable_traceback_filtering() + +""" +## Define Hyperparameters +""" + +IMAGE_SIZE = 288 +BATCH_SIZE = 4 +OUT_CLASSES = 1 +TRAIN_SPLIT_RATIO = 0.90 + +""" +## Create `PyDataset`s + +We will use `load_paths()` to load and split 140 paths into train and validation set, and +convert paths into `PyDataset` object. +""" + +data_dir = keras.utils.get_file( + origin="http://saliencydetection.net/duts/download/DUTS-TE.zip", + extract=True, +) +data_dir = os.path.join(data_dir, "DUTS-TE") + + +def load_paths(path, split_ratio): + images = sorted(glob(os.path.join(path, "DUTS-TE-Image/*")))[:140] + masks = sorted(glob(os.path.join(path, "DUTS-TE-Mask/*")))[:140] + len_ = int(len(images) * split_ratio) + return (images[:len_], masks[:len_]), (images[len_:], masks[len_:]) + + +class Dataset(keras.utils.PyDataset): + def __init__( + self, + image_paths, + mask_paths, + img_size, + out_classes, + batch, + shuffle=True, + **kwargs, + ): + if shuffle: + perm = np.random.permutation(len(image_paths)) + image_paths = [image_paths[i] for i in perm] + mask_paths = [mask_paths[i] for i in perm] + self.image_paths = image_paths + self.mask_paths = mask_paths + self.img_size = img_size + self.out_classes = out_classes + self.batch_size = batch + super().__init__(*kwargs) + + def __len__(self): + return len(self.image_paths) // self.batch_size + + def __getitem__(self, idx): + batch_x, batch_y = [], [] + for i in range(idx * self.batch_size, (idx + 1) * self.batch_size): + x, y = self.preprocess( + self.image_paths[i], + self.mask_paths[i], + self.img_size, + ) + batch_x.append(x) + batch_y.append(y) + batch_x = np.stack(batch_x, axis=0) + batch_y = np.stack(batch_y, axis=0) + return batch_x, batch_y + + def read_image(self, path, size, mode): + x = keras.utils.load_img(path, target_size=size, color_mode=mode) + x = keras.utils.img_to_array(x) + x = (x / 255.0).astype(np.float32) + return x + + def preprocess(self, x_batch, y_batch, img_size): + images = self.read_image(x_batch, (img_size, img_size), mode="rgb") # image + masks = self.read_image(y_batch, (img_size, img_size), mode="grayscale") # mask + return images, masks + + +train_paths, val_paths = load_paths(data_dir, TRAIN_SPLIT_RATIO) + +train_dataset = Dataset( + train_paths[0], train_paths[1], IMAGE_SIZE, OUT_CLASSES, BATCH_SIZE, shuffle=True +) +val_dataset = Dataset( + val_paths[0], val_paths[1], IMAGE_SIZE, OUT_CLASSES, BATCH_SIZE, shuffle=False +) + +""" +## Visualize Data +""" + + +def display(display_list): + title = ["Input Image", "True Mask", "Predicted Mask"] + + for i in range(len(display_list)): + plt.subplot(1, len(display_list), i + 1) + plt.title(title[i]) + plt.imshow(keras.utils.array_to_img(display_list[i]), cmap="gray") + plt.axis("off") + plt.show() + + +for image, mask in val_dataset: + display([image[0], mask[0]]) + break + +""" +## Analyze Mask + +Lets print unique values of above 
displayed mask. You can see despite belonging to one class, it's +intensity is changing between low(0) to high(255). This variation in intensity makes it hard for +network to generate good segmentation map for **salient or camouflaged object segmentation**. +Because of its Residual Refined Module (RMs), BASNet is good in generating highly accurate +boundaries and fine structures. +""" + +print(f"Unique values count: {len(np.unique((mask[0] * 255)))}") +print("Unique values:") +print(np.unique((mask[0] * 255)).astype(int)) + +""" +## Building the BASNet Model + +BASNet comprises of a predict-refine architecture and a hybrid loss. The predict-refine +architecture consists of a densely supervised encoder-decoder network and a residual refinement +module, which are respectively used to predict and refine a segmentation probability map. + +![](https://i.imgur.com/8jaZ2qs.png) +""" + + +def basic_block(x_input, filters, stride=1, down_sample=None, activation=None): + """Creates a residual(identity) block with two 3*3 convolutions.""" + residual = x_input + + x = layers.Conv2D(filters, (3, 3), strides=stride, padding="same", use_bias=False)( + x_input + ) + x = layers.BatchNormalization()(x) + x = layers.Activation("relu")(x) + + x = layers.Conv2D(filters, (3, 3), strides=(1, 1), padding="same", use_bias=False)( + x + ) + x = layers.BatchNormalization()(x) + + if down_sample is not None: + residual = down_sample + + x = layers.Add()([x, residual]) + + if activation is not None: + x = layers.Activation(activation)(x) + + return x + + +def convolution_block(x_input, filters, dilation=1): + """Apply convolution + batch normalization + relu layer.""" + x = layers.Conv2D(filters, (3, 3), padding="same", dilation_rate=dilation)(x_input) + x = layers.BatchNormalization()(x) + return layers.Activation("relu")(x) + + +def segmentation_head(x_input, out_classes, final_size): + """Map each decoder stage output to model output classes.""" + x = layers.Conv2D(out_classes, kernel_size=(3, 3), padding="same")(x_input) + + if final_size is not None: + x = layers.Resizing(final_size[0], final_size[1])(x) + + return x + + +def get_resnet_block(resnet, block_num): + """Extract and return a ResNet-34 block.""" + extractor_levels = ["P2", "P3", "P4", "P5"] + num_blocks = resnet.stackwise_num_blocks + if block_num == 0: + x = resnet.get_layer("pool1_pool").output + else: + x = resnet.pyramid_outputs[extractor_levels[block_num - 1]] + y = resnet.get_layer(f"stack{block_num}_block{num_blocks[block_num]-1}_add").output + return keras.models.Model( + inputs=x, + outputs=y, + name=f"resnet_block{block_num + 1}", + ) + + +""" +## Prediction Module + +Prediction module is a heavy encoder decoder structure like U-Net. The encoder includes an input +convolutional layer and six stages. First four are adopted from ResNet-34 and rest are basic +res-blocks. Since first convolution and pooling layer of ResNet-34 is skipped so we will use +`get_resnet_block()` to extract first four blocks. Both bridge and decoder uses three +convolutional layers with side outputs. The module produces seven segmentation probability +maps during training, with the last one considered the final output. 
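+
+As a quick sanity check (not part of the original example), you can count the side outputs
+once `basnet_predict()` below has been defined:
+
+```python
+predict_module = basnet_predict([IMAGE_SIZE, IMAGE_SIZE, 3], OUT_CLASSES)
+# Six decoder-stage maps plus the bridge map.
+print(len(predict_module.outputs))  # -> 7
+```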
+""" + + +def basnet_predict(input_shape, out_classes): + """BASNet Prediction Module, it outputs coarse label map.""" + filters = 64 + num_stages = 6 + + x_input = layers.Input(input_shape) + + # -------------Encoder-------------- + x = layers.Conv2D(filters, kernel_size=(3, 3), padding="same")(x_input) + + resnet = keras_hub.models.ResNetBackbone( + input_conv_filters=[64], + input_conv_kernel_sizes=[7], + stackwise_num_filters=[64, 128, 256, 512], + stackwise_num_blocks=[3, 4, 6, 3], + stackwise_num_strides=[1, 2, 2, 2], + block_type="basic_block", + ) + + encoder_blocks = [] + for i in range(num_stages): + if i < 4: # First four stages are adopted from ResNet-34 blocks. + x = get_resnet_block(resnet, i)(x) + encoder_blocks.append(x) + x = layers.Activation("relu")(x) + else: # Last 2 stages consist of three basic resnet blocks. + x = layers.MaxPool2D(pool_size=(2, 2), strides=(2, 2))(x) + x = basic_block(x, filters=filters * 8, activation="relu") + x = basic_block(x, filters=filters * 8, activation="relu") + x = basic_block(x, filters=filters * 8, activation="relu") + encoder_blocks.append(x) + + # -------------Bridge------------- + x = convolution_block(x, filters=filters * 8, dilation=2) + x = convolution_block(x, filters=filters * 8, dilation=2) + x = convolution_block(x, filters=filters * 8, dilation=2) + encoder_blocks.append(x) + + # -------------Decoder------------- + decoder_blocks = [] + for i in reversed(range(num_stages)): + if i != (num_stages - 1): # Except first, scale other decoder stages. + shape = x.shape + x = layers.Resizing(shape[1] * 2, shape[2] * 2)(x) + + x = layers.concatenate([encoder_blocks[i], x], axis=-1) + x = convolution_block(x, filters=filters * 8) + x = convolution_block(x, filters=filters * 8) + x = convolution_block(x, filters=filters * 8) + decoder_blocks.append(x) + + decoder_blocks.reverse() # Change order from last to first decoder stage. + decoder_blocks.append(encoder_blocks[-1]) # Copy bridge to decoder. + + # -------------Side Outputs-------------- + decoder_blocks = [ + segmentation_head(decoder_block, out_classes, input_shape[:2]) + for decoder_block in decoder_blocks + ] + + return keras.models.Model(inputs=x_input, outputs=decoder_blocks) + + +""" +## Residual Refinement Module + +Refinement Modules (RMs), designed as a residual block aim to refines the coarse(blurry and noisy +boundaries) segmentation maps generated by prediction module. Similar to prediction module it's +also an encode decoder structure but with light weight 4 stages, each containing one +`convolutional block()` init. At the end it adds both coarse and residual output to generate +refined output. 
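+
+Conceptually, the two modules chain like this (a sketch using the functions defined in this
+example; the `BASNet` class further below wires them together for real):
+
+```python
+predict_module = basnet_predict([IMAGE_SIZE, IMAGE_SIZE, 3], OUT_CLASSES)  # coarse maps
+refine_module = basnet_rrm(predict_module, OUT_CLASSES)  # refines the first coarse map
+```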
+""" + + +def basnet_rrm(base_model, out_classes): + """BASNet Residual Refinement Module(RRM) module, output fine label map.""" + num_stages = 4 + filters = 64 + + x_input = base_model.output[0] + + # -------------Encoder-------------- + x = layers.Conv2D(filters, kernel_size=(3, 3), padding="same")(x_input) + + encoder_blocks = [] + for _ in range(num_stages): + x = convolution_block(x, filters=filters) + encoder_blocks.append(x) + x = layers.MaxPool2D(pool_size=(2, 2), strides=(2, 2))(x) + + # -------------Bridge-------------- + x = convolution_block(x, filters=filters) + + # -------------Decoder-------------- + for i in reversed(range(num_stages)): + shape = x.shape + x = layers.Resizing(shape[1] * 2, shape[2] * 2)(x) + x = layers.concatenate([encoder_blocks[i], x], axis=-1) + x = convolution_block(x, filters=filters) + + x = segmentation_head(x, out_classes, None) # Segmentation head. + + # ------------- refined = coarse + residual + x = layers.Add()([x_input, x]) # Add prediction + refinement output + + return keras.models.Model(inputs=[base_model.input], outputs=[x]) + + +""" +## Combine Predict and Refinement Module +""" + + +class BASNet(keras.Model): + def __init__(self, input_shape, out_classes): + """BASNet, it's a combination of two modules + Prediction Module and Residual Refinement Module(RRM).""" + + # Prediction model. + predict_model = basnet_predict(input_shape, out_classes) + # Refinement model. + refine_model = basnet_rrm(predict_model, out_classes) + + output = refine_model.outputs # Combine outputs. + output.extend(predict_model.output) + + # Activations. + output = [layers.Activation("sigmoid")(x) for x in output] + super().__init__(inputs=predict_model.input, outputs=output) + + self.smooth = 1.0e-9 + # Binary Cross Entropy loss. + self.cross_entropy_loss = keras.losses.BinaryCrossentropy() + # Structural Similarity Index value. + self.ssim_value = tf.image.ssim + # Jaccard / IoU loss. + self.iou_value = self.calculate_iou + + def calculate_iou( + self, + y_true, + y_pred, + ): + """Calculate intersection over union (IoU) between images.""" + intersection = ops.sum(ops.abs(y_true * y_pred), axis=[1, 2, 3]) + union = ops.sum(y_true, [1, 2, 3]) + ops.sum(y_pred, [1, 2, 3]) + union = union - intersection + return ops.mean((intersection + self.smooth) / (union + self.smooth), axis=0) + + def compute_loss(self, x, y_true, y_pred, sample_weight=None, training=False): + total = 0.0 + for y_pred_i in y_pred: # y_pred = refine_model.outputs + predict_model.output + cross_entropy_loss = self.cross_entropy_loss(y_true, y_pred_i) + + ssim_value = self.ssim_value(y_true, y_pred, max_val=1) + ssim_loss = ops.mean(1 - ssim_value + self.smooth, axis=0) + + iou_value = self.iou_value(y_true, y_pred) + iou_loss = 1 - iou_value + + # Add all three losses. + total += cross_entropy_loss + ssim_loss + iou_loss + return total + + +""" +## Hybrid Loss + +Another important feature of BASNet is its hybrid loss function, which is a combination of +binary cross entropy, structural similarity and intersection-over-union losses, which guide +the network to learn three-level (i.e., pixel, patch and map level) hierarchy representations. +""" + + +basnet_model = BASNet( + input_shape=[IMAGE_SIZE, IMAGE_SIZE, 3], out_classes=OUT_CLASSES +) # Create model. +basnet_model.summary() # Show model summary. + +optimizer = keras.optimizers.Adam(learning_rate=1e-4, epsilon=1e-8) +# Compile model. 
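+# Note: no `loss` argument is passed to `compile()` because `BASNet.compute_loss()` above
+# already implements the hybrid BCE + SSIM + IoU loss.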
+basnet_model.compile( + optimizer=optimizer, + metrics=[keras.metrics.MeanAbsoluteError(name="mae") for _ in basnet_model.outputs], +) + +""" +### Train the Model +""" + +basnet_model.fit(train_dataset, validation_data=val_dataset, epochs=1) + +""" +### Visualize Predictions + +In paper BASNet was trained on DUTS-TR dataset, which has 10553 images. Model was trained for 400k +iterations with a batch size of eight and without a validation dataset. After training model was +evaluated on DUTS-TE dataset and achieved a mean absolute error of `0.042`. + +Since BASNet is a deep model and cannot be trained in a short amount of time which is a +requirement for keras example notebook, so we will load pretrained weights from [here](https://github.com/hamidriasat/BASNet/tree/basnet_keras) +to show model prediction. Due to computer power limitation this model was trained for 120k +iterations but it still demonstrates its capabilities. For further details about +trainings parameters please check given link. +""" + +import gdown + +gdown.download(id="1OWKouuAQ7XpXZbWA3mmxDPrFGW71Axrg", output="basnet_weights.h5") + + +def normalize_output(prediction): + max_value = np.max(prediction) + min_value = np.min(prediction) + return (prediction - min_value) / (max_value - min_value) + + +# Load weights. +basnet_model.load_weights("./basnet_weights.h5") + +""" +### Make Predictions +""" + +for (image, mask), _ in zip(val_dataset, range(1)): + pred_mask = basnet_model.predict(image) + display([image[0], mask[0], normalize_output(pred_mask[0][0])]) diff --git a/knowledge_base/vision/bit.py b/knowledge_base/vision/bit.py new file mode 100644 index 0000000000000000000000000000000000000000..3b7d5ffa17e1a6121c35433531c4ee864875c015 --- /dev/null +++ b/knowledge_base/vision/bit.py @@ -0,0 +1,311 @@ +""" +Title: Image Classification using BigTransfer (BiT) +Author: [Sayan Nath](https://twitter.com/sayannath2350) +Date created: 2021/09/24 +Last modified: 2024/01/03 +Description: BigTransfer (BiT) State-of-the-art transfer learning for image classification. +Accelerator: GPU +Converted to Keras 3 by: [Sitam Meur](https://github.com/sitamgithub-MSIT) +""" + +""" +## Introduction + +BigTransfer (also known as BiT) is a state-of-the-art transfer learning method for image +classification. Transfer of pre-trained representations improves sample efficiency and +simplifies hyperparameter tuning when training deep neural networks for vision. BiT +revisit the paradigm of pre-training on large supervised datasets and fine-tuning the +model on a target task. The importance of appropriately choosing normalization layers and +scaling the architecture capacity as the amount of pre-training data increases. + +BigTransfer(BiT) is trained on public datasets, along with code in +[TF2, Jax and Pytorch](https://github.com/google-research/big_transfer). This will help anyone to reach +state of the art performance on their task of interest, even with just a handful of +labeled images per class. + +You can find BiT models pre-trained on +[ImageNet](https://image-net.org/challenges/LSVRC/2012/index) and ImageNet-21k in +[TFHub](https://tfhub.dev/google/collections/bit/1) as TensorFlow2 SavedModels that you +can use easily as Keras Layers. There are a variety of sizes ranging from a standard +ResNet50 to a ResNet152x4 (152 layers deep, 4x wider than a typical ResNet50) for users +with larger computational and memory budgets but higher accuracy requirements. 
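+
+For instance, one of these SavedModels can be dropped into a Keras model as a layer roughly
+like this (a sketch of the `hub.KerasLayer` route; this example instead loads the module with
+`hub.load()` further below):
+
+```python
+import tensorflow_hub as hub
+
+bit_layer = hub.KerasLayer("https://tfhub.dev/google/bit/m-r50x1/1", trainable=False)
+```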
+ +![](https://i.imgur.com/XeWVfe7.jpeg) +Figure: The x-axis shows the number of images used per class, ranging from 1 to the full +dataset. On the plots on the left, the curve in blue above is our BiT-L model, whereas +the curve below is a ResNet-50 pre-trained on ImageNet (ILSVRC-2012). +""" + +""" +## Setup +""" + +import os + +os.environ["KERAS_BACKEND"] = "tensorflow" +import numpy as np +import pandas as pd +import matplotlib.pyplot as plt + +import keras +from keras import ops +import tensorflow as tf +import tensorflow_hub as hub +import tensorflow_datasets as tfds + +tfds.disable_progress_bar() + +SEEDS = 42 + +keras.utils.set_random_seed(SEEDS) + +""" +## Gather Flower Dataset +""" + +train_ds, validation_ds = tfds.load( + "tf_flowers", + split=["train[:85%]", "train[85%:]"], + as_supervised=True, +) + +""" +## Visualise the dataset +""" + +plt.figure(figsize=(10, 10)) +for i, (image, label) in enumerate(train_ds.take(9)): + ax = plt.subplot(3, 3, i + 1) + plt.imshow(image) + plt.title(int(label)) + plt.axis("off") + +""" +## Define hyperparameters +""" + +RESIZE_TO = 384 +CROP_TO = 224 +BATCH_SIZE = 64 +STEPS_PER_EPOCH = 10 +AUTO = tf.data.AUTOTUNE # optimise the pipeline performance +NUM_CLASSES = 5 # number of classes +SCHEDULE_LENGTH = ( + 500 # we will train on lower resolution images and will still attain good results +) +SCHEDULE_BOUNDARIES = [ + 200, + 300, + 400, +] # more the dataset size the schedule length increase + +""" +The hyperparamteres like `SCHEDULE_LENGTH` and `SCHEDULE_BOUNDARIES` are determined based +on empirical results. The method has been explained in the [original +paper](https://arxiv.org/abs/1912.11370) and in their [Google AI Blog +Post](https://ai.googleblog.com/2020/05/open-sourcing-bit-exploring-large-scale.html). + +The `SCHEDULE_LENGTH` is aslo determined whether to use [MixUp +Augmentation](https://arxiv.org/abs/1710.09412) or not. You can also find an easy MixUp +Implementation in [Keras Coding Examples](https://keras.io/examples/vision/mixup/). 
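+
+With the values defined above (`SCHEDULE_LENGTH = 500`, `BATCH_SIZE = 64`,
+`STEPS_PER_EPOCH = 10`), the `SCHEDULE_LENGTH * 512 / BATCH_SIZE` rescaling in the next
+section works out to 500 * 512 / 64 = 4000 optimization steps, which `model.fit` later runs
+as 400 epochs of 10 steps each.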
+ +![](https://i.imgur.com/oSaIBYZ.jpeg) +""" + +""" +## Define preprocessing helper functions +""" + +SCHEDULE_LENGTH = SCHEDULE_LENGTH * 512 / BATCH_SIZE + +random_flip = keras.layers.RandomFlip("horizontal") +random_crop = keras.layers.RandomCrop(CROP_TO, CROP_TO) + + +def preprocess_train(image, label): + image = random_flip(image) + image = ops.image.resize(image, (RESIZE_TO, RESIZE_TO)) + image = random_crop(image) + image = image / 255.0 + return (image, label) + + +def preprocess_test(image, label): + image = ops.image.resize(image, (RESIZE_TO, RESIZE_TO)) + image = ops.cast(image, dtype="float32") + image = image / 255.0 + return (image, label) + + +DATASET_NUM_TRAIN_EXAMPLES = train_ds.cardinality().numpy() + +repeat_count = int( + SCHEDULE_LENGTH * BATCH_SIZE / DATASET_NUM_TRAIN_EXAMPLES * STEPS_PER_EPOCH +) +repeat_count += 50 + 1 # To ensure at least there are 50 epochs of training + +""" +## Define the data pipeline +""" + +# Training pipeline +pipeline_train = ( + train_ds.shuffle(10000) + .repeat(repeat_count) # Repeat dataset_size / num_steps + .map(preprocess_train, num_parallel_calls=AUTO) + .batch(BATCH_SIZE) + .prefetch(AUTO) +) + +# Validation pipeline +pipeline_validation = ( + validation_ds.map(preprocess_test, num_parallel_calls=AUTO) + .batch(BATCH_SIZE) + .prefetch(AUTO) +) + +""" +## Visualise the training samples +""" + +image_batch, label_batch = next(iter(pipeline_train)) + +plt.figure(figsize=(10, 10)) +for n in range(25): + ax = plt.subplot(5, 5, n + 1) + plt.imshow(image_batch[n]) + plt.title(label_batch[n].numpy()) + plt.axis("off") + +""" +## Load pretrained TF-Hub model into a `KerasLayer` +""" + +bit_model_url = "https://tfhub.dev/google/bit/m-r50x1/1" +bit_module = hub.load(bit_model_url) + +""" +## Create BigTransfer (BiT) model + +To create the new model, we: + +1. Cut off the BiT modelโ€™s original head. This leaves us with the โ€œpre-logitsโ€ output. +We do not have to do this if we use the โ€˜feature extractorโ€™ models (i.e. all those in +subdirectories titled `feature_vectors`), since for those models the head has already +been cut off. + +2. Add a new head with the number of outputs equal to the number of classes of our new +task. Note that it is important that we initialise the head to all zeroes. +""" + + +class MyBiTModel(keras.Model): + def __init__(self, num_classes, module, **kwargs): + super().__init__(**kwargs) + + self.num_classes = num_classes + self.head = keras.layers.Dense(num_classes, kernel_initializer="zeros") + self.bit_model = module + + def call(self, images): + bit_embedding = self.bit_model(images) + return self.head(bit_embedding) + + +model = MyBiTModel(num_classes=NUM_CLASSES, module=bit_module) + +""" +## Define optimizer and loss +""" + +learning_rate = 0.003 * BATCH_SIZE / 512 + +# Decay learning rate by a factor of 10 at SCHEDULE_BOUNDARIES. 
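+# With BATCH_SIZE = 64, the base learning rate above works out to 0.003 * 64 / 512 = 3.75e-4.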
+lr_schedule = keras.optimizers.schedules.PiecewiseConstantDecay( + boundaries=SCHEDULE_BOUNDARIES, + values=[ + learning_rate, + learning_rate * 0.1, + learning_rate * 0.01, + learning_rate * 0.001, + ], +) +optimizer = keras.optimizers.SGD(learning_rate=lr_schedule, momentum=0.9) + +loss_fn = keras.losses.SparseCategoricalCrossentropy(from_logits=True) + +""" +## Compile the model +""" + +model.compile(optimizer=optimizer, loss=loss_fn, metrics=["accuracy"]) + +""" +## Set up callbacks +""" + +train_callbacks = [ + keras.callbacks.EarlyStopping( + monitor="val_accuracy", patience=2, restore_best_weights=True + ) +] + +""" +## Train the model +""" + +history = model.fit( + pipeline_train, + batch_size=BATCH_SIZE, + epochs=int(SCHEDULE_LENGTH / STEPS_PER_EPOCH), + steps_per_epoch=STEPS_PER_EPOCH, + validation_data=pipeline_validation, + callbacks=train_callbacks, +) + +""" +## Plot the training and validation metrics +""" + + +def plot_hist(hist): + plt.plot(hist.history["accuracy"]) + plt.plot(hist.history["val_accuracy"]) + plt.plot(hist.history["loss"]) + plt.plot(hist.history["val_loss"]) + plt.title("Training Progress") + plt.ylabel("Accuracy/Loss") + plt.xlabel("Epochs") + plt.legend(["train_acc", "val_acc", "train_loss", "val_loss"], loc="upper left") + plt.show() + + +plot_hist(history) + +""" +## Evaluate the model +""" + +accuracy = model.evaluate(pipeline_validation)[1] * 100 +print("Accuracy: {:.2f}%".format(accuracy)) + +""" +## Conclusion + +BiT performs well across a surprisingly wide range of data regimes +-- from 1 example per class to 1M total examples. BiT achieves 87.5% top-1 accuracy on +ILSVRC-2012, 99.4% on CIFAR-10, and 76.3% on the 19 task Visual Task Adaptation Benchmark +(VTAB). On small datasets, BiT attains 76.8% on ILSVRC-2012 with 10 examples per class, +and 97.0% on CIFAR-10 with 10 examples per class. + +![](https://i.imgur.com/b1Lw5fz.png) + +You can experiment further with the BigTransfer Method by following the +[original paper](https://arxiv.org/abs/1912.11370). + + +**Example available on HuggingFace** +| Trained Model | Demo | +| :--: | :--: | +| [![Generic badge](https://img.shields.io/badge/%F0%9F%A4%97%20Model-bit-black.svg)](https://huggingface.co/keras-io/bit) | [![Generic badge](https://img.shields.io/badge/%F0%9F%A4%97%20Spaces-bit-black.svg)](https://huggingface.co/spaces/keras-io/siamese-contrastive) | +""" diff --git a/knowledge_base/vision/cait.py b/knowledge_base/vision/cait.py new file mode 100644 index 0000000000000000000000000000000000000000..39502d71beae009f2beb313f6cedb99d3d1a7539 --- /dev/null +++ b/knowledge_base/vision/cait.py @@ -0,0 +1,1031 @@ +""" +Title: Class Attention Image Transformers with LayerScale +Author: [Sayak Paul](https://twitter.com/RisingSayak) +Date created: 2022/09/19 +Last modified: 2022/11/21 +Description: Implementing an image transformer equipped with Class Attention and LayerScale. +Accelerator: None +""" + +""" + +## Introduction + +In this tutorial, we implement the CaiT (Class-Attention in Image Transformers) +proposed in [Going deeper with Image Transformers](https://arxiv.org/abs/2103.17239) by +Touvron et al. Depth scaling, i.e. increasing the model depth for obtaining better +performance and generalization has been quite successful for convolutional neural +networks ([Tan et al.](https://arxiv.org/abs/1905.11946), +[Dollรกr et al.](https://arxiv.org/abs/2103.06877), for example). 
But applying +the same model scaling principles to +Vision Transformers ([Dosovitskiy et al.](https://arxiv.org/abs/2010.11929)) doesn't +translate equally well -- their performance gets saturated quickly with depth scaling. +Note that one assumption here is that the underlying pre-training dataset is +always kept fixed when performing model scaling. + +In the CaiT paper, the authors investigate this phenomenon and propose modifications to +the vanilla ViT (Vision Transformers) architecture to mitigate this problem. + +The tutorial is structured like so: + +* Implementation of the individual blocks of CaiT +* Collating all the blocks to create the CaiT model +* Loading a pre-trained CaiT model +* Obtaining prediction results +* Visualization of the different attention layers of CaiT + +The readers are assumed to be familiar with Vision Transformers already. Here is +an implementation of Vision Transformers in Keras: +[Image classification with Vision Transformer](https://keras.io/examples/vision/image_classification_with_vision_transformer/). +""" + +""" +## Imports +""" + +import os + +os.environ["KERAS_BACKEND"] = "tensorflow" + +import io +import typing +from urllib.request import urlopen + +import matplotlib.pyplot as plt +import numpy as np +import PIL +import keras +from keras import layers +from keras import ops + +""" +## The LayerScale layer + +We begin by implementing a **LayerScale** layer which is one of the two modifications +proposed in the CaiT paper. + +When increasing the depth of the ViT models, they meet with optimization instability and +eventually don't converge. The residual connections within each Transformer block +introduce information bottleneck. When there is an increased amount of depth, this +bottleneck can quickly explode and deviate the optimization pathway for the underlying +model. + +The following equations denote where residual connections are added within a Transformer +block: + +
+$$x_{l}^{\prime} = x_{l} + \mathrm{SA}(\eta(x_{l}))$$
+$$x_{l+1} = x_{l}^{\prime} + \mathrm{FFN}(\eta(x_{l}^{\prime}))$$
+ +where, **SA** stands for self-attention, **FFN** stands for feed-forward network, and +**eta** denotes the LayerNorm operator ([Ba et al.](https://arxiv.org/abs/1607.06450)). + +LayerScale is formally implemented like so: + +
+$$x_{l}^{\prime} = x_{l} + \mathrm{diag}(\lambda_{l,1}, \ldots, \lambda_{l,d}) \times \mathrm{SA}(\eta(x_{l}))$$
+$$x_{l+1} = x_{l}^{\prime} + \mathrm{diag}(\lambda_{l,1}^{\prime}, \ldots, \lambda_{l,d}^{\prime}) \times \mathrm{FFN}(\eta(x_{l}^{\prime}))$$
+ +where, the lambdas are learnable parameters and are initialized with a very small value +({0.1, 1e-5, 1e-6}). **diag** represents a diagonal matrix. + +Intuitively, LayerScale helps control the contribution of the residual branches. The +learnable parameters of LayerScale are initialized to a small value to let the branches +act like identity functions and then let them figure out the degrees of interactions +during the training. The diagonal matrix additionally helps control the contributions +of the individual dimensions of the residual inputs as it is applied on a per-channel +basis. + +The practical implementation of LayerScale is simpler than it might sound. +""" + + +class LayerScale(layers.Layer): + """LayerScale as introduced in CaiT: https://arxiv.org/abs/2103.17239. + + Args: + init_values (float): value to initialize the diagonal matrix of LayerScale. + projection_dim (int): projection dimension used in LayerScale. + """ + + def __init__(self, init_values: float, projection_dim: int, **kwargs): + super().__init__(**kwargs) + self.gamma = self.add_weight( + shape=(projection_dim,), + initializer=keras.initializers.Constant(init_values), + ) + + def call(self, x, training=False): + return x * self.gamma + + +""" +## Stochastic depth layer + +Since its introduction ([Huang et al.](https://arxiv.org/abs/1603.09382)), Stochastic +Depth has become a favorite component in almost all modern neural network architectures. +CaiT is no exception. Discussing Stochastic Depth is out of scope for this notebook. You +can refer to [this resource](https://paperswithcode.com/method/stochastic-depth) in case +you need a refresher. +""" + + +class StochasticDepth(layers.Layer): + """Stochastic Depth layer (https://arxiv.org/abs/1603.09382). + + Reference: + https://github.com/rwightman/pytorch-image-models + """ + + def __init__(self, drop_prob: float, **kwargs): + super().__init__(**kwargs) + self.drop_prob = drop_prob + self.seed_generator = keras.random.SeedGenerator(1337) + + def call(self, x, training=False): + if training: + keep_prob = 1 - self.drop_prob + shape = (ops.shape(x)[0],) + (1,) * (len(x.shape) - 1) + random_tensor = keep_prob + ops.random.uniform( + shape, minval=0, maxval=1, seed=self.seed_generator + ) + random_tensor = ops.floor(random_tensor) + return (x / keep_prob) * random_tensor + return x + + +""" +## Class attention + +The vanilla ViT uses self-attention (SA) layers for modelling how the image patches and +the _learnable_ CLS token interact with each other. The CaiT authors propose to decouple +the attention layers responsible for attending to the image patches and the CLS tokens. + +When using ViTs for any discriminative tasks (classification, for example), we usually +take the representations belonging to the CLS token and then pass them to the +task-specific heads. This is as opposed to using something like global average pooling as +is typically done in convolutional neural networks. + +The interactions between the CLS token and other image patches are processed uniformly +through self-attention layers. As the CaiT authors point out, this setup has got an +entangled effect. On one hand, the self-attention layers are responsible for modelling +the image patches. On the other hand, they're also responsible for summarizing the +modelled information via the CLS token so that it's useful for the learning objective. + +To help disentangle these two things, the authors propose to: + +* Introduce the CLS token at a later stage in the network. 
+* Model the interaction between the CLS token and the representations related to the +image patches through a separate set of attention layers. The authors call this **Class +Attention** (CA). + +The figure below (taken from the original paper) depicts this idea: + +
+(Figure from the CaiT paper: the earlier layers are self-attention blocks over the patch
+embeddings only; the CLS token is inserted afterwards and processed by class-attention blocks.)
+ +This is achieved by treating the CLS token embeddings as the queries in the CA layers. +CLS token embeddings and the image patch embeddings are fed as keys as well values. + +**Note** that "embeddings" and "representations" have been used interchangeably here. +""" + + +class ClassAttention(layers.Layer): + """Class attention as proposed in CaiT: https://arxiv.org/abs/2103.17239. + + Args: + projection_dim (int): projection dimension for the query, key, and value + of attention. + num_heads (int): number of attention heads. + dropout_rate (float): dropout rate to be used for dropout in the attention + scores as well as the final projected outputs. + """ + + def __init__( + self, projection_dim: int, num_heads: int, dropout_rate: float, **kwargs + ): + super().__init__(**kwargs) + self.num_heads = num_heads + + head_dim = projection_dim // num_heads + self.scale = head_dim**-0.5 + + self.q = layers.Dense(projection_dim) + self.k = layers.Dense(projection_dim) + self.v = layers.Dense(projection_dim) + self.attn_drop = layers.Dropout(dropout_rate) + self.proj = layers.Dense(projection_dim) + self.proj_drop = layers.Dropout(dropout_rate) + + def call(self, x, training=False): + batch_size, num_patches, num_channels = ( + ops.shape(x)[0], + ops.shape(x)[1], + ops.shape(x)[2], + ) + + # Query projection. `cls_token` embeddings are queries. + q = ops.expand_dims(self.q(x[:, 0]), axis=1) + q = ops.reshape( + q, (batch_size, 1, self.num_heads, num_channels // self.num_heads) + ) # Shape: (batch_size, 1, num_heads, dimension_per_head) + q = ops.transpose(q, axes=[0, 2, 1, 3]) + scale = ops.cast(self.scale, dtype=q.dtype) + q = q * scale + + # Key projection. Patch embeddings as well the cls embedding are used as keys. + k = self.k(x) + k = ops.reshape( + k, (batch_size, num_patches, self.num_heads, num_channels // self.num_heads) + ) # Shape: (batch_size, num_tokens, num_heads, dimension_per_head) + k = ops.transpose(k, axes=[0, 2, 3, 1]) + + # Value projection. Patch embeddings as well the cls embedding are used as values. + v = self.v(x) + v = ops.reshape( + v, (batch_size, num_patches, self.num_heads, num_channels // self.num_heads) + ) + v = ops.transpose(v, axes=[0, 2, 1, 3]) + + # Calculate attention scores between cls_token embedding and patch embeddings. + attn = ops.matmul(q, k) + attn = ops.nn.softmax(attn, axis=-1) + attn = self.attn_drop(attn, training=training) + + x_cls = ops.matmul(attn, v) + x_cls = ops.transpose(x_cls, axes=[0, 2, 1, 3]) + x_cls = ops.reshape(x_cls, (batch_size, 1, num_channels)) + x_cls = self.proj(x_cls) + x_cls = self.proj_drop(x_cls, training=training) + + return x_cls, attn + + +""" +## Talking Head Attention + +The CaiT authors use the Talking Head attention +([Shazeer et al.](https://arxiv.org/abs/2003.02436)) +instead of the vanilla scaled dot-product multi-head attention used in +the original Transformer paper +([Vaswani et al.](https://papers.nips.cc/paper/7181-attention-is-all-you-need)). +They introduce two linear projections before and after the softmax +operations for obtaining better results. + +For a more rigorous treatment of the Talking Head attention and the vanilla attention +mechanisms, please refer to their respective papers (linked above). +""" + + +class TalkingHeadAttention(layers.Layer): + """Talking-head attention as proposed in CaiT: https://arxiv.org/abs/2003.02436. + + Args: + projection_dim (int): projection dimension for the query, key, and value + of attention. + num_heads (int): number of attention heads. 
+ dropout_rate (float): dropout rate to be used for dropout in the attention + scores as well as the final projected outputs. + """ + + def __init__( + self, projection_dim: int, num_heads: int, dropout_rate: float, **kwargs + ): + super().__init__(**kwargs) + + self.num_heads = num_heads + + head_dim = projection_dim // self.num_heads + + self.scale = head_dim**-0.5 + + self.qkv = layers.Dense(projection_dim * 3) + self.attn_drop = layers.Dropout(dropout_rate) + + self.proj = layers.Dense(projection_dim) + + self.proj_l = layers.Dense(self.num_heads) + self.proj_w = layers.Dense(self.num_heads) + + self.proj_drop = layers.Dropout(dropout_rate) + + def call(self, x, training=False): + B, N, C = ops.shape(x)[0], ops.shape(x)[1], ops.shape(x)[2] + + # Project the inputs all at once. + qkv = self.qkv(x) + + # Reshape the projected output so that they're segregated in terms of + # query, key, and value projections. + qkv = ops.reshape(qkv, (B, N, 3, self.num_heads, C // self.num_heads)) + + # Transpose so that the `num_heads` becomes the leading dimensions. + # Helps to better segregate the representation sub-spaces. + qkv = ops.transpose(qkv, axes=[2, 0, 3, 1, 4]) + scale = ops.cast(self.scale, dtype=qkv.dtype) + q, k, v = qkv[0] * scale, qkv[1], qkv[2] + + # Obtain the raw attention scores. + attn = ops.matmul(q, ops.transpose(k, axes=[0, 1, 3, 2])) + + # Linear projection of the similarities between the query and key projections. + attn = self.proj_l(ops.transpose(attn, axes=[0, 2, 3, 1])) + + # Normalize the attention scores. + attn = ops.transpose(attn, axes=[0, 3, 1, 2]) + attn = ops.nn.softmax(attn, axis=-1) + + # Linear projection on the softmaxed scores. + attn = self.proj_w(ops.transpose(attn, axes=[0, 2, 3, 1])) + attn = ops.transpose(attn, axes=[0, 3, 1, 2]) + attn = self.attn_drop(attn, training=training) + + # Final set of projections as done in the vanilla attention mechanism. + x = ops.matmul(attn, v) + x = ops.transpose(x, axes=[0, 2, 1, 3]) + x = ops.reshape(x, (B, N, C)) + + x = self.proj(x) + x = self.proj_drop(x, training=training) + + return x, attn + + +""" +## Feed-forward Network + +Next, we implement the feed-forward network which is one of the components within a +Transformer block. +""" + + +def mlp(x, dropout_rate: float, hidden_units: typing.List[int]): + """FFN for a Transformer block.""" + for idx, units in enumerate(hidden_units): + x = layers.Dense( + units, + activation=ops.nn.gelu if idx == 0 else None, + bias_initializer=keras.initializers.RandomNormal(stddev=1e-6), + )(x) + x = layers.Dropout(dropout_rate)(x) + return x + + +""" +## Other blocks + +In the next two cells, we implement the remaining blocks as standalone functions: + +* `LayerScaleBlockClassAttention()` which returns a `keras.Model`. It is a Transformer block +equipped with Class Attention, LayerScale, and Stochastic Depth. It operates on the CLS +embeddings and the image patch embeddings. +* `LayerScaleBlock()` which returns a `keras.model`. It is also a Transformer block that +operates only on the embeddings of the image patches. It is equipped with LayerScale and +Stochastic Depth. +""" + + +def LayerScaleBlockClassAttention( + projection_dim: int, + num_heads: int, + layer_norm_eps: float, + init_values: float, + mlp_units: typing.List[int], + dropout_rate: float, + sd_prob: float, + name: str, +): + """Pre-norm transformer block meant to be applied to the embeddings of the + cls token and the embeddings of image patches. + + Includes LayerScale and Stochastic Depth. 
+ + Args: + projection_dim (int): projection dimension to be used in the + Transformer blocks and patch projection layer. + num_heads (int): number of attention heads. + layer_norm_eps (float): epsilon to be used for Layer Normalization. + init_values (float): initial value for the diagonal matrix used in LayerScale. + mlp_units (List[int]): dimensions of the feed-forward network used in + the Transformer blocks. + dropout_rate (float): dropout rate to be used for dropout in the attention + scores as well as the final projected outputs. + sd_prob (float): stochastic depth rate. + name (str): a name identifier for the block. + + Returns: + A keras.Model instance. + """ + x = keras.Input((None, projection_dim)) + x_cls = keras.Input((None, projection_dim)) + inputs = keras.layers.Concatenate(axis=1)([x_cls, x]) + + # Class attention (CA). + x1 = layers.LayerNormalization(epsilon=layer_norm_eps)(inputs) + attn_output, attn_scores = ClassAttention(projection_dim, num_heads, dropout_rate)( + x1 + ) + attn_output = ( + LayerScale(init_values, projection_dim)(attn_output) + if init_values + else attn_output + ) + attn_output = StochasticDepth(sd_prob)(attn_output) if sd_prob else attn_output + x2 = keras.layers.Add()([x_cls, attn_output]) + + # FFN. + x3 = layers.LayerNormalization(epsilon=layer_norm_eps)(x2) + x4 = mlp(x3, hidden_units=mlp_units, dropout_rate=dropout_rate) + x4 = LayerScale(init_values, projection_dim)(x4) if init_values else x4 + x4 = StochasticDepth(sd_prob)(x4) if sd_prob else x4 + outputs = keras.layers.Add()([x2, x4]) + + return keras.Model([x, x_cls], [outputs, attn_scores], name=name) + + +def LayerScaleBlock( + projection_dim: int, + num_heads: int, + layer_norm_eps: float, + init_values: float, + mlp_units: typing.List[int], + dropout_rate: float, + sd_prob: float, + name: str, +): + """Pre-norm transformer block meant to be applied to the embeddings of the + image patches. + + Includes LayerScale and Stochastic Depth. + + Args: + projection_dim (int): projection dimension to be used in the + Transformer blocks and patch projection layer. + num_heads (int): number of attention heads. + layer_norm_eps (float): epsilon to be used for Layer Normalization. + init_values (float): initial value for the diagonal matrix used in LayerScale. + mlp_units (List[int]): dimensions of the feed-forward network used in + the Transformer blocks. + dropout_rate (float): dropout rate to be used for dropout in the attention + scores as well as the final projected outputs. + sd_prob (float): stochastic depth rate. + name (str): a name identifier for the block. + + Returns: + A keras.Model instance. + """ + encoded_patches = keras.Input((None, projection_dim)) + + # Self-attention. + x1 = layers.LayerNormalization(epsilon=layer_norm_eps)(encoded_patches) + attn_output, attn_scores = TalkingHeadAttention( + projection_dim, num_heads, dropout_rate + )(x1) + attn_output = ( + LayerScale(init_values, projection_dim)(attn_output) + if init_values + else attn_output + ) + attn_output = StochasticDepth(sd_prob)(attn_output) if sd_prob else attn_output + x2 = layers.Add()([encoded_patches, attn_output]) + + # FFN. 
+ x3 = layers.LayerNormalization(epsilon=layer_norm_eps)(x2) + x4 = mlp(x3, hidden_units=mlp_units, dropout_rate=dropout_rate) + x4 = LayerScale(init_values, projection_dim)(x4) if init_values else x4 + x4 = StochasticDepth(sd_prob)(x4) if sd_prob else x4 + outputs = layers.Add()([x2, x4]) + + return keras.Model(encoded_patches, [outputs, attn_scores], name=name) + + +""" +Given all these blocks, we are now ready to collate them into the final CaiT model. +""" + +""" +## Putting the pieces together: The CaiT model +""" + + +class CaiT(keras.Model): + """CaiT model. + + Args: + projection_dim (int): projection dimension to be used in the + Transformer blocks and patch projection layer. + patch_size (int): patch size of the input images. + num_patches (int): number of patches after extracting the image patches. + init_values (float): initial value for the diagonal matrix used in LayerScale. + mlp_units: (List[int]): dimensions of the feed-forward network used in + the Transformer blocks. + sa_ffn_layers (int): number of self-attention Transformer blocks. + ca_ffn_layers (int): number of class-attention Transformer blocks. + num_heads (int): number of attention heads. + layer_norm_eps (float): epsilon to be used for Layer Normalization. + dropout_rate (float): dropout rate to be used for dropout in the attention + scores as well as the final projected outputs. + sd_prob (float): stochastic depth rate. + global_pool (str): denotes how to pool the representations coming out of + the final Transformer block. + pre_logits (bool): if set to True then don't add a classification head. + num_classes (int): number of classes to construct the final classification + layer with. + """ + + def __init__( + self, + projection_dim: int, + patch_size: int, + num_patches: int, + init_values: float, + mlp_units: typing.List[int], + sa_ffn_layers: int, + ca_ffn_layers: int, + num_heads: int, + layer_norm_eps: float, + dropout_rate: float, + sd_prob: float, + global_pool: str, + pre_logits: bool, + num_classes: int, + **kwargs, + ): + if global_pool not in ["token", "avg"]: + raise ValueError( + 'Invalid value received for `global_pool`, should be either `"token"` or `"avg"`.' + ) + + super().__init__(**kwargs) + + # Responsible for patchifying the input images and the linearly projecting them. + self.projection = keras.Sequential( + [ + layers.Conv2D( + filters=projection_dim, + kernel_size=(patch_size, patch_size), + strides=(patch_size, patch_size), + padding="VALID", + name="conv_projection", + kernel_initializer="lecun_normal", + ), + layers.Reshape( + target_shape=(-1, projection_dim), + name="flatten_projection", + ), + ], + name="projection", + ) + + # CLS token and the positional embeddings. + self.cls_token = self.add_weight( + shape=(1, 1, projection_dim), initializer="zeros" + ) + self.pos_embed = self.add_weight( + shape=(1, num_patches, projection_dim), initializer="zeros" + ) + + # Projection dropout. + self.pos_drop = layers.Dropout(dropout_rate, name="projection_dropout") + + # Stochastic depth schedule. + dpr = [sd_prob for _ in range(sa_ffn_layers)] + + # Self-attention (SA) Transformer blocks operating only on the image patch + # embeddings. 
+ self.blocks = [ + LayerScaleBlock( + projection_dim=projection_dim, + num_heads=num_heads, + layer_norm_eps=layer_norm_eps, + init_values=init_values, + mlp_units=mlp_units, + dropout_rate=dropout_rate, + sd_prob=dpr[i], + name=f"sa_ffn_block_{i}", + ) + for i in range(sa_ffn_layers) + ] + + # Class Attention (CA) Transformer blocks operating on the CLS token and image patch + # embeddings. + self.blocks_token_only = [ + LayerScaleBlockClassAttention( + projection_dim=projection_dim, + num_heads=num_heads, + layer_norm_eps=layer_norm_eps, + init_values=init_values, + mlp_units=mlp_units, + dropout_rate=dropout_rate, + name=f"ca_ffn_block_{i}", + sd_prob=0.0, # No Stochastic Depth in the class attention layers. + ) + for i in range(ca_ffn_layers) + ] + + # Pre-classification layer normalization. + self.norm = layers.LayerNormalization(epsilon=layer_norm_eps, name="head_norm") + + # Representation pooling for classification head. + self.global_pool = global_pool + + # Classification head. + self.pre_logits = pre_logits + self.num_classes = num_classes + if not pre_logits: + self.head = layers.Dense(num_classes, name="classification_head") + + def call(self, x, training=False): + # Notice how CLS token is not added here. + x = self.projection(x) + x = x + self.pos_embed + x = self.pos_drop(x) + + # SA+FFN layers. + sa_ffn_attn = {} + for blk in self.blocks: + x, attn_scores = blk(x) + sa_ffn_attn[f"{blk.name}_att"] = attn_scores + + # CA+FFN layers. + ca_ffn_attn = {} + cls_tokens = ops.tile(self.cls_token, (ops.shape(x)[0], 1, 1)) + for blk in self.blocks_token_only: + cls_tokens, attn_scores = blk([x, cls_tokens]) + ca_ffn_attn[f"{blk.name}_att"] = attn_scores + + x = ops.concatenate([cls_tokens, x], axis=1) + x = self.norm(x) + + # Always return the attention scores from the SA+FFN and CA+FFN layers + # for convenience. + if self.global_pool: + x = ( + ops.reduce_mean(x[:, 1:], axis=1) + if self.global_pool == "avg" + else x[:, 0] + ) + return ( + (x, sa_ffn_attn, ca_ffn_attn) + if self.pre_logits + else (self.head(x), sa_ffn_attn, ca_ffn_attn) + ) + + +""" +Having the SA and CA layers segregated this way helps the model to focus on underlying +objectives more concretely: + +* model dependencies in between the image patches +* summarize the information from the image patches in a CLS token that can be used for +the task at hand + +Now that we have defined the CaiT model, it's time to test it. We will start by defining +a model configuration that will be passed to our `CaiT` class for initialization. +""" + +""" +## Defining Model Configuration +""" + + +def get_config( + image_size: int = 224, + patch_size: int = 16, + projection_dim: int = 192, + sa_ffn_layers: int = 24, + ca_ffn_layers: int = 2, + num_heads: int = 4, + mlp_ratio: int = 4, + layer_norm_eps=1e-6, + init_values: float = 1e-5, + dropout_rate: float = 0.0, + sd_prob: float = 0.0, + global_pool: str = "token", + pre_logits: bool = False, + num_classes: int = 1000, +) -> typing.Dict: + """Default configuration for CaiT models (cait_xxs24_224). + + Reference: + https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/cait.py + """ + config = {} + + # Patchification and projection. + config["patch_size"] = patch_size + config["num_patches"] = (image_size // patch_size) ** 2 + + # LayerScale. + config["init_values"] = init_values + + # Dropout and Stochastic Depth. + config["dropout_rate"] = dropout_rate + config["sd_prob"] = sd_prob + + # Shared across different blocks and layers. 
+ config["layer_norm_eps"] = layer_norm_eps + config["projection_dim"] = projection_dim + config["mlp_units"] = [ + projection_dim * mlp_ratio, + projection_dim, + ] + + # Attention layers. + config["num_heads"] = num_heads + config["sa_ffn_layers"] = sa_ffn_layers + config["ca_ffn_layers"] = ca_ffn_layers + + # Representation pooling and task specific parameters. + config["global_pool"] = global_pool + config["pre_logits"] = pre_logits + config["num_classes"] = num_classes + + return config + + +""" +Most of the configuration variables should sound familiar to you if you already know the +ViT architecture. Point of focus is given to `sa_ffn_layers` and `ca_ffn_layers` that +control the number of SA-Transformer blocks and CA-Transformer blocks. You can easily +amend this `get_config()` method to instantiate a CaiT model for your own dataset. +""" + +""" +## Model Instantiation +""" + +image_size = 224 +num_channels = 3 +batch_size = 2 + +config = get_config() +cait_xxs24_224 = CaiT(**config) + +dummy_inputs = ops.ones((batch_size, image_size, image_size, num_channels)) +_ = cait_xxs24_224(dummy_inputs) + +""" +We can successfully perform inference with the model. But what about implementation +correctness? There are many ways to verify it: + +* Obtain the performance of the model (given it's been populated with the pre-trained +parameters) on the ImageNet-1k validation set (as the pretraining dataset was +ImageNet-1k). +* Fine-tune the model on a different dataset. + +In order to verify that, we will load another instance of the same model that has been +already populated with the pre-trained parameters. Please refer to +[this repository](https://github.com/sayakpaul/cait-tf) +(developed by the author of this notebook) for more details. +Additionally, the repository provides code to verify model performance on the +[ImageNet-1k validation set](https://github.com/sayakpaul/cait-tf/tree/main/i1k_eval) +as well as +[fine-tuning](https://github.com/sayakpaul/cait-tf/blob/main/notebooks/finetune.ipynb). +""" + +""" +## Load a pretrained model +""" + +model_gcs_path = "gs://kaggle-tfhub-models-uncompressed/tfhub-modules/sayakpaul/cait_xxs24_224/1/uncompressed" +pretrained_model = keras.Sequential( + [keras.layers.TFSMLayer(model_gcs_path, call_endpoint="serving_default")] +) + +""" +## Inference utilities + +In the next couple of cells, we develop preprocessing utilities needed to run inference +with the pretrained model. +""" +# The preprocessing transformations include center cropping, and normalizing +# the pixel values with the ImageNet-1k training stats (mean and standard deviation). 
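+# Note: the mean/variance are expressed on the [0, 255] scale because `preprocess_image`
+# below does not rescale pixel values to [0, 1].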
+crop_layer = keras.layers.CenterCrop(image_size, image_size) +norm_layer = keras.layers.Normalization( + mean=[0.485 * 255, 0.456 * 255, 0.406 * 255], + variance=[(0.229 * 255) ** 2, (0.224 * 255) ** 2, (0.225 * 255) ** 2], +) + + +def preprocess_image(image, size=image_size): + image = np.array(image) + image_resized = ops.expand_dims(image, 0) + resize_size = int((256 / image_size) * size) + image_resized = ops.image.resize( + image_resized, (resize_size, resize_size), interpolation="bicubic" + ) + image_resized = crop_layer(image_resized) + return norm_layer(image_resized).numpy() + + +def load_image_from_url(url): + image_bytes = io.BytesIO(urlopen(url).read()) + image = PIL.Image.open(image_bytes) + preprocessed_image = preprocess_image(image) + return image, preprocessed_image + + +""" +Now, we retrieve the ImageNet-1k labels and load them as the model we're +loading was pretrained on the ImageNet-1k dataset. +""" + +# ImageNet-1k class labels. +imagenet_labels = ( + "https://storage.googleapis.com/bit_models/ilsvrc2012_wordnet_lemmas.txt" +) +label_path = keras.utils.get_file(origin=imagenet_labels) + +with open(label_path, "r") as f: + lines = f.readlines() +imagenet_labels = [line.rstrip() for line in lines] + +""" +## Load an Image +""" + +img_url = "https://i.imgur.com/ErgfLTn.jpg" +image, preprocessed_image = load_image_from_url(img_url) + +# https://unsplash.com/photos/Ho93gVTRWW8 +plt.imshow(image) +plt.axis("off") +plt.show() + +""" +## Obtain Predictions +""" + +outputs = pretrained_model.predict(preprocessed_image) +logits = outputs["output_1"] +ca_ffn_block_0_att = outputs["output_3_ca_ffn_block_0_att"] +ca_ffn_block_1_att = outputs["output_3_ca_ffn_block_1_att"] + +predicted_label = imagenet_labels[int(np.argmax(logits))] +print(predicted_label) + +""" +Now that we have obtained the predictions (which appear to be as expected), we can +further extend our investigation. Following the CaiT authors, we can investigate the +attention scores from the attention layers. This helps us to get deeper insights into the +modifications introduced in the CaiT paper. +""" + +""" +## Visualizing the Attention Layers + +We start by inspecting the shape of the attention weights returned by a Class Attention +layer. +""" + +# (batch_size, nb_attention_heads, num_cls_token, seq_length) +print("Shape of the attention scores from a class attention block:") +print(ca_ffn_block_0_att.shape) + +""" +The shape denotes we have got attention weights for each of the individual attention +heads. They quantify the information about how the CLS token is related to itself and the +rest of the image patches. + +Next, we write a utility to: + +* Visualize what the individual attention heads in the Class Attention layers are +focusing on. This helps us to get an idea of how the _spatial-class relationship_ is +induced in the CaiT model. +* Obtain a saliency map from the first Class Attention layer that helps to understand how +CA layer aggregates information from the region(s) of interest in the images. + +This utility is referred from Figures 6 and 7 of the original +[CaiT paper](https://arxiv.org/abs/2103.17239). This is also a part of +[this notebook](https://github.com/sayakpaul/cait-tf/blob/main/notebooks/classification.ipynb) +(developed by the author of this tutorial). 
+""" + +# Reference: +# https://github.com/facebookresearch/dino/blob/main/visualize_attention.py + +patch_size = 16 + + +def get_cls_attention_map( + attention_scores, + return_saliency=False, +) -> np.ndarray: + """ + Returns attention scores from a particular attention block. + + Args: + attention_scores: the attention scores from the attention block to + visualize. + return_saliency: a boolean flag if set to True also returns the salient + representations of the attention block. + """ + w_featmap = preprocessed_image.shape[2] // patch_size + h_featmap = preprocessed_image.shape[1] // patch_size + + nh = attention_scores.shape[1] # Number of attention heads. + + # Taking the representations from CLS token. + attentions = attention_scores[0, :, 0, 1:].reshape(nh, -1) + + # Reshape the attention scores to resemble mini patches. + attentions = attentions.reshape(nh, w_featmap, h_featmap) + + if not return_saliency: + attentions = attentions.transpose((1, 2, 0)) + + else: + attentions = np.mean(attentions, axis=0) + attentions = (attentions - attentions.min()) / ( + attentions.max() - attentions.min() + ) + attentions = np.expand_dims(attentions, -1) + + # Resize the attention patches to 224x224 (224: 14x16) + attentions = ops.image.resize( + attentions, + size=(h_featmap * patch_size, w_featmap * patch_size), + interpolation="bicubic", + ) + + return attentions + + +""" +In the first CA layer, we notice that the model is focusing solely on the region of +interest. +""" + +attentions_ca_block_0 = get_cls_attention_map(ca_ffn_block_0_att) + + +fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(13, 13)) +img_count = 0 + +for i in range(attentions_ca_block_0.shape[-1]): + if img_count < attentions_ca_block_0.shape[-1]: + axes[i].imshow(attentions_ca_block_0[:, :, img_count]) + axes[i].title.set_text(f"Attention head: {img_count}") + axes[i].axis("off") + img_count += 1 + +fig.tight_layout() +plt.show() + +""" +Whereas in the second CA layer, the model is trying to focus more on the context that +contains discriminative signals. +""" + +attentions_ca_block_1 = get_cls_attention_map(ca_ffn_block_1_att) + + +fig, axes = plt.subplots(nrows=1, ncols=4, figsize=(13, 13)) +img_count = 0 + +for i in range(attentions_ca_block_1.shape[-1]): + if img_count < attentions_ca_block_1.shape[-1]: + axes[i].imshow(attentions_ca_block_1[:, :, img_count]) + axes[i].title.set_text(f"Attention head: {img_count}") + axes[i].axis("off") + img_count += 1 + +fig.tight_layout() +plt.show() + +""" +Finally, we obtain the saliency map for the given image. +""" + +saliency_attention = get_cls_attention_map(ca_ffn_block_0_att, return_saliency=True) + +image = np.array(image) +image_resized = ops.expand_dims(image, 0) +resize_size = int((256 / 224) * image_size) +image_resized = ops.image.resize( + image_resized, (resize_size, resize_size), interpolation="bicubic" +) +image_resized = crop_layer(image_resized) + +plt.imshow(image_resized.numpy().squeeze().astype("int32")) +plt.imshow(saliency_attention.numpy().squeeze(), cmap="cividis", alpha=0.9) +plt.axis("off") + +plt.show() + +""" +## Conclusion + +In this notebook, we implemented the CaiT model. It shows how to mitigate the issues in +ViTs when trying scale their depth while keeping the pretraining dataset fixed. I hope +the additional visualizations provided in the notebook spark excitement in the community +and people develop interesting methods to probe what models like ViT learn. 
+ +## Acknowledgement + +Thanks to the ML Developer Programs team at Google providing Google Cloud Platform +support. +""" diff --git a/knowledge_base/vision/captcha_ocr.py b/knowledge_base/vision/captcha_ocr.py new file mode 100644 index 0000000000000000000000000000000000000000..574a6a1b240df917d8905663b623583580a70f8f --- /dev/null +++ b/knowledge_base/vision/captcha_ocr.py @@ -0,0 +1,428 @@ +""" +Title: OCR model for reading Captchas +Author: [A_K_Nain](https://twitter.com/A_K_Nain) +Date created: 2020/06/14 +Last modified: 2024/03/13 +Description: How to implement an OCR model using CNNs, RNNs and CTC loss. +Accelerator: GPU +Converted to Keras 3 by: [Sitam Meur](https://github.com/sitamgithub-MSIT) +""" + +""" +## Introduction + +This example demonstrates a simple OCR model built with the Functional API. Apart from +combining CNN and RNN, it also illustrates how you can instantiate a new layer +and use it as an "Endpoint layer" for implementing CTC loss. For a detailed +guide to layer subclassing, please check out +[this page](https://keras.io/guides/making_new_layers_and_models_via_subclassing/) +in the developer guides. +""" + +""" +## Setup +""" + +import os + +os.environ["KERAS_BACKEND"] = "tensorflow" + +import numpy as np +import matplotlib.pyplot as plt + +from pathlib import Path + +import tensorflow as tf +import keras +from keras import ops +from keras import layers + +""" +## Load the data: [Captcha Images](https://www.kaggle.com/fournierp/captcha-version-2-images) +Let's download the data. +""" + + +"""shell +curl -LO https://github.com/AakashKumarNain/CaptchaCracker/raw/master/captcha_images_v2.zip +unzip -qq captcha_images_v2.zip +""" + + +""" +The dataset contains 1040 captcha files as `png` images. The label for each sample is a string, +the name of the file (minus the file extension). +We will map each character in the string to an integer for training the model. Similary, +we will need to map the predictions of the model back to strings. For this purpose +we will maintain two dictionaries, mapping characters to integers, and integers to characters, +respectively. +""" + + +# Path to the data directory +data_dir = Path("./captcha_images_v2/") + +# Get list of all the images +images = sorted(list(map(str, list(data_dir.glob("*.png"))))) +labels = [img.split(os.path.sep)[-1].split(".png")[0] for img in images] +characters = set(char for label in labels for char in label) +characters = sorted(list(characters)) + +print("Number of images found: ", len(images)) +print("Number of labels found: ", len(labels)) +print("Number of unique characters: ", len(characters)) +print("Characters present: ", characters) + +# Batch size for training and validation +batch_size = 16 + +# Desired image dimensions +img_width = 200 +img_height = 50 + +# Factor by which the image is going to be downsampled +# by the convolutional blocks. We will be using two +# convolution blocks and each block will have +# a pooling layer which downsample the features by a factor of 2. +# Hence total downsampling factor would be 4. 
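+# For example, with img_width = 200 the downsampled feature maps are 200 / 4 = 50
+# steps wide, which becomes the sequence length seen by the recurrent layers.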
+downsample_factor = 4 + +# Maximum length of any captcha in the dataset +max_length = max([len(label) for label in labels]) + + +""" +## Preprocessing +""" + + +# Mapping characters to integers +char_to_num = layers.StringLookup(vocabulary=list(characters), mask_token=None) + +# Mapping integers back to original characters +num_to_char = layers.StringLookup( + vocabulary=char_to_num.get_vocabulary(), mask_token=None, invert=True +) + + +def split_data(images, labels, train_size=0.9, shuffle=True): + # 1. Get the total size of the dataset + size = len(images) + # 2. Make an indices array and shuffle it, if required + indices = ops.arange(size) + if shuffle: + indices = keras.random.shuffle(indices) + # 3. Get the size of training samples + train_samples = int(size * train_size) + # 4. Split data into training and validation sets + x_train, y_train = images[indices[:train_samples]], labels[indices[:train_samples]] + x_valid, y_valid = images[indices[train_samples:]], labels[indices[train_samples:]] + return x_train, x_valid, y_train, y_valid + + +# Splitting data into training and validation sets +x_train, x_valid, y_train, y_valid = split_data(np.array(images), np.array(labels)) + + +def encode_single_sample(img_path, label): + # 1. Read image + img = tf.io.read_file(img_path) + # 2. Decode and convert to grayscale + img = tf.io.decode_png(img, channels=1) + # 3. Convert to float32 in [0, 1] range + img = tf.image.convert_image_dtype(img, tf.float32) + # 4. Resize to the desired size + img = ops.image.resize(img, [img_height, img_width]) + # 5. Transpose the image because we want the time + # dimension to correspond to the width of the image. + img = ops.transpose(img, axes=[1, 0, 2]) + # 6. Map the characters in label to numbers + label = char_to_num(tf.strings.unicode_split(label, input_encoding="UTF-8")) + # 7. 
Return a dict as our model is expecting two inputs + return {"image": img, "label": label} + + +""" +## Create `Dataset` objects +""" + + +train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train)) +train_dataset = ( + train_dataset.map(encode_single_sample, num_parallel_calls=tf.data.AUTOTUNE) + .batch(batch_size) + .prefetch(buffer_size=tf.data.AUTOTUNE) +) + +validation_dataset = tf.data.Dataset.from_tensor_slices((x_valid, y_valid)) +validation_dataset = ( + validation_dataset.map(encode_single_sample, num_parallel_calls=tf.data.AUTOTUNE) + .batch(batch_size) + .prefetch(buffer_size=tf.data.AUTOTUNE) +) + +""" +## Visualize the data +""" + + +_, ax = plt.subplots(4, 4, figsize=(10, 5)) +for batch in train_dataset.take(1): + images = batch["image"] + labels = batch["label"] + for i in range(16): + img = (images[i] * 255).numpy().astype("uint8") + label = tf.strings.reduce_join(num_to_char(labels[i])).numpy().decode("utf-8") + ax[i // 4, i % 4].imshow(img[:, :, 0].T, cmap="gray") + ax[i // 4, i % 4].set_title(label) + ax[i // 4, i % 4].axis("off") +plt.show() + +""" +## Model +""" + + +def ctc_batch_cost(y_true, y_pred, input_length, label_length): + label_length = ops.cast(ops.squeeze(label_length, axis=-1), dtype="int32") + input_length = ops.cast(ops.squeeze(input_length, axis=-1), dtype="int32") + sparse_labels = ops.cast( + ctc_label_dense_to_sparse(y_true, label_length), dtype="int32" + ) + + y_pred = ops.log(ops.transpose(y_pred, axes=[1, 0, 2]) + keras.backend.epsilon()) + + return ops.expand_dims( + tf.compat.v1.nn.ctc_loss( + inputs=y_pred, labels=sparse_labels, sequence_length=input_length + ), + 1, + ) + + +def ctc_label_dense_to_sparse(labels, label_lengths): + label_shape = ops.shape(labels) + num_batches_tns = ops.stack([label_shape[0]]) + max_num_labels_tns = ops.stack([label_shape[1]]) + + def range_less_than(old_input, current_input): + return ops.expand_dims(ops.arange(ops.shape(old_input)[1]), 0) < tf.fill( + max_num_labels_tns, current_input + ) + + init = ops.cast(tf.fill([1, label_shape[1]], 0), dtype="bool") + dense_mask = tf.compat.v1.scan( + range_less_than, label_lengths, initializer=init, parallel_iterations=1 + ) + dense_mask = dense_mask[:, 0, :] + + label_array = ops.reshape( + ops.tile(ops.arange(0, label_shape[1]), num_batches_tns), label_shape + ) + label_ind = tf.compat.v1.boolean_mask(label_array, dense_mask) + + batch_array = ops.transpose( + ops.reshape( + ops.tile(ops.arange(0, label_shape[0]), max_num_labels_tns), + tf.reverse(label_shape, [0]), + ) + ) + batch_ind = tf.compat.v1.boolean_mask(batch_array, dense_mask) + indices = ops.transpose( + ops.reshape(ops.concatenate([batch_ind, label_ind], axis=0), [2, -1]) + ) + + vals_sparse = tf.compat.v1.gather_nd(labels, indices) + + return tf.SparseTensor( + ops.cast(indices, dtype="int64"), + vals_sparse, + ops.cast(label_shape, dtype="int64"), + ) + + +class CTCLayer(layers.Layer): + def __init__(self, name=None): + super().__init__(name=name) + self.loss_fn = ctc_batch_cost + + def call(self, y_true, y_pred): + # Compute the training-time loss value and add it + # to the layer using `self.add_loss()`. 
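+        # CTC additionally needs, for every sample in the batch, the length of the
+        # model's output sequence (its number of time steps) and the length of the
+        # (padded) target label, both shaped as (batch_size, 1).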
+ batch_len = ops.cast(ops.shape(y_true)[0], dtype="int64") + input_length = ops.cast(ops.shape(y_pred)[1], dtype="int64") + label_length = ops.cast(ops.shape(y_true)[1], dtype="int64") + + input_length = input_length * ops.ones(shape=(batch_len, 1), dtype="int64") + label_length = label_length * ops.ones(shape=(batch_len, 1), dtype="int64") + + loss = self.loss_fn(y_true, y_pred, input_length, label_length) + self.add_loss(loss) + + # At test time, just return the computed predictions + return y_pred + + +def build_model(): + # Inputs to the model + input_img = layers.Input( + shape=(img_width, img_height, 1), name="image", dtype="float32" + ) + labels = layers.Input(name="label", shape=(None,), dtype="float32") + + # First conv block + x = layers.Conv2D( + 32, + (3, 3), + activation="relu", + kernel_initializer="he_normal", + padding="same", + name="Conv1", + )(input_img) + x = layers.MaxPooling2D((2, 2), name="pool1")(x) + + # Second conv block + x = layers.Conv2D( + 64, + (3, 3), + activation="relu", + kernel_initializer="he_normal", + padding="same", + name="Conv2", + )(x) + x = layers.MaxPooling2D((2, 2), name="pool2")(x) + + # We have used two max pool with pool size and strides 2. + # Hence, downsampled feature maps are 4x smaller. The number of + # filters in the last layer is 64. Reshape accordingly before + # passing the output to the RNN part of the model + new_shape = ((img_width // 4), (img_height // 4) * 64) + x = layers.Reshape(target_shape=new_shape, name="reshape")(x) + x = layers.Dense(64, activation="relu", name="dense1")(x) + x = layers.Dropout(0.2)(x) + + # RNNs + x = layers.Bidirectional(layers.LSTM(128, return_sequences=True, dropout=0.25))(x) + x = layers.Bidirectional(layers.LSTM(64, return_sequences=True, dropout=0.25))(x) + + # Output layer + x = layers.Dense( + len(char_to_num.get_vocabulary()) + 1, activation="softmax", name="dense2" + )(x) + + # Add CTC layer for calculating CTC loss at each step + output = CTCLayer(name="ctc_loss")(labels, x) + + # Define the model + model = keras.models.Model( + inputs=[input_img, labels], outputs=output, name="ocr_model_v1" + ) + # Optimizer + opt = keras.optimizers.Adam() + # Compile the model and return + model.compile(optimizer=opt) + return model + + +# Get the model +model = build_model() +model.summary() + +""" +## Training +""" + + +# TODO restore epoch count. +epochs = 100 +early_stopping_patience = 10 +# Add early stopping +early_stopping = keras.callbacks.EarlyStopping( + monitor="val_loss", patience=early_stopping_patience, restore_best_weights=True +) + +# Train the model +history = model.fit( + train_dataset, + validation_data=validation_dataset, + epochs=epochs, + callbacks=[early_stopping], +) + + +""" +## Inference + +You can use the trained model hosted on [Hugging Face Hub](https://huggingface.co/keras-io/ocr-for-captcha) +and try the demo on [Hugging Face Spaces](https://huggingface.co/spaces/keras-io/ocr-for-captcha). 
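+
+Decoding works on the raw softmax outputs of the `dense2` layer: for greedy search we
+take the most likely class at every time step, merge repeated characters, and drop the
+extra "blank" class (the `+ 1` in the `dense2` output layer) that CTC reserves. The
+short, purely illustrative snippet below (not part of the original pipeline) shows that
+collapsing rule on a toy index sequence.
+"""
+
+# Toy illustration of the greedy CTC collapsing rule (the indices are arbitrary).
+blank_index = len(char_to_num.get_vocabulary())  # index of the extra CTC blank class
+toy_argmax = [3, 3, blank_index, 3, 7, 7, blank_index, 9]
+collapsed = []
+previous = None
+for token in toy_argmax:
+    # Keep a token only if it differs from the previous one and is not the blank.
+    if token != previous and token != blank_index:
+        collapsed.append(token)
+    previous = token
+print(collapsed)  # -> [3, 3, 7, 9]
+
+"""
+The utilities below perform the actual decoding on the model's predictions.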
+""" + + +def ctc_decode(y_pred, input_length, greedy=True, beam_width=100, top_paths=1): + input_shape = ops.shape(y_pred) + num_samples, num_steps = input_shape[0], input_shape[1] + y_pred = ops.log(ops.transpose(y_pred, axes=[1, 0, 2]) + keras.backend.epsilon()) + input_length = ops.cast(input_length, dtype="int32") + + if greedy: + (decoded, log_prob) = tf.nn.ctc_greedy_decoder( + inputs=y_pred, sequence_length=input_length + ) + else: + (decoded, log_prob) = tf.compat.v1.nn.ctc_beam_search_decoder( + inputs=y_pred, + sequence_length=input_length, + beam_width=beam_width, + top_paths=top_paths, + ) + decoded_dense = [] + for st in decoded: + st = tf.SparseTensor(st.indices, st.values, (num_samples, num_steps)) + decoded_dense.append(tf.sparse.to_dense(sp_input=st, default_value=-1)) + return (decoded_dense, log_prob) + + +# Get the prediction model by extracting layers till the output layer +prediction_model = keras.models.Model( + model.input[0], model.get_layer(name="dense2").output +) +prediction_model.summary() + + +# A utility function to decode the output of the network +def decode_batch_predictions(pred): + input_len = np.ones(pred.shape[0]) * pred.shape[1] + # Use greedy search. For complex tasks, you can use beam search + results = ctc_decode(pred, input_length=input_len, greedy=True)[0][0][ + :, :max_length + ] + # Iterate over the results and get back the text + output_text = [] + for res in results: + res = tf.strings.reduce_join(num_to_char(res)).numpy().decode("utf-8") + output_text.append(res) + return output_text + + +# Let's check results on some validation samples +for batch in validation_dataset.take(1): + batch_images = batch["image"] + batch_labels = batch["label"] + + preds = prediction_model.predict(batch_images) + pred_texts = decode_batch_predictions(preds) + + orig_texts = [] + for label in batch_labels: + label = tf.strings.reduce_join(num_to_char(label)).numpy().decode("utf-8") + orig_texts.append(label) + + _, ax = plt.subplots(4, 4, figsize=(15, 5)) + for i in range(len(pred_texts)): + img = (batch_images[i, :, :, 0] * 255).numpy().astype(np.uint8) + img = img.T + title = f"Prediction: {pred_texts[i]}" + ax[i // 4, i % 4].imshow(img, cmap="gray") + ax[i // 4, i % 4].set_title(title) + ax[i // 4, i % 4].axis("off") +plt.show() diff --git a/knowledge_base/vision/cct.py b/knowledge_base/vision/cct.py new file mode 100644 index 0000000000000000000000000000000000000000..999856597337efefa9126d9d27f438103e53a1b6 --- /dev/null +++ b/knowledge_base/vision/cct.py @@ -0,0 +1,441 @@ +""" +Title: Compact Convolutional Transformers +Author: [Sayak Paul](https://twitter.com/RisingSayak) +Date created: 2021/06/30 +Last modified: 2023/08/07 +Description: Compact Convolutional Transformers for efficient image classification. +Accelerator: GPU +Converted to Keras 3 by: [Muhammad Anas Raza](https://anasrz.com), [Guillaume Baquiast](https://www.linkedin.com/in/guillaume-baquiast-478965ba/) +""" + +""" +As discussed in the [Vision Transformers (ViT)](https://arxiv.org/abs/2010.11929) paper, +a Transformer-based architecture for vision typically requires a larger dataset than +usual, as well as a longer pre-training schedule. [ImageNet-1k](http://imagenet.org/) +(which has about a million images) is considered to fall under the medium-sized data regime with +respect to ViTs. This is primarily because, unlike CNNs, ViTs (or a typical +Transformer-based architecture) do not have well-informed inductive biases (such as +convolutions for processing images). 
This begs the question: can't we combine the
+benefits of convolution and the benefits of Transformers
+in a single network architecture? These benefits include parameter efficiency and
+self-attention for processing long-range, global dependencies (interactions between
+different regions in an image).
+
+In [Escaping the Big Data Paradigm with Compact Transformers](https://arxiv.org/abs/2104.05704),
+Hassani et al. present an approach for doing exactly this. They propose the
+**Compact Convolutional Transformer** (CCT) architecture. In this example, we will implement
+CCT and see how well it performs on the CIFAR-10 dataset.
+
+If you are unfamiliar with the concept of self-attention or Transformers, you can read
+[this chapter](https://livebook.manning.com/book/deep-learning-with-python-second-edition/chapter-11/r-3/312)
+from François Chollet's book *Deep Learning with Python*. This example uses
+code snippets from another example,
+[Image classification with Vision Transformer](https://keras.io/examples/vision/image_classification_with_vision_transformer/).
+"""
+
+"""
+## Imports
+"""
+
+from keras import layers
+import keras
+
+import matplotlib.pyplot as plt
+import numpy as np
+
+"""
+## Hyperparameters and constants
+"""
+
+positional_emb = True
+conv_layers = 2
+projection_dim = 128
+
+num_heads = 2
+transformer_units = [
+    projection_dim,
+    projection_dim,
+]
+transformer_layers = 2
+stochastic_depth_rate = 0.1
+
+learning_rate = 0.001
+weight_decay = 0.0001
+batch_size = 128
+num_epochs = 30
+image_size = 32
+
+"""
+## Load CIFAR-10 dataset
+"""
+
+num_classes = 10
+input_shape = (32, 32, 3)
+
+(x_train, y_train), (x_test, y_test) = keras.datasets.cifar10.load_data()
+
+y_train = keras.utils.to_categorical(y_train, num_classes)
+y_test = keras.utils.to_categorical(y_test, num_classes)
+
+print(f"x_train shape: {x_train.shape} - y_train shape: {y_train.shape}")
+print(f"x_test shape: {x_test.shape} - y_test shape: {y_test.shape}")
+
+"""
+## The CCT tokenizer
+
+The first recipe introduced by the CCT authors is the tokenizer for processing the
+images. In a standard ViT, images are organized into uniform *non-overlapping* patches.
+This eliminates the boundary-level information present between different patches,
+information that a neural network needs to effectively exploit locality. The
+figure below presents an illustration of how images are organized into patches.
+
+![](https://i.imgur.com/IkBK9oY.png)
+
+We already know that convolutions are quite good at exploiting locality information. So,
+based on this, the authors introduce an all-convolution mini-network to produce image
+patches.
+"""
+
+
+class CCTTokenizer(layers.Layer):
+    def __init__(
+        self,
+        kernel_size=3,
+        stride=1,
+        padding=1,
+        pooling_kernel_size=3,
+        pooling_stride=2,
+        num_conv_layers=conv_layers,
+        num_output_channels=[64, 128],
+        positional_emb=positional_emb,
+        **kwargs,
+    ):
+        super().__init__(**kwargs)
+
+        # This is our tokenizer.
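+        # Each block below is a Conv2D -> ZeroPadding2D -> MaxPooling2D stack. Because
+        # the convolution (stride 1) and the pooling (window 3, stride 2) use
+        # overlapping windows, neighbouring "patches" share boundary information,
+        # unlike the non-overlapping patches of a standard ViT.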
+ self.conv_model = keras.Sequential() + for i in range(num_conv_layers): + self.conv_model.add( + layers.Conv2D( + num_output_channels[i], + kernel_size, + stride, + padding="valid", + use_bias=False, + activation="relu", + kernel_initializer="he_normal", + ) + ) + self.conv_model.add(layers.ZeroPadding2D(padding)) + self.conv_model.add( + layers.MaxPooling2D(pooling_kernel_size, pooling_stride, "same") + ) + + self.positional_emb = positional_emb + + def call(self, images): + outputs = self.conv_model(images) + # After passing the images through our mini-network the spatial dimensions + # are flattened to form sequences. + reshaped = keras.ops.reshape( + outputs, + ( + -1, + keras.ops.shape(outputs)[1] * keras.ops.shape(outputs)[2], + keras.ops.shape(outputs)[-1], + ), + ) + return reshaped + + +""" +Positional embeddings are optional in CCT. If we want to use them, we can use +the Layer defined below. +""" + + +class PositionEmbedding(keras.layers.Layer): + def __init__( + self, + sequence_length, + initializer="glorot_uniform", + **kwargs, + ): + super().__init__(**kwargs) + if sequence_length is None: + raise ValueError("`sequence_length` must be an Integer, received `None`.") + self.sequence_length = int(sequence_length) + self.initializer = keras.initializers.get(initializer) + + def get_config(self): + config = super().get_config() + config.update( + { + "sequence_length": self.sequence_length, + "initializer": keras.initializers.serialize(self.initializer), + } + ) + return config + + def build(self, input_shape): + feature_size = input_shape[-1] + self.position_embeddings = self.add_weight( + name="embeddings", + shape=[self.sequence_length, feature_size], + initializer=self.initializer, + trainable=True, + ) + + super().build(input_shape) + + def call(self, inputs, start_index=0): + shape = keras.ops.shape(inputs) + feature_length = shape[-1] + sequence_length = shape[-2] + # trim to match the length of the input sequence, which might be less + # than the sequence_length of the layer. + position_embeddings = keras.ops.convert_to_tensor(self.position_embeddings) + position_embeddings = keras.ops.slice( + position_embeddings, + (start_index, 0), + (sequence_length, feature_length), + ) + return keras.ops.broadcast_to(position_embeddings, shape) + + def compute_output_shape(self, input_shape): + return input_shape + + +""" +## Sequence Pooling +Another recipe introduced in CCT is attention pooling or sequence pooling. In ViT, only +the feature map corresponding to the class token is pooled and is then used for the +subsequent classification task (or any other downstream task). +""" + + +class SequencePooling(layers.Layer): + def __init__(self): + super().__init__() + self.attention = layers.Dense(1) + + def call(self, x): + attention_weights = keras.ops.softmax(self.attention(x), axis=1) + attention_weights = keras.ops.transpose(attention_weights, axes=(0, 2, 1)) + weighted_representation = keras.ops.matmul(attention_weights, x) + return keras.ops.squeeze(weighted_representation, -2) + + +""" +## Stochastic depth for regularization + +[Stochastic depth](https://arxiv.org/abs/1603.09382) is a regularization technique that +randomly drops a set of layers. During inference, the layers are kept as they are. It is +very much similar to [Dropout](https://jmlr.org/papers/v15/srivastava14a.html) but only +that it operates on a block of layers rather than individual nodes present inside a +layer. In CCT, stochastic depth is used just before the residual blocks of a Transformers +encoder. 
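+
+Concretely, during training the residual branch for a given sample is dropped (zeroed
+out) with probability `drop_prob` and otherwise scaled by `1 / (1 - drop_prob)`, so the
+expected output is unchanged:
+`E[output] = (1 - drop_prob) * x / (1 - drop_prob) + drop_prob * 0 = x`.
+At inference time the branch is always kept and no scaling is applied.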
+""" + + +# Referred from: github.com:rwightman/pytorch-image-models. +class StochasticDepth(layers.Layer): + def __init__(self, drop_prop, **kwargs): + super().__init__(**kwargs) + self.drop_prob = drop_prop + self.seed_generator = keras.random.SeedGenerator(1337) + + def call(self, x, training=None): + if training: + keep_prob = 1 - self.drop_prob + shape = (keras.ops.shape(x)[0],) + (1,) * (len(x.shape) - 1) + random_tensor = keep_prob + keras.random.uniform( + shape, 0, 1, seed=self.seed_generator + ) + random_tensor = keras.ops.floor(random_tensor) + return (x / keep_prob) * random_tensor + return x + + +""" +## MLP for the Transformers encoder +""" + + +def mlp(x, hidden_units, dropout_rate): + for units in hidden_units: + x = layers.Dense(units, activation=keras.ops.gelu)(x) + x = layers.Dropout(dropout_rate)(x) + return x + + +""" +## Data augmentation + +In the [original paper](https://arxiv.org/abs/2104.05704), the authors use +[AutoAugment](https://arxiv.org/abs/1805.09501) to induce stronger regularization. For +this example, we will be using the standard geometric augmentations like random cropping +and flipping. +""" + +# Note the rescaling layer. These layers have pre-defined inference behavior. +data_augmentation = keras.Sequential( + [ + layers.Rescaling(scale=1.0 / 255), + layers.RandomCrop(image_size, image_size), + layers.RandomFlip("horizontal"), + ], + name="data_augmentation", +) + +""" +## The final CCT model + +In CCT, outputs from the Transformers encoder are weighted and then passed on to the final task-specific layer (in +this example, we do classification). +""" + + +def create_cct_model( + image_size=image_size, + input_shape=input_shape, + num_heads=num_heads, + projection_dim=projection_dim, + transformer_units=transformer_units, +): + inputs = layers.Input(input_shape) + + # Augment data. + augmented = data_augmentation(inputs) + + # Encode patches. + cct_tokenizer = CCTTokenizer() + encoded_patches = cct_tokenizer(augmented) + + # Apply positional embedding. + if positional_emb: + sequence_length = encoded_patches.shape[1] + encoded_patches += PositionEmbedding(sequence_length=sequence_length)( + encoded_patches + ) + + # Calculate Stochastic Depth probabilities. + dpr = [x for x in np.linspace(0, stochastic_depth_rate, transformer_layers)] + + # Create multiple layers of the Transformer block. + for i in range(transformer_layers): + # Layer normalization 1. + x1 = layers.LayerNormalization(epsilon=1e-5)(encoded_patches) + + # Create a multi-head attention layer. + attention_output = layers.MultiHeadAttention( + num_heads=num_heads, key_dim=projection_dim, dropout=0.1 + )(x1, x1) + + # Skip connection 1. + attention_output = StochasticDepth(dpr[i])(attention_output) + x2 = layers.Add()([attention_output, encoded_patches]) + + # Layer normalization 2. + x3 = layers.LayerNormalization(epsilon=1e-5)(x2) + + # MLP. + x3 = mlp(x3, hidden_units=transformer_units, dropout_rate=0.1) + + # Skip connection 2. + x3 = StochasticDepth(dpr[i])(x3) + encoded_patches = layers.Add()([x3, x2]) + + # Apply sequence pooling. + representation = layers.LayerNormalization(epsilon=1e-5)(encoded_patches) + weighted_representation = SequencePooling()(representation) + + # Classify outputs. + logits = layers.Dense(num_classes)(weighted_representation) + # Create the Keras model. 
+ model = keras.Model(inputs=inputs, outputs=logits) + return model + + +""" +## Model training and evaluation +""" + + +def run_experiment(model): + optimizer = keras.optimizers.AdamW(learning_rate=0.001, weight_decay=0.0001) + + model.compile( + optimizer=optimizer, + loss=keras.losses.CategoricalCrossentropy( + from_logits=True, label_smoothing=0.1 + ), + metrics=[ + keras.metrics.CategoricalAccuracy(name="accuracy"), + keras.metrics.TopKCategoricalAccuracy(5, name="top-5-accuracy"), + ], + ) + + checkpoint_filepath = "/tmp/checkpoint.weights.h5" + checkpoint_callback = keras.callbacks.ModelCheckpoint( + checkpoint_filepath, + monitor="val_accuracy", + save_best_only=True, + save_weights_only=True, + ) + + history = model.fit( + x=x_train, + y=y_train, + batch_size=batch_size, + epochs=num_epochs, + validation_split=0.1, + callbacks=[checkpoint_callback], + ) + + model.load_weights(checkpoint_filepath) + _, accuracy, top_5_accuracy = model.evaluate(x_test, y_test) + print(f"Test accuracy: {round(accuracy * 100, 2)}%") + print(f"Test top 5 accuracy: {round(top_5_accuracy * 100, 2)}%") + + return history + + +cct_model = create_cct_model() +history = run_experiment(cct_model) + +""" +Let's now visualize the training progress of the model. +""" + +plt.plot(history.history["loss"], label="train_loss") +plt.plot(history.history["val_loss"], label="val_loss") +plt.xlabel("Epochs") +plt.ylabel("Loss") +plt.title("Train and Validation Losses Over Epochs", fontsize=14) +plt.legend() +plt.grid() +plt.show() + +""" +The CCT model we just trained has just **0.4 million** parameters, and it gets us to +~79% top-1 accuracy within 30 epochs. The plot above shows no signs of overfitting as +well. This means we can train this network for longer (perhaps with a bit more +regularization) and may obtain even better performance. This performance can further be +improved by additional recipes like cosine decay learning rate schedule, other data augmentation +techniques like [AutoAugment](https://arxiv.org/abs/1805.09501), +[MixUp](https://arxiv.org/abs/1710.09412) or +[Cutmix](https://arxiv.org/abs/1905.04899). With these modifications, the authors present +95.1% top-1 accuracy on the CIFAR-10 dataset. The authors also present a number of +experiments to study how the number of convolution blocks, Transformers layers, etc. +affect the final performance of CCTs. + +For a comparison, a ViT model takes about **4.7 million** parameters and **100 +epochs** of training to reach a top-1 accuracy of 78.22% on the CIFAR-10 dataset. You can +refer to +[this notebook](https://colab.research.google.com/gist/sayakpaul/1a80d9f582b044354a1a26c5cb3d69e5/image_classification_with_vision_transformer.ipynb) +to know about the experimental setup. + +The authors also demonstrate the performance of Compact Convolutional Transformers on +NLP tasks and they report competitive results there. +""" diff --git a/knowledge_base/vision/consistency_training.py b/knowledge_base/vision/consistency_training.py new file mode 100644 index 0000000000000000000000000000000000000000..39ef2af899a61c3b160d3db1f81a83cadecfdc72 --- /dev/null +++ b/knowledge_base/vision/consistency_training.py @@ -0,0 +1,396 @@ +""" +Title: Consistency training with supervision +Author: [Sayak Paul](https://twitter.com/RisingSayak) +Date created: 2021/04/13 +Last modified: 2021/04/19 +Description: Training with consistency regularization for robustness against data distribution shifts. 
+Accelerator: GPU +""" + +""" +Deep learning models excel in many image recognition tasks when the data is independent +and identically distributed (i.i.d.). However, they can suffer from performance +degradation caused by subtle distribution shifts in the input data (such as random +noise, contrast change, and blurring). So, naturally, there arises a question of +why. As discussed in [A Fourier Perspective on Model Robustness in Computer Vision](https://arxiv.org/pdf/1906.08988.pdf)), +there's no reason for deep learning models to be robust against such shifts. Standard +model training procedures (such as standard image classification training workflows) +*don't* enable a model to learn beyond what's fed to it in the form of training data. + +In this example, we will be training an image classification model enforcing a sense of +*consistency* inside it by doing the following: + +* Train a standard image classification model. +* Train an _equal or larger_ model on a noisy version of the dataset (augmented using +[RandAugment](https://arxiv.org/abs/1909.13719)). +* To do this, we will first obtain predictions of the previous model on the clean images +of the dataset. +* We will then use these predictions and train the second model to match these +predictions on the noisy variant of the same images. This is identical to the workflow of +[*Knowledge Distillation*](https://keras.io/examples/vision/knowledge_distillation/) but +since the student model is equal or larger in size this process is also referred to as +***Self-Training***. + +This overall training workflow finds its roots in works like +[FixMatch](https://arxiv.org/abs/2001.07685), [Unsupervised Data Augmentation for Consistency Training](https://arxiv.org/abs/1904.12848), +and [Noisy Student Training](https://arxiv.org/abs/1911.04252). Since this training +process encourages a model yield consistent predictions for clean as well as noisy +images, it's often referred to as *consistency training* or *training with consistency +regularization*. Although the example focuses on using consistency training to enhance +the robustness of models to common corruptions this example can also serve a template +for performing _weakly supervised learning_. + +This example requires TensorFlow 2.4 or higher, as well as TensorFlow Hub and TensorFlow +Models, which can be installed using the following command: + +""" + +"""shell +pip install -q tf-models-official tensorflow-addons +""" + +""" +## Imports and setup +""" + +from official.vision.image_classification.augment import RandAugment +from tensorflow.keras import layers + +import tensorflow as tf +import tensorflow_addons as tfa +import matplotlib.pyplot as plt + +tf.random.set_seed(42) + +""" +## Define hyperparameters +""" + +AUTO = tf.data.AUTOTUNE +BATCH_SIZE = 128 +EPOCHS = 5 + +CROP_TO = 72 +RESIZE_TO = 96 + +""" +## Load the CIFAR-10 dataset +""" + +(x_train, y_train), (x_test, y_test) = tf.keras.datasets.cifar10.load_data() + +val_samples = 49500 +new_train_x, new_y_train = x_train[: val_samples + 1], y_train[: val_samples + 1] +val_x, val_y = x_train[val_samples:], y_train[val_samples:] + +""" +## Create TensorFlow `Dataset` objects +""" + +# Initialize `RandAugment` object with 2 layers of +# augmentation transforms and strength of 9. +augmenter = RandAugment(num_layers=2, magnitude=9) + +""" +For training the teacher model, we will only be using two geometric augmentation +transforms: random horizontal flip and random crop. 
+""" + + +def preprocess_train(image, label, noisy=True): + image = tf.image.random_flip_left_right(image) + # We first resize the original image to a larger dimension + # and then we take random crops from it. + image = tf.image.resize(image, [RESIZE_TO, RESIZE_TO]) + image = tf.image.random_crop(image, [CROP_TO, CROP_TO, 3]) + if noisy: + image = augmenter.distort(image) + return image, label + + +def preprocess_test(image, label): + image = tf.image.resize(image, [CROP_TO, CROP_TO]) + return image, label + + +train_ds = tf.data.Dataset.from_tensor_slices((new_train_x, new_y_train)) +validation_ds = tf.data.Dataset.from_tensor_slices((val_x, val_y)) +test_ds = tf.data.Dataset.from_tensor_slices((x_test, y_test)) + +""" +We make sure `train_clean_ds` and `train_noisy_ds` are shuffled using the *same* seed to +ensure their orders are exactly the same. This will be helpful during training the +student model. +""" + +# This dataset will be used to train the first model. +train_clean_ds = ( + train_ds.shuffle(BATCH_SIZE * 10, seed=42) + .map(lambda x, y: (preprocess_train(x, y, noisy=False)), num_parallel_calls=AUTO) + .batch(BATCH_SIZE) + .prefetch(AUTO) +) + +# This prepares the `Dataset` object to use RandAugment. +train_noisy_ds = ( + train_ds.shuffle(BATCH_SIZE * 10, seed=42) + .map(preprocess_train, num_parallel_calls=AUTO) + .batch(BATCH_SIZE) + .prefetch(AUTO) +) + +validation_ds = ( + validation_ds.map(preprocess_test, num_parallel_calls=AUTO) + .batch(BATCH_SIZE) + .prefetch(AUTO) +) + +test_ds = ( + test_ds.map(preprocess_test, num_parallel_calls=AUTO) + .batch(BATCH_SIZE) + .prefetch(AUTO) +) + +# This dataset will be used to train the second model. +consistency_training_ds = tf.data.Dataset.zip((train_clean_ds, train_noisy_ds)) + +""" +## Visualize the datasets +""" + +sample_images, sample_labels = next(iter(train_clean_ds)) +plt.figure(figsize=(10, 10)) +for i, image in enumerate(sample_images[:9]): + ax = plt.subplot(3, 3, i + 1) + plt.imshow(image.numpy().astype("int")) + plt.axis("off") + +sample_images, sample_labels = next(iter(train_noisy_ds)) +plt.figure(figsize=(10, 10)) +for i, image in enumerate(sample_images[:9]): + ax = plt.subplot(3, 3, i + 1) + plt.imshow(image.numpy().astype("int")) + plt.axis("off") + +""" +## Define a model building utility function + +We now define our model building utility. Our model is based on the [ResNet50V2 architecture](https://arxiv.org/abs/1603.05027). +""" + + +def get_training_model(num_classes=10): + resnet50_v2 = tf.keras.applications.ResNet50V2( + weights=None, + include_top=False, + input_shape=(CROP_TO, CROP_TO, 3), + ) + model = tf.keras.Sequential( + [ + layers.Input((CROP_TO, CROP_TO, 3)), + layers.Rescaling(scale=1.0 / 127.5, offset=-1), + resnet50_v2, + layers.GlobalAveragePooling2D(), + layers.Dense(num_classes), + ] + ) + return model + + +""" +In the interest of reproducibility, we serialize the initial random weights of the +teacher network. +""" + +initial_teacher_model = get_training_model() +initial_teacher_model.save_weights("initial_teacher_model.h5") + +""" +## Train the teacher model + +As noted in Noisy Student Training, if the teacher model is trained with *geometric +ensembling* and when the student model is forced to mimic that, it leads to better +performance. 
The original work uses [Stochastic Depth](https://arxiv.org/abs/1603.09382) +and [Dropout](https://jmlr.org/papers/v15/srivastava14a.html) to bring in the ensembling +part but for this example, we will use [Stochastic Weight Averaging](https://arxiv.org/abs/1803.05407) +(SWA) which also resembles geometric ensembling. +""" + +# Define the callbacks. +reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(patience=3) +early_stopping = tf.keras.callbacks.EarlyStopping( + patience=10, restore_best_weights=True +) + +# Initialize SWA from tf-hub. +SWA = tfa.optimizers.SWA + +# Compile and train the teacher model. +teacher_model = get_training_model() +teacher_model.load_weights("initial_teacher_model.h5") +teacher_model.compile( + # Notice that we are wrapping our optimizer within SWA + optimizer=SWA(tf.keras.optimizers.Adam()), + loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), + metrics=["accuracy"], +) +history = teacher_model.fit( + train_clean_ds, + epochs=EPOCHS, + validation_data=validation_ds, + callbacks=[reduce_lr, early_stopping], +) + +# Evaluate the teacher model on the test set. +_, acc = teacher_model.evaluate(test_ds, verbose=0) +print(f"Test accuracy: {acc*100}%") + +""" +## Define a self-training utility + +For this part, we will borrow the `Distiller` class from [this Keras Example](https://keras.io/examples/vision/knowledge_distillation/). +""" + + +# Majority of the code is taken from: +# https://keras.io/examples/vision/knowledge_distillation/ +class SelfTrainer(tf.keras.Model): + def __init__(self, student, teacher): + super().__init__() + self.student = student + self.teacher = teacher + + def compile( + self, + optimizer, + metrics, + student_loss_fn, + distillation_loss_fn, + temperature=3, + ): + super().compile(optimizer=optimizer, metrics=metrics) + self.student_loss_fn = student_loss_fn + self.distillation_loss_fn = distillation_loss_fn + self.temperature = temperature + + def train_step(self, data): + # Since our dataset is a zip of two independent datasets, + # after initially parsing them, we segregate the + # respective images and labels next. + clean_ds, noisy_ds = data + clean_images, _ = clean_ds + noisy_images, y = noisy_ds + + # Forward pass of teacher + teacher_predictions = self.teacher(clean_images, training=False) + + with tf.GradientTape() as tape: + # Forward pass of student + student_predictions = self.student(noisy_images, training=True) + + # Compute losses + student_loss = self.student_loss_fn(y, student_predictions) + distillation_loss = self.distillation_loss_fn( + tf.nn.softmax(teacher_predictions / self.temperature, axis=1), + tf.nn.softmax(student_predictions / self.temperature, axis=1), + ) + total_loss = (student_loss + distillation_loss) / 2 + + # Compute gradients + trainable_vars = self.student.trainable_variables + gradients = tape.gradient(total_loss, trainable_vars) + + # Update weights + self.optimizer.apply_gradients(zip(gradients, trainable_vars)) + + # Update the metrics configured in `compile()` + self.compiled_metrics.update_state( + y, tf.nn.softmax(student_predictions, axis=1) + ) + + # Return a dict of performance + results = {m.name: m.result() for m in self.metrics} + results.update({"total_loss": total_loss}) + return results + + def test_step(self, data): + # During inference, we only pass a dataset consisting images and labels. 
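+        # (Unlike `train_step`, which receives the zipped clean/noisy datasets, here
+        # `data` is a plain (images, labels) batch from the validation or test set.)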
+ x, y = data + + # Compute predictions + y_prediction = self.student(x, training=False) + + # Update the metrics + self.compiled_metrics.update_state(y, tf.nn.softmax(y_prediction, axis=1)) + + # Return a dict of performance + results = {m.name: m.result() for m in self.metrics} + return results + + +""" +The only difference in this implementation is the way loss is being calculated. **Instead +of weighted the distillation loss and student loss differently we are taking their +average following Noisy Student Training**. +""" + +""" +## Train the student model +""" + +# Define the callbacks. +# We are using a larger decay factor to stabilize the training. +reduce_lr = tf.keras.callbacks.ReduceLROnPlateau( + patience=3, factor=0.5, monitor="val_accuracy" +) +early_stopping = tf.keras.callbacks.EarlyStopping( + patience=10, restore_best_weights=True, monitor="val_accuracy" +) + +# Compile and train the student model. +self_trainer = SelfTrainer(student=get_training_model(), teacher=teacher_model) +self_trainer.compile( + # Notice we are *not* using SWA here. + optimizer="adam", + metrics=["accuracy"], + student_loss_fn=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), + distillation_loss_fn=tf.keras.losses.KLDivergence(), + temperature=10, +) +history = self_trainer.fit( + consistency_training_ds, + epochs=EPOCHS, + validation_data=validation_ds, + callbacks=[reduce_lr, early_stopping], +) + +# Evaluate the student model. +acc = self_trainer.evaluate(test_ds, verbose=0) +print(f"Test accuracy from student model: {acc*100}%") + +""" +## Assess the robustness of the models + +A standard benchmark of assessing the robustness of vision models is to record their +performance on corrupted datasets like ImageNet-C and CIFAR-10-C both of which were +proposed in [Benchmarking Neural Network Robustness to Common Corruptions and +Perturbations](https://arxiv.org/abs/1903.12261). For this example, we will be using the +CIFAR-10-C dataset which has 19 different corruptions on 5 different severity levels. To +assess the robustness of the models on this dataset, we will do the following: + +* Run the pre-trained models on the highest level of severities and obtain the top-1 +accuracies. +* Compute the mean top-1 accuracy. + +For the purpose of this example, we won't be going through these steps. This is why we +trained the models for only 5 epochs. You can check out [this +repository](https://github.com/sayakpaul/Consistency-Training-with-Supervision) that +demonstrates the full-scale training experiments and also the aforementioned assessment. +The figure below presents an executive summary of that assessment: + +![](https://i.ibb.co/HBJkM9R/image.png) + +**Mean Top-1** results stand for the CIFAR-10-C dataset and **Test Top-1** results stand +for the CIFAR-10 test set. It's clear that consistency training has an advantage on not +only enhancing the model robustness but also on improving the standard test performance. +""" diff --git a/knowledge_base/vision/conv_lstm.py b/knowledge_base/vision/conv_lstm.py new file mode 100644 index 0000000000000000000000000000000000000000..b17725dea97a36be1ffe04bcfd55c4de4d2d15bc --- /dev/null +++ b/knowledge_base/vision/conv_lstm.py @@ -0,0 +1,294 @@ +""" +Title: Next-Frame Video Prediction with Convolutional LSTMs +Author: [Amogh Joshi](https://github.com/amogh7joshi) +Date created: 2021/06/02 +Last modified: 2023/11/10 +Description: How to build and train a convolutional LSTM model for next-frame video prediction. 
+Accelerator: GPU +""" + +""" +## Introduction + +The +[Convolutional LSTM](https://papers.nips.cc/paper/2015/file/07563a3fe3bbe7e3ba84431ad9d055af-Paper.pdf) +architectures bring together time series processing and computer vision by +introducing a convolutional recurrent cell in a LSTM layer. In this example, we will explore the +Convolutional LSTM model in an application to next-frame prediction, the process +of predicting what video frames come next given a series of past frames. +""" + +""" +## Setup +""" + +import numpy as np +import matplotlib.pyplot as plt + +import keras +from keras import layers + +import io +import imageio +from IPython.display import Image, display +from ipywidgets import widgets, Layout, HBox + +""" +## Dataset Construction + +For this example, we will be using the +[Moving MNIST](http://www.cs.toronto.edu/~nitish/unsupervised_video/) +dataset. + +We will download the dataset and then construct and +preprocess training and validation sets. + +For next-frame prediction, our model will be using a previous frame, +which we'll call `f_n`, to predict a new frame, called `f_(n + 1)`. +To allow the model to create these predictions, we'll need to process +the data such that we have "shifted" inputs and outputs, where the +input data is frame `x_n`, being used to predict frame `y_(n + 1)`. +""" + +# Download and load the dataset. +fpath = keras.utils.get_file( + "moving_mnist.npy", + "http://www.cs.toronto.edu/~nitish/unsupervised_video/mnist_test_seq.npy", +) +dataset = np.load(fpath) + +# Swap the axes representing the number of frames and number of data samples. +dataset = np.swapaxes(dataset, 0, 1) +# We'll pick out 1000 of the 10000 total examples and use those. +dataset = dataset[:1000, ...] +# Add a channel dimension since the images are grayscale. +dataset = np.expand_dims(dataset, axis=-1) + +# Split into train and validation sets using indexing to optimize memory. +indexes = np.arange(dataset.shape[0]) +np.random.shuffle(indexes) +train_index = indexes[: int(0.9 * dataset.shape[0])] +val_index = indexes[int(0.9 * dataset.shape[0]) :] +train_dataset = dataset[train_index] +val_dataset = dataset[val_index] + +# Normalize the data to the 0-1 range. +train_dataset = train_dataset / 255 +val_dataset = val_dataset / 255 + + +# We'll define a helper function to shift the frames, where +# `x` is frames 0 to n - 1, and `y` is frames 1 to n. +def create_shifted_frames(data): + x = data[:, 0 : data.shape[1] - 1, :, :] + y = data[:, 1 : data.shape[1], :, :] + return x, y + + +# Apply the processing function to the datasets. +x_train, y_train = create_shifted_frames(train_dataset) +x_val, y_val = create_shifted_frames(val_dataset) + +# Inspect the dataset. +print("Training Dataset Shapes: " + str(x_train.shape) + ", " + str(y_train.shape)) +print("Validation Dataset Shapes: " + str(x_val.shape) + ", " + str(y_val.shape)) + +""" +## Data Visualization + +Our data consists of sequences of frames, each of which +are used to predict the upcoming frame. Let's take a look +at some of these sequential frames. +""" + +# Construct a figure on which we will visualize the images. +fig, axes = plt.subplots(4, 5, figsize=(10, 8)) + +# Plot each of the sequential images for one random data example. +data_choice = np.random.choice(range(len(train_dataset)), size=1)[0] +for idx, ax in enumerate(axes.flat): + ax.imshow(np.squeeze(train_dataset[data_choice][idx]), cmap="gray") + ax.set_title(f"Frame {idx + 1}") + ax.axis("off") + +# Print information and display the figure. 
+print(f"Displaying frames for example {data_choice}.") +plt.show() + +""" +## Model Construction + +To build a Convolutional LSTM model, we will use the +`ConvLSTM2D` layer, which will accept inputs of shape +`(batch_size, num_frames, width, height, channels)`, and return +a prediction movie of the same shape. +""" + +# Construct the input layer with no definite frame size. +inp = layers.Input(shape=(None, *x_train.shape[2:])) + +# We will construct 3 `ConvLSTM2D` layers with batch normalization, +# followed by a `Conv3D` layer for the spatiotemporal outputs. +x = layers.ConvLSTM2D( + filters=64, + kernel_size=(5, 5), + padding="same", + return_sequences=True, + activation="relu", +)(inp) +x = layers.BatchNormalization()(x) +x = layers.ConvLSTM2D( + filters=64, + kernel_size=(3, 3), + padding="same", + return_sequences=True, + activation="relu", +)(x) +x = layers.BatchNormalization()(x) +x = layers.ConvLSTM2D( + filters=64, + kernel_size=(1, 1), + padding="same", + return_sequences=True, + activation="relu", +)(x) +x = layers.Conv3D( + filters=1, kernel_size=(3, 3, 3), activation="sigmoid", padding="same" +)(x) + +# Next, we will build the complete model and compile it. +model = keras.models.Model(inp, x) +model.compile( + loss=keras.losses.binary_crossentropy, + optimizer=keras.optimizers.Adam(), +) + +""" +## Model Training + +With our model and data constructed, we can now train the model. +""" + +# Define some callbacks to improve training. +early_stopping = keras.callbacks.EarlyStopping(monitor="val_loss", patience=10) +reduce_lr = keras.callbacks.ReduceLROnPlateau(monitor="val_loss", patience=5) + +# Define modifiable training hyperparameters. +epochs = 20 +batch_size = 5 + +# Fit the model to the training data. +model.fit( + x_train, + y_train, + batch_size=batch_size, + epochs=epochs, + validation_data=(x_val, y_val), + callbacks=[early_stopping, reduce_lr], +) + +""" +## Frame Prediction Visualizations + +With our model now constructed and trained, we can generate +some example frame predictions based on a new video. + +We'll pick a random example from the validation set and +then choose the first ten frames from them. From there, we can +allow the model to predict 10 new frames, which we can compare +to the ground truth frame predictions. +""" + +# Select a random example from the validation dataset. +example = val_dataset[np.random.choice(range(len(val_dataset)), size=1)[0]] + +# Pick the first/last ten frames from the example. +frames = example[:10, ...] +original_frames = example[10:, ...] + +# Predict a new set of 10 frames. +for _ in range(10): + # Extract the model's prediction and post-process it. + new_prediction = model.predict(np.expand_dims(frames, axis=0)) + new_prediction = np.squeeze(new_prediction, axis=0) + predicted_frame = np.expand_dims(new_prediction[-1, ...], axis=0) + + # Extend the set of prediction frames. + frames = np.concatenate((frames, predicted_frame), axis=0) + +# Construct a figure for the original and new frames. +fig, axes = plt.subplots(2, 10, figsize=(20, 4)) + +# Plot the original frames. +for idx, ax in enumerate(axes[0]): + ax.imshow(np.squeeze(original_frames[idx]), cmap="gray") + ax.set_title(f"Frame {idx + 11}") + ax.axis("off") + +# Plot the new frames. +new_frames = frames[10:, ...] +for idx, ax in enumerate(axes[1]): + ax.imshow(np.squeeze(new_frames[idx]), cmap="gray") + ax.set_title(f"Frame {idx + 11}") + ax.axis("off") + +# Display the figure. 
+plt.show() + +""" +## Predicted Videos + +Finally, we'll pick a few examples from the validation set +and construct some GIFs with them to see the model's +predicted videos. + +You can use the trained model hosted on [Hugging Face Hub](https://huggingface.co/keras-io/conv-lstm) +and try the demo on [Hugging Face Spaces](https://huggingface.co/spaces/keras-io/conv-lstm). +""" + +# Select a few random examples from the dataset. +examples = val_dataset[np.random.choice(range(len(val_dataset)), size=5)] + +# Iterate over the examples and predict the frames. +predicted_videos = [] +for example in examples: + # Pick the first/last ten frames from the example. + frames = example[:10, ...] + original_frames = example[10:, ...] + new_predictions = np.zeros(shape=(10, *frames[0].shape)) + + # Predict a new set of 10 frames. + for i in range(10): + # Extract the model's prediction and post-process it. + frames = example[: 10 + i + 1, ...] + new_prediction = model.predict(np.expand_dims(frames, axis=0)) + new_prediction = np.squeeze(new_prediction, axis=0) + predicted_frame = np.expand_dims(new_prediction[-1, ...], axis=0) + + # Extend the set of prediction frames. + new_predictions[i] = predicted_frame + + # Create and save GIFs for each of the ground truth/prediction images. + for frame_set in [original_frames, new_predictions]: + # Construct a GIF from the selected video frames. + current_frames = np.squeeze(frame_set) + current_frames = current_frames[..., np.newaxis] * np.ones(3) + current_frames = (current_frames * 255).astype(np.uint8) + current_frames = list(current_frames) + + # Construct a GIF from the frames. + with io.BytesIO() as gif: + imageio.mimsave(gif, current_frames, "GIF", duration=200) + predicted_videos.append(gif.getvalue()) + +# Display the videos. +print(" Truth\tPrediction") +for i in range(0, len(predicted_videos), 2): + # Construct and display an `HBox` with the ground truth and prediction. + box = HBox( + [ + widgets.Image(value=predicted_videos[i]), + widgets.Image(value=predicted_videos[i + 1]), + ] + ) + display(box) diff --git a/knowledge_base/vision/convmixer.py b/knowledge_base/vision/convmixer.py new file mode 100644 index 0000000000000000000000000000000000000000..f78f701a7f33bda80a6fbf075d49c6c0861a1734 --- /dev/null +++ b/knowledge_base/vision/convmixer.py @@ -0,0 +1,318 @@ +""" +Title: Image classification with ConvMixer +Author: [Sayak Paul](https://twitter.com/RisingSayak) +Date created: 2021/10/12 +Last modified: 2021/10/12 +Description: An all-convolutional network applied to patches of images. +Accelerator: GPU +Converted to Keras 3 by: [Md Awsafur Rahman](https://awsaf49.github.io) +""" + +""" +## Introduction + +Vision Transformers (ViT; [Dosovitskiy et al.](https://arxiv.org/abs/1612.00593)) extract +small patches from the input images, linearly project them, and then apply the +Transformer ([Vaswani et al.](https://arxiv.org/abs/1706.03762)) blocks. The application +of ViTs to image recognition tasks is quickly becoming a promising area of research, +because ViTs eliminate the need to have strong inductive biases (such as convolutions) for +modeling locality. This presents them as a general computation primititive capable of +learning just from the training data with as minimal inductive priors as possible. ViTs +yield great downstream performance when trained with proper regularization, data +augmentation, and relatively large datasets. 
+ +In the [Patches Are All You Need](https://openreview.net/pdf?id=TVHS5Y4dNvM) paper (note: +at +the time of writing, it is a submission to the ICLR 2022 conference), the authors extend +the idea of using patches to train an all-convolutional network and demonstrate +competitive results. Their architecture namely **ConvMixer** uses recipes from the recent +isotrophic architectures like ViT, MLP-Mixer +([Tolstikhin et al.](https://arxiv.org/abs/2105.01601)), such as using the same +depth and resolution across different layers in the network, residual connections, +and so on. + +In this example, we will implement the ConvMixer model and demonstrate its performance on +the CIFAR-10 dataset. +""" + +""" +## Imports +""" + +import keras +from keras import layers + +import matplotlib.pyplot as plt +import tensorflow as tf +import numpy as np + +""" +## Hyperparameters + +To keep run time short, we will train the model for only 10 epochs. To focus on +the core ideas of ConvMixer, we will not use other training-specific elements like +RandAugment ([Cubuk et al.](https://arxiv.org/abs/1909.13719)). If you are interested in +learning more about those details, please refer to the +[original paper](https://openreview.net/pdf?id=TVHS5Y4dNvM). +""" + +learning_rate = 0.001 +weight_decay = 0.0001 +batch_size = 128 +num_epochs = 10 + +""" +## Load the CIFAR-10 dataset +""" + +(x_train, y_train), (x_test, y_test) = keras.datasets.cifar10.load_data() +val_split = 0.1 + +val_indices = int(len(x_train) * val_split) +new_x_train, new_y_train = x_train[val_indices:], y_train[val_indices:] +x_val, y_val = x_train[:val_indices], y_train[:val_indices] + +print(f"Training data samples: {len(new_x_train)}") +print(f"Validation data samples: {len(x_val)}") +print(f"Test data samples: {len(x_test)}") + +""" +## Prepare `tf.data.Dataset` objects + +Our data augmentation pipeline is different from what the authors used for the CIFAR-10 +dataset, which is fine for the purpose of the example. +Note that, it's ok to use **TF APIs for data I/O and preprocessing** with other backends +(jax, torch) as it is feature-complete framework when it comes to data preprocessing. +""" + +image_size = 32 +auto = tf.data.AUTOTUNE + +augmentation_layers = [ + keras.layers.RandomCrop(image_size, image_size), + keras.layers.RandomFlip("horizontal"), +] + + +def augment_images(images): + for layer in augmentation_layers: + images = layer(images, training=True) + return images + + +def make_datasets(images, labels, is_train=False): + dataset = tf.data.Dataset.from_tensor_slices((images, labels)) + if is_train: + dataset = dataset.shuffle(batch_size * 10) + dataset = dataset.batch(batch_size) + if is_train: + dataset = dataset.map( + lambda x, y: (augment_images(x), y), num_parallel_calls=auto + ) + return dataset.prefetch(auto) + + +train_dataset = make_datasets(new_x_train, new_y_train, is_train=True) +val_dataset = make_datasets(x_val, y_val) +test_dataset = make_datasets(x_test, y_test) + +""" +## ConvMixer utilities + +The following figure (taken from the original paper) depicts the ConvMixer model: + +![](https://i.imgur.com/yF8actg.png) + +ConvMixer is very similar to the MLP-Mixer, model with the following key +differences: + +* Instead of using fully-connected layers, it uses standard convolution layers. +* Instead of LayerNorm (which is typical for ViTs and MLP-Mixers), it uses BatchNorm. + +Two types of convolution layers are used in ConvMixer. 
**(1)**: Depthwise convolutions, +for mixing spatial locations of the images, **(2)**: Pointwise convolutions (which follow +the depthwise convolutions), for mixing channel-wise information across the patches. +Another keypoint is the use of *larger kernel sizes* to allow a larger receptive field. +""" + + +def activation_block(x): + x = layers.Activation("gelu")(x) + return layers.BatchNormalization()(x) + + +def conv_stem(x, filters: int, patch_size: int): + x = layers.Conv2D(filters, kernel_size=patch_size, strides=patch_size)(x) + return activation_block(x) + + +def conv_mixer_block(x, filters: int, kernel_size: int): + # Depthwise convolution. + x0 = x + x = layers.DepthwiseConv2D(kernel_size=kernel_size, padding="same")(x) + x = layers.Add()([activation_block(x), x0]) # Residual. + + # Pointwise convolution. + x = layers.Conv2D(filters, kernel_size=1)(x) + x = activation_block(x) + + return x + + +def get_conv_mixer_256_8( + image_size=32, filters=256, depth=8, kernel_size=5, patch_size=2, num_classes=10 +): + """ConvMixer-256/8: https://openreview.net/pdf?id=TVHS5Y4dNvM. + The hyperparameter values are taken from the paper. + """ + inputs = keras.Input((image_size, image_size, 3)) + x = layers.Rescaling(scale=1.0 / 255)(inputs) + + # Extract patch embeddings. + x = conv_stem(x, filters, patch_size) + + # ConvMixer blocks. + for _ in range(depth): + x = conv_mixer_block(x, filters, kernel_size) + + # Classification block. + x = layers.GlobalAvgPool2D()(x) + outputs = layers.Dense(num_classes, activation="softmax")(x) + + return keras.Model(inputs, outputs) + + +""" +The model used in this experiment is termed as **ConvMixer-256/8** where 256 denotes the +number of channels and 8 denotes the depth. The resulting model only has 0.8 million +parameters. +""" + +""" +## Model training and evaluation utility +""" + +# Code reference: +# https://keras.io/examples/vision/image_classification_with_vision_transformer/. + + +def run_experiment(model): + optimizer = keras.optimizers.AdamW( + learning_rate=learning_rate, weight_decay=weight_decay + ) + + model.compile( + optimizer=optimizer, + loss="sparse_categorical_crossentropy", + metrics=["accuracy"], + ) + + checkpoint_filepath = "/tmp/checkpoint.keras" + checkpoint_callback = keras.callbacks.ModelCheckpoint( + checkpoint_filepath, + monitor="val_accuracy", + save_best_only=True, + save_weights_only=False, + ) + + history = model.fit( + train_dataset, + validation_data=val_dataset, + epochs=num_epochs, + callbacks=[checkpoint_callback], + ) + + model.load_weights(checkpoint_filepath) + _, accuracy = model.evaluate(test_dataset) + print(f"Test accuracy: {round(accuracy * 100, 2)}%") + + return history, model + + +""" +## Train and evaluate model +""" + +conv_mixer_model = get_conv_mixer_256_8() +history, conv_mixer_model = run_experiment(conv_mixer_model) + +""" +The gap in training and validation performance can be mitigated by using additional +regularization techniques. Nevertheless, being able to get to ~83% accuracy within 10 +epochs with 0.8 million parameters is a strong result. +""" + +""" +## Visualizing the internals of ConvMixer + +We can visualize the patch embeddings and the learned convolution filters. Recall +that each patch embedding and intermediate feature map have the same number of channels +(256 in this case). This will make our visualization utility easier to implement. +""" + +# Code reference: https://bit.ly/3awIRbP. 
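+# The helper below applies a single min-max normalization to the whole weight
+# tensor and then draws its 256 filters on a 16x16 grid of subplots.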
+ + +def visualization_plot(weights, idx=1): + # First, apply min-max normalization to the + # given weights to avoid isotrophic scaling. + p_min, p_max = weights.min(), weights.max() + weights = (weights - p_min) / (p_max - p_min) + + # Visualize all the filters. + num_filters = 256 + plt.figure(figsize=(8, 8)) + + for i in range(num_filters): + current_weight = weights[:, :, :, i] + if current_weight.shape[-1] == 1: + current_weight = current_weight.squeeze() + ax = plt.subplot(16, 16, idx) + ax.set_xticks([]) + ax.set_yticks([]) + plt.imshow(current_weight) + idx += 1 + + +# We first visualize the learned patch embeddings. +patch_embeddings = conv_mixer_model.layers[2].get_weights()[0] +visualization_plot(patch_embeddings) + +""" +Even though we did not train the network to convergence, we can notice that different +patches show different patterns. Some share similarity with others while some are very +different. These visualizations are more salient with larger image sizes. + +Similarly, we can visualize the raw convolution kernels. This can help us understand +the patterns to which a given kernel is receptive. +""" + +# First, print the indices of the convolution layers that are not +# pointwise convolutions. +for i, layer in enumerate(conv_mixer_model.layers): + if isinstance(layer, layers.DepthwiseConv2D): + if layer.get_config()["kernel_size"] == (5, 5): + print(i, layer) + +idx = 26 # Taking a kernel from the middle of the network. + +kernel = conv_mixer_model.layers[idx].get_weights()[0] +kernel = np.expand_dims(kernel.squeeze(), axis=2) +visualization_plot(kernel) + +""" +We see that different filters in the kernel have different locality spans, and this +pattern +is likely to evolve with more training. +""" + +""" +## Final notes + +There's been a recent trend on fusing convolutions with other data-agnostic operations +like self-attention. Following works are along this line of research: + +* ConViT ([d'Ascoli et al.](https://arxiv.org/abs/2103.10697)) +* CCT ([Hassani et al.](https://arxiv.org/abs/2104.05704)) +* CoAtNet ([Dai et al.](https://arxiv.org/abs/2106.04803)) +""" diff --git a/knowledge_base/vision/cutmix.py b/knowledge_base/vision/cutmix.py new file mode 100644 index 0000000000000000000000000000000000000000..5a27c0c7a997ff8c2a986c8bd5b4209a691b5080 --- /dev/null +++ b/knowledge_base/vision/cutmix.py @@ -0,0 +1,387 @@ +""" +Title: CutMix data augmentation for image classification +Author: [Sayan Nath](https://twitter.com/sayannath2350) +Date created: 2021/06/08 +Last modified: 2023/11/14 +Description: Data augmentation with CutMix for image classification on CIFAR-10. +Accelerator: GPU +Converted to Keras 3 By: [Piyush Thakur](https://github.com/cosmo3769) +""" + +""" +## Introduction +""" + +""" +_CutMix_ is a data augmentation technique that addresses the issue of information loss +and inefficiency present in regional dropout strategies. +Instead of removing pixels and filling them with black or grey pixels or Gaussian noise, +you replace the removed regions with a patch from another image, +while the ground truth labels are mixed proportionally to the number of pixels of combined images. 
+CutMix was proposed in +[CutMix: Regularization Strategy to Train Strong Classifiers with Localizable Features](https://arxiv.org/abs/1905.04899) +(Yun et al., 2019) + +It's implemented via the following formulas: + + + +where `M` is the binary mask which indicates the cutout and the fill-in +regions from the two randomly drawn images and `ฮป` (in `[0, 1]`) is drawn from a +[`Beta(ฮฑ, ฮฑ)` distribution](https://en.wikipedia.org/wiki/Beta_distribution) + +The coordinates of bounding boxes are: + + + +which indicates the cutout and fill-in regions in case of the images. +The bounding box sampling is represented by: + + + +where `rx, ry` are randomly drawn from a uniform distribution with upper bound. +""" + +""" +## Setup +""" + +import numpy as np +import keras +import matplotlib.pyplot as plt + +from keras import layers + +# TF imports related to tf.data preprocessing +from tensorflow import clip_by_value +from tensorflow import data as tf_data +from tensorflow import image as tf_image +from tensorflow import random as tf_random + +keras.utils.set_random_seed(42) + +""" +## Load the CIFAR-10 dataset + +In this example, we will use the +[CIFAR-10 image classification dataset](https://www.cs.toronto.edu/~kriz/cifar.html). +""" + +(x_train, y_train), (x_test, y_test) = keras.datasets.cifar10.load_data() +y_train = keras.utils.to_categorical(y_train, num_classes=10) +y_test = keras.utils.to_categorical(y_test, num_classes=10) + +print(x_train.shape) +print(y_train.shape) +print(x_test.shape) +print(y_test.shape) + +class_names = [ + "Airplane", + "Automobile", + "Bird", + "Cat", + "Deer", + "Dog", + "Frog", + "Horse", + "Ship", + "Truck", +] + +""" +## Define hyperparameters +""" + +AUTO = tf_data.AUTOTUNE +BATCH_SIZE = 32 +IMG_SIZE = 32 + +""" +## Define the image preprocessing function +""" + + +def preprocess_image(image, label): + image = tf_image.resize(image, (IMG_SIZE, IMG_SIZE)) + image = tf_image.convert_image_dtype(image, "float32") / 255.0 + label = keras.ops.cast(label, dtype="float32") + return image, label + + +""" +## Convert the data into TensorFlow `Dataset` objects +""" + +train_ds_one = ( + tf_data.Dataset.from_tensor_slices((x_train, y_train)) + .shuffle(1024) + .map(preprocess_image, num_parallel_calls=AUTO) +) +train_ds_two = ( + tf_data.Dataset.from_tensor_slices((x_train, y_train)) + .shuffle(1024) + .map(preprocess_image, num_parallel_calls=AUTO) +) + +train_ds_simple = tf_data.Dataset.from_tensor_slices((x_train, y_train)) + +test_ds = tf_data.Dataset.from_tensor_slices((x_test, y_test)) + +train_ds_simple = ( + train_ds_simple.map(preprocess_image, num_parallel_calls=AUTO) + .batch(BATCH_SIZE) + .prefetch(AUTO) +) + +# Combine two shuffled datasets from the same training data. +train_ds = tf_data.Dataset.zip((train_ds_one, train_ds_two)) + +test_ds = ( + test_ds.map(preprocess_image, num_parallel_calls=AUTO) + .batch(BATCH_SIZE) + .prefetch(AUTO) +) + +""" +## Define the CutMix data augmentation function + +The CutMix function takes two `image` and `label` pairs to perform the augmentation. +It samples `ฮป(l)` from the [Beta distribution](https://en.wikipedia.org/wiki/Beta_distribution) +and returns a bounding box from `get_box` function. We then crop the second image (`image2`) +and pad this image in the final padded image at the same location. 
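+
+Note that there is no direct Beta sampler in `tf.random`, so the
+`sample_beta_distribution` utility below draws two Gamma samples and relies on
+the fact that if `X ~ Gamma(a, 1)` and `Y ~ Gamma(b, 1)`, then
+`X / (X + Y) ~ Beta(a, b)`. As a quick, optional sanity check, you could run:
+
+```
+samples = sample_beta_distribution(10000)
+print(float(keras.ops.mean(samples)))  # close to 0.5, the mean of Beta(0.2, 0.2)
+```
+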
+""" + + +def sample_beta_distribution(size, concentration_0=0.2, concentration_1=0.2): + gamma_1_sample = tf_random.gamma(shape=[size], alpha=concentration_1) + gamma_2_sample = tf_random.gamma(shape=[size], alpha=concentration_0) + return gamma_1_sample / (gamma_1_sample + gamma_2_sample) + + +def get_box(lambda_value): + cut_rat = keras.ops.sqrt(1.0 - lambda_value) + + cut_w = IMG_SIZE * cut_rat # rw + cut_w = keras.ops.cast(cut_w, "int32") + + cut_h = IMG_SIZE * cut_rat # rh + cut_h = keras.ops.cast(cut_h, "int32") + + cut_x = keras.random.uniform((1,), minval=0, maxval=IMG_SIZE) # rx + cut_x = keras.ops.cast(cut_x, "int32") + cut_y = keras.random.uniform((1,), minval=0, maxval=IMG_SIZE) # ry + cut_y = keras.ops.cast(cut_y, "int32") + + boundaryx1 = clip_by_value(cut_x[0] - cut_w // 2, 0, IMG_SIZE) + boundaryy1 = clip_by_value(cut_y[0] - cut_h // 2, 0, IMG_SIZE) + bbx2 = clip_by_value(cut_x[0] + cut_w // 2, 0, IMG_SIZE) + bby2 = clip_by_value(cut_y[0] + cut_h // 2, 0, IMG_SIZE) + + target_h = bby2 - boundaryy1 + if target_h == 0: + target_h += 1 + + target_w = bbx2 - boundaryx1 + if target_w == 0: + target_w += 1 + + return boundaryx1, boundaryy1, target_h, target_w + + +def cutmix(train_ds_one, train_ds_two): + (image1, label1), (image2, label2) = train_ds_one, train_ds_two + + alpha = [0.25] + beta = [0.25] + + # Get a sample from the Beta distribution + lambda_value = sample_beta_distribution(1, alpha, beta) + + # Define Lambda + lambda_value = lambda_value[0][0] + + # Get the bounding box offsets, heights and widths + boundaryx1, boundaryy1, target_h, target_w = get_box(lambda_value) + + # Get a patch from the second image (`image2`) + crop2 = tf_image.crop_to_bounding_box( + image2, boundaryy1, boundaryx1, target_h, target_w + ) + # Pad the `image2` patch (`crop2`) with the same offset + image2 = tf_image.pad_to_bounding_box( + crop2, boundaryy1, boundaryx1, IMG_SIZE, IMG_SIZE + ) + # Get a patch from the first image (`image1`) + crop1 = tf_image.crop_to_bounding_box( + image1, boundaryy1, boundaryx1, target_h, target_w + ) + # Pad the `image1` patch (`crop1`) with the same offset + img1 = tf_image.pad_to_bounding_box( + crop1, boundaryy1, boundaryx1, IMG_SIZE, IMG_SIZE + ) + + # Modify the first image by subtracting the patch from `image1` + # (before applying the `image2` patch) + image1 = image1 - img1 + # Add the modified `image1` and `image2` together to get the CutMix image + image = image1 + image2 + + # Adjust Lambda in accordance to the pixel ration + lambda_value = 1 - (target_w * target_h) / (IMG_SIZE * IMG_SIZE) + lambda_value = keras.ops.cast(lambda_value, "float32") + + # Combine the labels of both images + label = lambda_value * label1 + (1 - lambda_value) * label2 + return image, label + + +""" +**Note**: we are combining two images to create a single one. 
+ +## Visualize the new dataset after applying the CutMix augmentation +""" + +# Create the new dataset using our `cutmix` utility +train_ds_cmu = ( + train_ds.shuffle(1024) + .map(cutmix, num_parallel_calls=AUTO) + .batch(BATCH_SIZE) + .prefetch(AUTO) +) + +# Let's preview 9 samples from the dataset +image_batch, label_batch = next(iter(train_ds_cmu)) +plt.figure(figsize=(10, 10)) +for i in range(9): + ax = plt.subplot(3, 3, i + 1) + plt.title(class_names[np.argmax(label_batch[i])]) + plt.imshow(image_batch[i]) + plt.axis("off") + +""" +## Define a ResNet-20 model +""" + + +def resnet_layer( + inputs, + num_filters=16, + kernel_size=3, + strides=1, + activation="relu", + batch_normalization=True, + conv_first=True, +): + conv = layers.Conv2D( + num_filters, + kernel_size=kernel_size, + strides=strides, + padding="same", + kernel_initializer="he_normal", + kernel_regularizer=keras.regularizers.L2(1e-4), + ) + x = inputs + if conv_first: + x = conv(x) + if batch_normalization: + x = layers.BatchNormalization()(x) + if activation is not None: + x = layers.Activation(activation)(x) + else: + if batch_normalization: + x = layers.BatchNormalization()(x) + if activation is not None: + x = layers.Activation(activation)(x) + x = conv(x) + return x + + +def resnet_v20(input_shape, depth, num_classes=10): + if (depth - 2) % 6 != 0: + raise ValueError("depth should be 6n+2 (eg 20, 32, 44 in [a])") + # Start model definition. + num_filters = 16 + num_res_blocks = int((depth - 2) / 6) + + inputs = layers.Input(shape=input_shape) + x = resnet_layer(inputs=inputs) + # Instantiate the stack of residual units + for stack in range(3): + for res_block in range(num_res_blocks): + strides = 1 + if stack > 0 and res_block == 0: # first layer but not first stack + strides = 2 # downsample + y = resnet_layer(inputs=x, num_filters=num_filters, strides=strides) + y = resnet_layer(inputs=y, num_filters=num_filters, activation=None) + if stack > 0 and res_block == 0: # first layer but not first stack + # linear projection residual shortcut connection to match + # changed dims + x = resnet_layer( + inputs=x, + num_filters=num_filters, + kernel_size=1, + strides=strides, + activation=None, + batch_normalization=False, + ) + x = layers.add([x, y]) + x = layers.Activation("relu")(x) + num_filters *= 2 + + # Add classifier on top. + # v1 does not use BN after last shortcut connection-ReLU + x = layers.AveragePooling2D(pool_size=8)(x) + y = layers.Flatten()(x) + outputs = layers.Dense( + num_classes, activation="softmax", kernel_initializer="he_normal" + )(y) + + # Instantiate model. 
+ model = keras.Model(inputs=inputs, outputs=outputs) + return model + + +def training_model(): + return resnet_v20((32, 32, 3), 20) + + +initial_model = training_model() +initial_model.save_weights("initial_weights.weights.h5") + +""" +## Train the model with the dataset augmented by CutMix +""" + +model = training_model() +model.load_weights("initial_weights.weights.h5") + +model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"]) +model.fit(train_ds_cmu, validation_data=test_ds, epochs=15) + +test_loss, test_accuracy = model.evaluate(test_ds) +print("Test accuracy: {:.2f}%".format(test_accuracy * 100)) + +""" +## Train the model using the original non-augmented dataset +""" + +model = training_model() +model.load_weights("initial_weights.weights.h5") +model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"]) +model.fit(train_ds_simple, validation_data=test_ds, epochs=15) + +test_loss, test_accuracy = model.evaluate(test_ds) +print("Test accuracy: {:.2f}%".format(test_accuracy * 100)) + +""" +## Notes + +In this example, we trained our model for 15 epochs. +In our experiment, the model with CutMix achieves a better accuracy on the CIFAR-10 dataset +(77.34% in our experiment) compared to the model that doesn't use the augmentation (66.90%). +You may notice it takes less time to train the model with the CutMix augmentation. + +You can experiment further with the CutMix technique by following the +[original paper](https://arxiv.org/abs/1905.04899). +""" diff --git a/knowledge_base/vision/deeplabv3_plus.py b/knowledge_base/vision/deeplabv3_plus.py new file mode 100644 index 0000000000000000000000000000000000000000..894f2ee22a70d2945f3db59046a490953ca3aaa7 --- /dev/null +++ b/knowledge_base/vision/deeplabv3_plus.py @@ -0,0 +1,331 @@ +""" +Title: Multiclass semantic segmentation using DeepLabV3+ +Author: [Soumik Rakshit](http://github.com/soumik12345) +Date created: 2021/08/31 +Last modified: 2024/01/05 +Description: Implement DeepLabV3+ architecture for Multi-class Semantic Segmentation. +Accelerator: GPU +Converted to Keras 3: [Muhammad Anas Raza](https://anasrz.com) +""" + +""" +## Introduction + +Semantic segmentation, with the goal to assign semantic labels to every pixel in an image, +is an essential computer vision task. In this example, we implement +the **DeepLabV3+** model for multi-class semantic segmentation, a fully-convolutional +architecture that performs well on semantic segmentation benchmarks. + +### References: + +- [Encoder-Decoder with Atrous Separable Convolution for Semantic Image Segmentation](https://arxiv.org/abs/1802.02611) +- [Rethinking Atrous Convolution for Semantic Image Segmentation](https://arxiv.org/abs/1706.05587) +- [DeepLab: Semantic Image Segmentation with Deep Convolutional Nets, Atrous Convolution, and Fully Connected CRFs](https://arxiv.org/abs/1606.00915) +""" + +""" +## Downloading the data + +We will use the [Crowd Instance-level Human Parsing Dataset](https://arxiv.org/abs/1811.12596) +for training our model. The Crowd Instance-level Human Parsing (CIHP) dataset has 38,280 diverse human images. +Each image in CIHP is labeled with pixel-wise annotations for 20 categories, as well as instance-level identification. +This dataset can be used for the "human part segmentation" task. 
+""" + + +import keras +from keras import layers +from keras import ops + +import os +import numpy as np +from glob import glob +import cv2 +from scipy.io import loadmat +import matplotlib.pyplot as plt + +# For data preprocessing +from tensorflow import image as tf_image +from tensorflow import data as tf_data +from tensorflow import io as tf_io + +"""shell +gdown "1B9A9UCJYMwTL4oBEo4RZfbMZMaZhKJaz&confirm=t" +unzip -q instance-level-human-parsing.zip +""" + +""" +## Creating a TensorFlow Dataset + +Training on the entire CIHP dataset with 38,280 images takes a lot of time, hence we will be using +a smaller subset of 200 images for training our model in this example. +""" + +IMAGE_SIZE = 512 +BATCH_SIZE = 4 +NUM_CLASSES = 20 +DATA_DIR = "./instance-level_human_parsing/instance-level_human_parsing/Training" +NUM_TRAIN_IMAGES = 1000 +NUM_VAL_IMAGES = 50 + +train_images = sorted(glob(os.path.join(DATA_DIR, "Images/*")))[:NUM_TRAIN_IMAGES] +train_masks = sorted(glob(os.path.join(DATA_DIR, "Category_ids/*")))[:NUM_TRAIN_IMAGES] +val_images = sorted(glob(os.path.join(DATA_DIR, "Images/*")))[ + NUM_TRAIN_IMAGES : NUM_VAL_IMAGES + NUM_TRAIN_IMAGES +] +val_masks = sorted(glob(os.path.join(DATA_DIR, "Category_ids/*")))[ + NUM_TRAIN_IMAGES : NUM_VAL_IMAGES + NUM_TRAIN_IMAGES +] + + +def read_image(image_path, mask=False): + image = tf_io.read_file(image_path) + if mask: + image = tf_image.decode_png(image, channels=1) + image.set_shape([None, None, 1]) + image = tf_image.resize(images=image, size=[IMAGE_SIZE, IMAGE_SIZE]) + else: + image = tf_image.decode_png(image, channels=3) + image.set_shape([None, None, 3]) + image = tf_image.resize(images=image, size=[IMAGE_SIZE, IMAGE_SIZE]) + return image + + +def load_data(image_list, mask_list): + image = read_image(image_list) + mask = read_image(mask_list, mask=True) + return image, mask + + +def data_generator(image_list, mask_list): + dataset = tf_data.Dataset.from_tensor_slices((image_list, mask_list)) + dataset = dataset.map(load_data, num_parallel_calls=tf_data.AUTOTUNE) + dataset = dataset.batch(BATCH_SIZE, drop_remainder=True) + return dataset + + +train_dataset = data_generator(train_images, train_masks) +val_dataset = data_generator(val_images, val_masks) + +print("Train Dataset:", train_dataset) +print("Val Dataset:", val_dataset) + +""" +## Building the DeepLabV3+ model + +DeepLabv3+ extends DeepLabv3 by adding an encoder-decoder structure. The encoder module +processes multiscale contextual information by applying dilated convolution at multiple +scales, while the decoder module refines the segmentation results along object boundaries. + +![](https://github.com/lattice-ai/DeepLabV3-Plus/raw/master/assets/deeplabv3_plus_diagram.png) + +**Dilated convolution:** With dilated convolution, as we go deeper in the network, we can keep the +stride constant but with larger field-of-view without increasing the number of parameters +or the amount of computation. Besides, it enables larger output feature maps, which is +useful for semantic segmentation. + +The reason for using **Dilated Spatial Pyramid Pooling** is that it was shown that as the +sampling rate becomes larger, the number of valid filter weights (i.e., weights that +are applied to the valid feature region, instead of padded zeros) becomes smaller. 
+""" + + +def convolution_block( + block_input, + num_filters=256, + kernel_size=3, + dilation_rate=1, + use_bias=False, +): + x = layers.Conv2D( + num_filters, + kernel_size=kernel_size, + dilation_rate=dilation_rate, + padding="same", + use_bias=use_bias, + kernel_initializer=keras.initializers.HeNormal(), + )(block_input) + x = layers.BatchNormalization()(x) + return ops.nn.relu(x) + + +def DilatedSpatialPyramidPooling(dspp_input): + dims = dspp_input.shape + x = layers.AveragePooling2D(pool_size=(dims[-3], dims[-2]))(dspp_input) + x = convolution_block(x, kernel_size=1, use_bias=True) + out_pool = layers.UpSampling2D( + size=(dims[-3] // x.shape[1], dims[-2] // x.shape[2]), + interpolation="bilinear", + )(x) + + out_1 = convolution_block(dspp_input, kernel_size=1, dilation_rate=1) + out_6 = convolution_block(dspp_input, kernel_size=3, dilation_rate=6) + out_12 = convolution_block(dspp_input, kernel_size=3, dilation_rate=12) + out_18 = convolution_block(dspp_input, kernel_size=3, dilation_rate=18) + + x = layers.Concatenate(axis=-1)([out_pool, out_1, out_6, out_12, out_18]) + output = convolution_block(x, kernel_size=1) + return output + + +""" +The encoder features are first bilinearly upsampled by a factor 4, and then +concatenated with the corresponding low-level features from the network backbone that +have the same spatial resolution. For this example, we +use a ResNet50 pretrained on ImageNet as the backbone model, and we use +the low-level features from the `conv4_block6_2_relu` block of the backbone. +""" + + +def DeeplabV3Plus(image_size, num_classes): + model_input = keras.Input(shape=(image_size, image_size, 3)) + preprocessed = keras.applications.resnet50.preprocess_input(model_input) + resnet50 = keras.applications.ResNet50( + weights="imagenet", include_top=False, input_tensor=preprocessed + ) + x = resnet50.get_layer("conv4_block6_2_relu").output + x = DilatedSpatialPyramidPooling(x) + + input_a = layers.UpSampling2D( + size=(image_size // 4 // x.shape[1], image_size // 4 // x.shape[2]), + interpolation="bilinear", + )(x) + input_b = resnet50.get_layer("conv2_block3_2_relu").output + input_b = convolution_block(input_b, num_filters=48, kernel_size=1) + + x = layers.Concatenate(axis=-1)([input_a, input_b]) + x = convolution_block(x) + x = convolution_block(x) + x = layers.UpSampling2D( + size=(image_size // x.shape[1], image_size // x.shape[2]), + interpolation="bilinear", + )(x) + model_output = layers.Conv2D(num_classes, kernel_size=(1, 1), padding="same")(x) + return keras.Model(inputs=model_input, outputs=model_output) + + +model = DeeplabV3Plus(image_size=IMAGE_SIZE, num_classes=NUM_CLASSES) +model.summary() + +""" +## Training + +We train the model using sparse categorical crossentropy as the loss function, and +Adam as the optimizer. 
+""" + +loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True) +model.compile( + optimizer=keras.optimizers.Adam(learning_rate=0.001), + loss=loss, + metrics=["accuracy"], +) + +history = model.fit(train_dataset, validation_data=val_dataset, epochs=25) + +plt.plot(history.history["loss"]) +plt.title("Training Loss") +plt.ylabel("loss") +plt.xlabel("epoch") +plt.show() + +plt.plot(history.history["accuracy"]) +plt.title("Training Accuracy") +plt.ylabel("accuracy") +plt.xlabel("epoch") +plt.show() + +plt.plot(history.history["val_loss"]) +plt.title("Validation Loss") +plt.ylabel("val_loss") +plt.xlabel("epoch") +plt.show() + +plt.plot(history.history["val_accuracy"]) +plt.title("Validation Accuracy") +plt.ylabel("val_accuracy") +plt.xlabel("epoch") +plt.show() + +""" +## Inference using Colormap Overlay + +The raw predictions from the model represent a one-hot encoded tensor of shape `(N, 512, 512, 20)` +where each one of the 20 channels is a binary mask corresponding to a predicted label. +In order to visualize the results, we plot them as RGB segmentation masks where each pixel +is represented by a unique color corresponding to the particular label predicted. We can easily +find the color corresponding to each label from the `human_colormap.mat` file provided as part +of the dataset. We would also plot an overlay of the RGB segmentation mask on the input image as +this further helps us to identify the different categories present in the image more intuitively. +""" + +# Loading the Colormap +colormap = loadmat( + "./instance-level_human_parsing/instance-level_human_parsing/human_colormap.mat" +)["colormap"] +colormap = colormap * 100 +colormap = colormap.astype(np.uint8) + + +def infer(model, image_tensor): + predictions = model.predict(np.expand_dims((image_tensor), axis=0)) + predictions = np.squeeze(predictions) + predictions = np.argmax(predictions, axis=2) + return predictions + + +def decode_segmentation_masks(mask, colormap, n_classes): + r = np.zeros_like(mask).astype(np.uint8) + g = np.zeros_like(mask).astype(np.uint8) + b = np.zeros_like(mask).astype(np.uint8) + for l in range(0, n_classes): + idx = mask == l + r[idx] = colormap[l, 0] + g[idx] = colormap[l, 1] + b[idx] = colormap[l, 2] + rgb = np.stack([r, g, b], axis=2) + return rgb + + +def get_overlay(image, colored_mask): + image = keras.utils.array_to_img(image) + image = np.array(image).astype(np.uint8) + overlay = cv2.addWeighted(image, 0.35, colored_mask, 0.65, 0) + return overlay + + +def plot_samples_matplotlib(display_list, figsize=(5, 3)): + _, axes = plt.subplots(nrows=1, ncols=len(display_list), figsize=figsize) + for i in range(len(display_list)): + if display_list[i].shape[-1] == 3: + axes[i].imshow(keras.utils.array_to_img(display_list[i])) + else: + axes[i].imshow(display_list[i]) + plt.show() + + +def plot_predictions(images_list, colormap, model): + for image_file in images_list: + image_tensor = read_image(image_file) + prediction_mask = infer(image_tensor=image_tensor, model=model) + prediction_colormap = decode_segmentation_masks(prediction_mask, colormap, 20) + overlay = get_overlay(image_tensor, prediction_colormap) + plot_samples_matplotlib( + [image_tensor, overlay, prediction_colormap], figsize=(18, 14) + ) + + +""" +### Inference on Train Images +""" + +plot_predictions(train_images[:4], colormap, model=model) + +""" +### Inference on Validation Images + +You can use the trained model hosted on [Hugging Face Hub](https://huggingface.co/keras-io/deeplabv3p-resnet50) +and try the demo on 
[Hugging Face Spaces](https://huggingface.co/spaces/keras-io/Human-Part-Segmentation). +""" + +plot_predictions(val_images[:4], colormap, model=model) diff --git a/knowledge_base/vision/deit.py b/knowledge_base/vision/deit.py new file mode 100644 index 0000000000000000000000000000000000000000..47529a38e6b5a2b229eecef8478723c20e8a0e76 --- /dev/null +++ b/knowledge_base/vision/deit.py @@ -0,0 +1,618 @@ +""" +Title: Distilling Vision Transformers +Author: [Sayak Paul](https://twitter.com/RisingSayak) +Date created: 2022/04/05 +Last modified: 2022/04/08 +Description: Distillation of Vision Transformers through attention. +Accelerator: GPU +""" + +""" +## Introduction + +In the original *Vision Transformers* (ViT) paper +([Dosovitskiy et al.](https://arxiv.org/abs/2010.11929)), +the authors concluded that to perform on par with Convolutional Neural Networks (CNNs), +ViTs need to be pre-trained on larger datasets. The larger the better. This is mainly +due to the lack of inductive biases in the ViT architecture -- unlike CNNs, +they don't have layers that exploit locality. In a follow-up paper +([Steiner et al.](https://arxiv.org/abs/2106.10270)), +the authors show that it is possible to substantially improve the performance of ViTs +with stronger regularization and longer training. + +Many groups have proposed different ways to deal with the problem +of data-intensiveness of ViT training. +One such way was shown in the *Data-efficient image Transformers*, +(DeiT) paper ([Touvron et al.](https://arxiv.org/abs/2012.12877)). The +authors introduced a distillation technique that is specific to transformer-based vision +models. DeiT is among the first works to show that it's possible to train ViTs well +without using larger datasets. + +In this example, we implement the distillation recipe proposed in DeiT. This +requires us to slightly tweak the original ViT architecture and write a custom training +loop to implement the distillation recipe. + +To run the example, you'll need TensorFlow Addons, which you can install with the +following command: + +``` +pip install tensorflow-addons +``` + +To comfortably navigate through this example, you'll be expected to know how a ViT and +knowledge distillation work. The following are good resources in case you needed a +refresher: + +* [ViT on keras.io](https://keras.io/examples/vision/image_classification_with_vision_transformer) +* [Knowledge distillation on keras.io](https://keras.io/examples/vision/knowledge_distillation/) +""" + +""" +## Imports +""" + +from typing import List + +import tensorflow as tf +import tensorflow_addons as tfa +import tensorflow_datasets as tfds +import tensorflow_hub as hub +from tensorflow import keras +from tensorflow.keras import layers + +tfds.disable_progress_bar() +tf.keras.utils.set_random_seed(42) + +""" +## Constants +""" + +# Model +MODEL_TYPE = "deit_distilled_tiny_patch16_224" +RESOLUTION = 224 +PATCH_SIZE = 16 +NUM_PATCHES = (RESOLUTION // PATCH_SIZE) ** 2 +LAYER_NORM_EPS = 1e-6 +PROJECTION_DIM = 192 +NUM_HEADS = 3 +NUM_LAYERS = 12 +MLP_UNITS = [ + PROJECTION_DIM * 4, + PROJECTION_DIM, +] +DROPOUT_RATE = 0.0 +DROP_PATH_RATE = 0.1 + +# Training +NUM_EPOCHS = 20 +BASE_LR = 0.0005 +WEIGHT_DECAY = 0.0001 + +# Data +BATCH_SIZE = 256 +AUTO = tf.data.AUTOTUNE +NUM_CLASSES = 5 + +""" +You probably noticed that `DROPOUT_RATE` has been set 0.0. Dropout has been used +in the implementation to keep it complete. 
For smaller models (like the one used in +this example), you don't need it, but for bigger models, using dropout helps. +""" + +""" +## Load the `tf_flowers` dataset and prepare preprocessing utilities + +The authors use an array of different augmentation techniques, including MixUp +([Zhang et al.](https://arxiv.org/abs/1710.09412)), +RandAugment ([Cubuk et al.](https://arxiv.org/abs/1909.13719)), +and so on. However, to keep the example simple to work through, we'll discard them. +""" + + +def preprocess_dataset(is_training=True): + def fn(image, label): + if is_training: + # Resize to a bigger spatial resolution and take the random + # crops. + image = tf.image.resize(image, (RESOLUTION + 20, RESOLUTION + 20)) + image = tf.image.random_crop(image, (RESOLUTION, RESOLUTION, 3)) + image = tf.image.random_flip_left_right(image) + else: + image = tf.image.resize(image, (RESOLUTION, RESOLUTION)) + label = tf.one_hot(label, depth=NUM_CLASSES) + return image, label + + return fn + + +def prepare_dataset(dataset, is_training=True): + if is_training: + dataset = dataset.shuffle(BATCH_SIZE * 10) + dataset = dataset.map(preprocess_dataset(is_training), num_parallel_calls=AUTO) + return dataset.batch(BATCH_SIZE).prefetch(AUTO) + + +train_dataset, val_dataset = tfds.load( + "tf_flowers", split=["train[:90%]", "train[90%:]"], as_supervised=True +) +num_train = train_dataset.cardinality() +num_val = val_dataset.cardinality() +print(f"Number of training examples: {num_train}") +print(f"Number of validation examples: {num_val}") + +train_dataset = prepare_dataset(train_dataset, is_training=True) +val_dataset = prepare_dataset(val_dataset, is_training=False) + +""" +## Implementing the DeiT variants of ViT + +Since DeiT is an extension of ViT it'd make sense to first implement ViT and then extend +it to support DeiT's components. + +First, we'll implement a layer for Stochastic Depth +([Huang et al.](https://arxiv.org/abs/1603.09382)) +which is used in DeiT for regularization. +""" + + +# Referred from: github.com:rwightman/pytorch-image-models. +class StochasticDepth(layers.Layer): + def __init__(self, drop_prop, **kwargs): + super().__init__(**kwargs) + self.drop_prob = drop_prop + + def call(self, x, training=True): + if training: + keep_prob = 1 - self.drop_prob + shape = (tf.shape(x)[0],) + (1,) * (len(tf.shape(x)) - 1) + random_tensor = keep_prob + tf.random.uniform(shape, 0, 1) + random_tensor = tf.floor(random_tensor) + return (x / keep_prob) * random_tensor + return x + + +""" +Now, we'll implement the MLP and Transformer blocks. +""" + + +def mlp(x, dropout_rate: float, hidden_units: List): + """FFN for a Transformer block.""" + # Iterate over the hidden units and + # add Dense => Dropout. + for idx, units in enumerate(hidden_units): + x = layers.Dense( + units, + activation=tf.nn.gelu if idx == 0 else None, + )(x) + x = layers.Dropout(dropout_rate)(x) + return x + + +def transformer(drop_prob: float, name: str) -> keras.Model: + """Transformer block with pre-norm.""" + num_patches = NUM_PATCHES + 2 if "distilled" in MODEL_TYPE else NUM_PATCHES + 1 + encoded_patches = layers.Input((num_patches, PROJECTION_DIM)) + + # Layer normalization 1. + x1 = layers.LayerNormalization(epsilon=LAYER_NORM_EPS)(encoded_patches) + + # Multi Head Self Attention layer 1. 
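+    # Pre-norm design: attention runs on the LayerNorm'ed tokens (`x1`), while the
+    # residual connection below adds its output back to the raw `encoded_patches`.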
+ attention_output = layers.MultiHeadAttention( + num_heads=NUM_HEADS, + key_dim=PROJECTION_DIM, + dropout=DROPOUT_RATE, + )(x1, x1) + attention_output = ( + StochasticDepth(drop_prob)(attention_output) if drop_prob else attention_output + ) + + # Skip connection 1. + x2 = layers.Add()([attention_output, encoded_patches]) + + # Layer normalization 2. + x3 = layers.LayerNormalization(epsilon=LAYER_NORM_EPS)(x2) + + # MLP layer 1. + x4 = mlp(x3, hidden_units=MLP_UNITS, dropout_rate=DROPOUT_RATE) + x4 = StochasticDepth(drop_prob)(x4) if drop_prob else x4 + + # Skip connection 2. + outputs = layers.Add()([x2, x4]) + + return keras.Model(encoded_patches, outputs, name=name) + + +""" +We'll now implement a `ViTClassifier` class building on top of the components we just +developed. Here we'll be following the original pooling strategy used in the ViT paper -- +use a class token and use the feature representations corresponding to it for +classification. +""" + + +class ViTClassifier(keras.Model): + """Vision Transformer base class.""" + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + # Patchify + linear projection + reshaping. + self.projection = keras.Sequential( + [ + layers.Conv2D( + filters=PROJECTION_DIM, + kernel_size=(PATCH_SIZE, PATCH_SIZE), + strides=(PATCH_SIZE, PATCH_SIZE), + padding="VALID", + name="conv_projection", + ), + layers.Reshape( + target_shape=(NUM_PATCHES, PROJECTION_DIM), + name="flatten_projection", + ), + ], + name="projection", + ) + + # Positional embedding. + init_shape = ( + 1, + NUM_PATCHES + 1, + PROJECTION_DIM, + ) + self.positional_embedding = tf.Variable( + tf.zeros(init_shape), name="pos_embedding" + ) + + # Transformer blocks. + dpr = [x for x in tf.linspace(0.0, DROP_PATH_RATE, NUM_LAYERS)] + self.transformer_blocks = [ + transformer(drop_prob=dpr[i], name=f"transformer_block_{i}") + for i in range(NUM_LAYERS) + ] + + # CLS token. + initial_value = tf.zeros((1, 1, PROJECTION_DIM)) + self.cls_token = tf.Variable( + initial_value=initial_value, trainable=True, name="cls" + ) + + # Other layers. + self.dropout = layers.Dropout(DROPOUT_RATE) + self.layer_norm = layers.LayerNormalization(epsilon=LAYER_NORM_EPS) + self.head = layers.Dense( + NUM_CLASSES, + name="classification_head", + ) + + def call(self, inputs, training=True): + n = tf.shape(inputs)[0] + + # Create patches and project the patches. + projected_patches = self.projection(inputs) + + # Append class token if needed. + cls_token = tf.tile(self.cls_token, (n, 1, 1)) + cls_token = tf.cast(cls_token, projected_patches.dtype) + projected_patches = tf.concat([cls_token, projected_patches], axis=1) + + # Add positional embeddings to the projected patches. + encoded_patches = ( + self.positional_embedding + projected_patches + ) # (B, number_patches, projection_dim) + encoded_patches = self.dropout(encoded_patches) + + # Iterate over the number of layers and stack up blocks of + # Transformer. + for transformer_module in self.transformer_blocks: + # Add a Transformer block. + encoded_patches = transformer_module(encoded_patches) + + # Final layer normalization. + representation = self.layer_norm(encoded_patches) + + # Pool representation. + encoded_patches = representation[:, 0] + + # Classification head. + output = self.head(encoded_patches) + return output + + +""" +This class can be used standalone as ViT and is end-to-end trainable. Just remove the +`distilled` phrase in `MODEL_TYPE` and it should work with `vit_tiny = ViTClassifier()`. +Let's now extend it to DeiT. 
The following figure presents the schematic of DeiT (taken +from the DeiT paper): + +![](https://i.imgur.com/5lmg2Xs.png) + +Apart from the class token, DeiT has another token for distillation. During distillation, +the logits corresponding to the class token are compared to the true labels, and the +logits corresponding to the distillation token are compared to the teacher's predictions. +""" + + +class ViTDistilled(ViTClassifier): + def __init__(self, regular_training=False, **kwargs): + super().__init__(**kwargs) + self.num_tokens = 2 + self.regular_training = regular_training + + # CLS and distillation tokens, positional embedding. + init_value = tf.zeros((1, 1, PROJECTION_DIM)) + self.dist_token = tf.Variable(init_value, name="dist_token") + self.positional_embedding = tf.Variable( + tf.zeros( + ( + 1, + NUM_PATCHES + self.num_tokens, + PROJECTION_DIM, + ) + ), + name="pos_embedding", + ) + + # Head layers. + self.head = layers.Dense( + NUM_CLASSES, + name="classification_head", + ) + self.head_dist = layers.Dense( + NUM_CLASSES, + name="distillation_head", + ) + + def call(self, inputs, training=True): + n = tf.shape(inputs)[0] + + # Create patches and project the patches. + projected_patches = self.projection(inputs) + + # Append the tokens. + cls_token = tf.tile(self.cls_token, (n, 1, 1)) + dist_token = tf.tile(self.dist_token, (n, 1, 1)) + cls_token = tf.cast(cls_token, projected_patches.dtype) + dist_token = tf.cast(dist_token, projected_patches.dtype) + projected_patches = tf.concat( + [cls_token, dist_token, projected_patches], axis=1 + ) + + # Add positional embeddings to the projected patches. + encoded_patches = ( + self.positional_embedding + projected_patches + ) # (B, number_patches, projection_dim) + encoded_patches = self.dropout(encoded_patches) + + # Iterate over the number of layers and stack up blocks of + # Transformer. + for transformer_module in self.transformer_blocks: + # Add a Transformer block. + encoded_patches = transformer_module(encoded_patches) + + # Final layer normalization. + representation = self.layer_norm(encoded_patches) + + # Classification heads. + x, x_dist = ( + self.head(representation[:, 0]), + self.head_dist(representation[:, 1]), + ) + + if not training or self.regular_training: + # During standard train / finetune, inference average the classifier + # predictions. + return (x + x_dist) / 2 + + elif training: + # Only return separate classification predictions when training in distilled + # mode. + return x, x_dist + + +""" +Let's verify if the `ViTDistilled` class can be initialized and called as expected. 
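+
+With `training=False`, the logits from the class-token head and the
+distillation-token head are averaged, so the call below should return a single
+tensor of shape `(2, NUM_CLASSES)`. Calling the model with `training=True`
+(and `regular_training=False`) instead returns the two heads separately:
+
+```
+cls_logits, dist_logits = deit_tiny_distilled(dummy_inputs, training=True)
+```
+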
+""" + +deit_tiny_distilled = ViTDistilled() + +dummy_inputs = tf.ones((2, 224, 224, 3)) +outputs = deit_tiny_distilled(dummy_inputs, training=False) +print(outputs.shape) + +""" +## Implementing the trainer + +Unlike what happens in standard knowledge distillation +([Hinton et al.](https://arxiv.org/abs/1503.02531)), +where a temperature-scaled softmax is used as well as KL divergence, +DeiT authors use the following loss function: + +![](https://i.imgur.com/bXdxsBq.png) + + +Here, + +* CE is cross-entropy +* `psi` is the softmax function +* Z_s denotes student predictions +* y denotes true labels +* y_t denotes teacher predictions +""" + + +class DeiT(keras.Model): + # Reference: + # https://keras.io/examples/vision/knowledge_distillation/ + def __init__(self, student, teacher, **kwargs): + super().__init__(**kwargs) + self.student = student + self.teacher = teacher + + self.student_loss_tracker = keras.metrics.Mean(name="student_loss") + self.dist_loss_tracker = keras.metrics.Mean(name="distillation_loss") + + @property + def metrics(self): + metrics = super().metrics + metrics.append(self.student_loss_tracker) + metrics.append(self.dist_loss_tracker) + return metrics + + def compile( + self, + optimizer, + metrics, + student_loss_fn, + distillation_loss_fn, + ): + super().compile(optimizer=optimizer, metrics=metrics) + self.student_loss_fn = student_loss_fn + self.distillation_loss_fn = distillation_loss_fn + + def train_step(self, data): + # Unpack data. + x, y = data + + # Forward pass of teacher + teacher_predictions = tf.nn.softmax(self.teacher(x, training=False), -1) + teacher_predictions = tf.argmax(teacher_predictions, -1) + + with tf.GradientTape() as tape: + # Forward pass of student. + cls_predictions, dist_predictions = self.student(x / 255.0, training=True) + + # Compute losses. + student_loss = self.student_loss_fn(y, cls_predictions) + distillation_loss = self.distillation_loss_fn( + teacher_predictions, dist_predictions + ) + loss = (student_loss + distillation_loss) / 2 + + # Compute gradients. + trainable_vars = self.student.trainable_variables + gradients = tape.gradient(loss, trainable_vars) + + # Update weights. + self.optimizer.apply_gradients(zip(gradients, trainable_vars)) + + # Update the metrics configured in `compile()`. + student_predictions = (cls_predictions + dist_predictions) / 2 + self.compiled_metrics.update_state(y, student_predictions) + self.dist_loss_tracker.update_state(distillation_loss) + self.student_loss_tracker.update_state(student_loss) + + # Return a dict of performance. + results = {m.name: m.result() for m in self.metrics} + return results + + def test_step(self, data): + # Unpack the data. + x, y = data + + # Compute predictions. + y_prediction = self.student(x / 255.0, training=False) + + # Calculate the loss. + student_loss = self.student_loss_fn(y, y_prediction) + + # Update the metrics. + self.compiled_metrics.update_state(y, y_prediction) + self.student_loss_tracker.update_state(student_loss) + + # Return a dict of performance. + results = {m.name: m.result() for m in self.metrics} + return results + + def call(self, inputs): + return self.student(inputs / 255.0, training=False) + + +""" +## Load the teacher model + +This model is based on the BiT family of ResNets +([Kolesnikov et al.](https://arxiv.org/abs/1912.11370)) +fine-tuned on the `tf_flowers` dataset. You can refer to +[this notebook](https://github.com/sayakpaul/deit-tf/blob/main/notebooks/bit-teacher.ipynb) +to know how the training was performed. 
The teacher model has about 212 Million parameters +which is about **40x more** than the student. +""" + +"""shell +wget -q https://github.com/sayakpaul/deit-tf/releases/download/v0.1.0/bit_teacher_flowers.zip +unzip -q bit_teacher_flowers.zip +""" + +bit_teacher_flowers = keras.models.load_model("bit_teacher_flowers") + +""" +## Training through distillation +""" + +deit_tiny = ViTDistilled() +deit_distiller = DeiT(student=deit_tiny, teacher=bit_teacher_flowers) + +lr_scaled = (BASE_LR / 512) * BATCH_SIZE +deit_distiller.compile( + optimizer=tfa.optimizers.AdamW(weight_decay=WEIGHT_DECAY, learning_rate=lr_scaled), + metrics=["accuracy"], + student_loss_fn=keras.losses.CategoricalCrossentropy( + from_logits=True, label_smoothing=0.1 + ), + distillation_loss_fn=keras.losses.SparseCategoricalCrossentropy(from_logits=True), +) +_ = deit_distiller.fit(train_dataset, validation_data=val_dataset, epochs=NUM_EPOCHS) + +""" +If we had trained the same model (the `ViTClassifier`) from scratch with the exact same +hyperparameters, the model would have scored about 59% accuracy. You can adapt the following code +to reproduce this result: + +``` +vit_tiny = ViTClassifier() + +inputs = keras.Input((RESOLUTION, RESOLUTION, 3)) +x = keras.layers.Rescaling(scale=1./255)(inputs) +outputs = deit_tiny(x) +model = keras.Model(inputs, outputs) + +model.compile(...) +model.fit(...) +``` +""" + +""" +## Notes + +* Through the use of distillation, we're effectively transferring the inductive biases of +a CNN-based teacher model. +* Interestingly enough, this distillation strategy works better with a CNN as the teacher +model rather than a Transformer as shown in the paper. +* The use of regularization to train DeiT models is very important. +* ViT models are initialized with a combination of different initializers including +truncated normal, random normal, Glorot uniform, etc. If you're looking for +end-to-end reproduction of the original results, don't forget to initialize the ViTs well. +* If you want to explore the pre-trained DeiT models in TensorFlow and Keras with code +for fine-tuning, [check out these models on TF-Hub](https://tfhub.dev/sayakpaul/collections/deit/1). + +## Acknowledgements + +* Ross Wightman for keeping +[`timm`](https://github.com/rwightman/pytorch-image-models) +updated with readable implementations. I referred to the implementations of ViT and DeiT +a lot during implementing them in TensorFlow. +* [Aritra Roy Gosthipaty](https://github.com/ariG23498) +who implemented some portions of the `ViTClassifier` in another project. +* [Google Developers Experts](https://developers.google.com/programs/experts/) +program for supporting me with GCP credits which were used to run experiments for this +example. 
+ +Example available on HuggingFace: + +| Trained Model | Demo | +| :--: | :--: | +| [![Generic badge](https://img.shields.io/badge/๐Ÿค—%20Model-DEIT-black.svg)](https://huggingface.co/keras-io/deit) | [![Generic badge](https://img.shields.io/badge/๐Ÿค—%20Spaces-DEIT-black.svg)](https://huggingface.co/spaces/keras-io/deit/) | + +""" diff --git a/knowledge_base/vision/depth_estimation.py b/knowledge_base/vision/depth_estimation.py new file mode 100644 index 0000000000000000000000000000000000000000..22acc36f09854c0a730471fcfde15ad7baadb263 --- /dev/null +++ b/knowledge_base/vision/depth_estimation.py @@ -0,0 +1,533 @@ +""" +Title: Monocular depth estimation +Author: [Victor Basu](https://www.linkedin.com/in/victor-basu-520958147) +Date created: 2021/08/30 +Last modified: 2024/08/13 +Description: Implement a depth estimation model with a convnet. +Accelerator: GPU +""" + +""" +## Introduction + +_Depth estimation_ is a crucial step towards inferring scene geometry from 2D images. +The goal in _monocular depth estimation_ is to predict the depth value of each pixel or +inferring depth information, given only a single RGB image as input. +This example will show an approach to build a depth estimation model with a convnet +and simple loss functions. + +![depth](https://paperswithcode.com/media/thumbnails/task/task-0000000605-d9849a91.jpg) + +""" + +""" +## Setup +""" + +import os + +os.environ["KERAS_BACKEND"] = "tensorflow" + +import sys + +import tensorflow as tf +import keras +from keras import layers +from keras import ops +import pandas as pd +import numpy as np +import cv2 +import matplotlib.pyplot as plt + +keras.utils.set_random_seed(123) + +""" +## Downloading the dataset + +We will be using the dataset **DIODE: A Dense Indoor and Outdoor Depth Dataset** for this +tutorial. However, we use the validation set generating training and evaluation subsets +for our model. The reason we use the validation set rather than the training set of the original dataset is because +the training set consists of 81GB of data, which is challenging to download compared +to the validation set which is only 2.6GB. +Other datasets that you could use are +**[NYU-v2](https://cs.nyu.edu/~silberman/datasets/nyu_depth_v2.html)** +and **[KITTI](http://www.cvlibs.net/datasets/kitti/)**. +""" + +annotation_folder = "/dataset/" +if not os.path.exists(os.path.abspath(".") + annotation_folder): + annotation_zip = keras.utils.get_file( + "val.tar.gz", + cache_subdir=os.path.abspath("."), + origin="http://diode-dataset.s3.amazonaws.com/val.tar.gz", + extract=True, + ) + +""" +## Preparing the dataset + +We only use the indoor images to train our depth estimation model. +""" + +path = "val/indoors" + +filelist = [] + +for root, dirs, files in os.walk(path): + for file in files: + filelist.append(os.path.join(root, file)) + +filelist.sort() +data = { + "image": [x for x in filelist if x.endswith(".png")], + "depth": [x for x in filelist if x.endswith("_depth.npy")], + "mask": [x for x in filelist if x.endswith("_depth_mask.npy")], +} +df = pd.DataFrame(data) + +df = df.sample(frac=1, random_state=42) + +""" +## Preparing hyperparameters +""" + +HEIGHT = 256 +WIDTH = 256 +LR = 0.00001 +EPOCHS = 30 +BATCH_SIZE = 32 + +""" +## Building a data pipeline + +1. The pipeline takes a dataframe containing the path for the RGB images, +as well as the depth and depth mask files. +2. It reads and resize the RGB images. +3. It reads the depth and depth mask files, process them to generate the depth map image and +resize it. +4. 
It returns the RGB images and the depth map images for a batch. +""" + + +class DataGenerator(keras.utils.PyDataset): + def __init__(self, data, batch_size=6, dim=(768, 1024), n_channels=3, shuffle=True): + super().__init__() + """ + Initialization + """ + self.data = data + self.indices = self.data.index.tolist() + self.dim = dim + self.n_channels = n_channels + self.batch_size = batch_size + self.shuffle = shuffle + self.min_depth = 0.1 + self.on_epoch_end() + + def __len__(self): + return int(np.ceil(len(self.data) / self.batch_size)) + + def __getitem__(self, index): + if (index + 1) * self.batch_size > len(self.indices): + self.batch_size = len(self.indices) - index * self.batch_size + # Generate one batch of data + # Generate indices of the batch + index = self.indices[index * self.batch_size : (index + 1) * self.batch_size] + # Find list of IDs + batch = [self.indices[k] for k in index] + x, y = self.data_generation(batch) + + return x, y + + def on_epoch_end(self): + """ + Updates indexes after each epoch + """ + self.index = np.arange(len(self.indices)) + if self.shuffle == True: + np.random.shuffle(self.index) + + def load(self, image_path, depth_map, mask): + """Load input and target image.""" + + image_ = cv2.imread(image_path) + image_ = cv2.cvtColor(image_, cv2.COLOR_BGR2RGB) + image_ = cv2.resize(image_, self.dim) + image_ = tf.image.convert_image_dtype(image_, tf.float32) + + depth_map = np.load(depth_map).squeeze() + + mask = np.load(mask) + mask = mask > 0 + + max_depth = min(300, np.percentile(depth_map, 99)) + depth_map = np.clip(depth_map, self.min_depth, max_depth) + depth_map = np.log(depth_map, where=mask) + + depth_map = np.ma.masked_where(~mask, depth_map) + + depth_map = np.clip(depth_map, 0.1, np.log(max_depth)) + depth_map = cv2.resize(depth_map, self.dim) + depth_map = np.expand_dims(depth_map, axis=2) + depth_map = tf.image.convert_image_dtype(depth_map, tf.float32) + + return image_, depth_map + + def data_generation(self, batch): + x = np.empty((self.batch_size, *self.dim, self.n_channels)) + y = np.empty((self.batch_size, *self.dim, 1)) + + for i, batch_id in enumerate(batch): + x[i,], y[i,] = self.load( + self.data["image"][batch_id], + self.data["depth"][batch_id], + self.data["mask"][batch_id], + ) + x, y = x.astype("float32"), y.astype("float32") + return x, y + + +""" +## Visualizing samples +""" + + +def visualize_depth_map(samples, test=False, model=None): + input, target = samples + cmap = plt.cm.jet + cmap.set_bad(color="black") + + if test: + pred = model.predict(input) + fig, ax = plt.subplots(6, 3, figsize=(50, 50)) + for i in range(6): + ax[i, 0].imshow((input[i].squeeze())) + ax[i, 1].imshow((target[i].squeeze()), cmap=cmap) + ax[i, 2].imshow((pred[i].squeeze()), cmap=cmap) + + else: + fig, ax = plt.subplots(6, 2, figsize=(50, 50)) + for i in range(6): + ax[i, 0].imshow((input[i].squeeze())) + ax[i, 1].imshow((target[i].squeeze()), cmap=cmap) + + +visualize_samples = next( + iter(DataGenerator(data=df, batch_size=6, dim=(HEIGHT, WIDTH))) +) +visualize_depth_map(visualize_samples) + +""" +## 3D point cloud visualization +""" + +depth_vis = np.flipud(visualize_samples[1][1].squeeze()) # target +img_vis = np.flipud(visualize_samples[0][1].squeeze()) # input + +fig = plt.figure(figsize=(15, 10)) +ax = plt.axes(projection="3d") + +STEP = 3 +for x in range(0, img_vis.shape[0], STEP): + for y in range(0, img_vis.shape[1], STEP): + ax.scatter( + [depth_vis[x, y]] * 3, + [y] * 3, + [x] * 3, + c=tuple(img_vis[x, y, :3] / 255), + s=3, + ) + 
ax.view_init(45, 135) + +""" +## Building the model + +1. The basic model is from U-Net. +2. Addditive skip-connections are implemented in the downscaling block. +""" + + +class DownscaleBlock(layers.Layer): + def __init__( + self, filters, kernel_size=(3, 3), padding="same", strides=1, **kwargs + ): + super().__init__(**kwargs) + self.convA = layers.Conv2D(filters, kernel_size, strides, padding) + self.convB = layers.Conv2D(filters, kernel_size, strides, padding) + self.reluA = layers.LeakyReLU(negative_slope=0.2) + self.reluB = layers.LeakyReLU(negative_slope=0.2) + self.bn2a = layers.BatchNormalization() + self.bn2b = layers.BatchNormalization() + + self.pool = layers.MaxPool2D((2, 2), (2, 2)) + + def call(self, input_tensor): + d = self.convA(input_tensor) + x = self.bn2a(d) + x = self.reluA(x) + + x = self.convB(x) + x = self.bn2b(x) + x = self.reluB(x) + + x += d + p = self.pool(x) + return x, p + + +class UpscaleBlock(layers.Layer): + def __init__( + self, filters, kernel_size=(3, 3), padding="same", strides=1, **kwargs + ): + super().__init__(**kwargs) + self.us = layers.UpSampling2D((2, 2)) + self.convA = layers.Conv2D(filters, kernel_size, strides, padding) + self.convB = layers.Conv2D(filters, kernel_size, strides, padding) + self.reluA = layers.LeakyReLU(negative_slope=0.2) + self.reluB = layers.LeakyReLU(negative_slope=0.2) + self.bn2a = layers.BatchNormalization() + self.bn2b = layers.BatchNormalization() + self.conc = layers.Concatenate() + + def call(self, x, skip): + x = self.us(x) + concat = self.conc([x, skip]) + x = self.convA(concat) + x = self.bn2a(x) + x = self.reluA(x) + + x = self.convB(x) + x = self.bn2b(x) + x = self.reluB(x) + + return x + + +class BottleNeckBlock(layers.Layer): + def __init__( + self, filters, kernel_size=(3, 3), padding="same", strides=1, **kwargs + ): + super().__init__(**kwargs) + self.convA = layers.Conv2D(filters, kernel_size, strides, padding) + self.convB = layers.Conv2D(filters, kernel_size, strides, padding) + self.reluA = layers.LeakyReLU(negative_slope=0.2) + self.reluB = layers.LeakyReLU(negative_slope=0.2) + + def call(self, x): + x = self.convA(x) + x = self.reluA(x) + x = self.convB(x) + x = self.reluB(x) + return x + + +""" +## Defining the loss + +We will optimize 3 losses in our mode. +1. Structural similarity index(SSIM). +2. L1-loss, or Point-wise depth in our case. +3. Depth smoothness loss. + +Out of the three loss functions, SSIM contributes the most to improving model performance. +""" + + +def image_gradients(image): + if len(ops.shape(image)) != 4: + raise ValueError( + "image_gradients expects a 4D tensor " + "[batch_size, h, w, d], not {}.".format(ops.shape(image)) + ) + + image_shape = ops.shape(image) + batch_size, height, width, depth = ops.unstack(image_shape) + + dy = image[:, 1:, :, :] - image[:, :-1, :, :] + dx = image[:, :, 1:, :] - image[:, :, :-1, :] + + # Return tensors with same size as original image by concatenating + # zeros. Place the gradient [I(x+1,y) - I(x,y)] on the base pixel (x, y). 
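+    # At this point `dy` has shape (batch, height - 1, width, channels) and `dx`
+    # has shape (batch, height, width - 1, channels), so a row / column of zeros
+    # is appended below to restore the original image shape.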
+ shape = ops.stack([batch_size, 1, width, depth]) + dy = ops.concatenate([dy, ops.zeros(shape, dtype=image.dtype)], axis=1) + dy = ops.reshape(dy, image_shape) + + shape = ops.stack([batch_size, height, 1, depth]) + dx = ops.concatenate([dx, ops.zeros(shape, dtype=image.dtype)], axis=2) + dx = ops.reshape(dx, image_shape) + + return dy, dx + + +class DepthEstimationModel(keras.Model): + def __init__(self): + super().__init__() + self.ssim_loss_weight = 0.85 + self.l1_loss_weight = 0.1 + self.edge_loss_weight = 0.9 + self.loss_metric = keras.metrics.Mean(name="loss") + f = [16, 32, 64, 128, 256] + self.downscale_blocks = [ + DownscaleBlock(f[0]), + DownscaleBlock(f[1]), + DownscaleBlock(f[2]), + DownscaleBlock(f[3]), + ] + self.bottle_neck_block = BottleNeckBlock(f[4]) + self.upscale_blocks = [ + UpscaleBlock(f[3]), + UpscaleBlock(f[2]), + UpscaleBlock(f[1]), + UpscaleBlock(f[0]), + ] + self.conv_layer = layers.Conv2D(1, (1, 1), padding="same", activation="tanh") + + def calculate_loss(self, target, pred): + # Edges + dy_true, dx_true = image_gradients(target) + dy_pred, dx_pred = image_gradients(pred) + weights_x = ops.cast(ops.exp(ops.mean(ops.abs(dx_true))), "float32") + weights_y = ops.cast(ops.exp(ops.mean(ops.abs(dy_true))), "float32") + + # Depth smoothness + smoothness_x = dx_pred * weights_x + smoothness_y = dy_pred * weights_y + + depth_smoothness_loss = ops.mean(abs(smoothness_x)) + ops.mean( + abs(smoothness_y) + ) + + # Structural similarity (SSIM) index + ssim_loss = ops.mean( + 1 + - tf.image.ssim( + target, pred, max_val=WIDTH, filter_size=7, k1=0.01**2, k2=0.03**2 + ) + ) + # Point-wise depth + l1_loss = ops.mean(ops.abs(target - pred)) + + loss = ( + (self.ssim_loss_weight * ssim_loss) + + (self.l1_loss_weight * l1_loss) + + (self.edge_loss_weight * depth_smoothness_loss) + ) + + return loss + + @property + def metrics(self): + return [self.loss_metric] + + def train_step(self, batch_data): + input, target = batch_data + with tf.GradientTape() as tape: + pred = self(input, training=True) + loss = self.calculate_loss(target, pred) + + gradients = tape.gradient(loss, self.trainable_variables) + self.optimizer.apply_gradients(zip(gradients, self.trainable_variables)) + self.loss_metric.update_state(loss) + return { + "loss": self.loss_metric.result(), + } + + def test_step(self, batch_data): + input, target = batch_data + + pred = self(input, training=False) + loss = self.calculate_loss(target, pred) + + self.loss_metric.update_state(loss) + return { + "loss": self.loss_metric.result(), + } + + def call(self, x): + c1, p1 = self.downscale_blocks[0](x) + c2, p2 = self.downscale_blocks[1](p1) + c3, p3 = self.downscale_blocks[2](p2) + c4, p4 = self.downscale_blocks[3](p3) + + bn = self.bottle_neck_block(p4) + + u1 = self.upscale_blocks[0](bn, c4) + u2 = self.upscale_blocks[1](u1, c3) + u3 = self.upscale_blocks[2](u2, c2) + u4 = self.upscale_blocks[3](u3, c1) + + return self.conv_layer(u4) + + +""" +## Model training +""" + +optimizer = keras.optimizers.SGD( + learning_rate=LR, + nesterov=False, +) +model = DepthEstimationModel() +# Compile the model +model.compile(optimizer) + +train_loader = DataGenerator( + data=df[:260].reset_index(drop="true"), batch_size=BATCH_SIZE, dim=(HEIGHT, WIDTH) +) +validation_loader = DataGenerator( + data=df[260:].reset_index(drop="true"), batch_size=BATCH_SIZE, dim=(HEIGHT, WIDTH) +) +model.fit( + train_loader, + epochs=EPOCHS, + validation_data=validation_loader, +) + +""" +## Visualizing model output + +We visualize the model output over the 
validation set.
+In each row, the first image is the input RGB image, the second one is the ground truth
+depth map, and the third one is the predicted depth map.
+"""
+
+test_loader = next(
+    iter(
+        DataGenerator(
+            data=df[265:].reset_index(drop=True), batch_size=6, dim=(HEIGHT, WIDTH)
+        )
+    )
+)
+visualize_depth_map(test_loader, test=True, model=model)
+
+test_loader = next(
+    iter(
+        DataGenerator(
+            data=df[300:].reset_index(drop=True), batch_size=6, dim=(HEIGHT, WIDTH)
+        )
+    )
+)
+visualize_depth_map(test_loader, test=True, model=model)
+
+"""
+## Possible improvements
+
+1. You can improve this model by replacing the encoding part of the U-Net with a
+pretrained DenseNet or ResNet.
+2. Loss functions play an important role in solving this problem.
+Tuning the loss functions may yield significant improvement.
+"""
+
+"""
+## References
+
+The following papers go deeper into possible approaches for depth estimation.
+1. [Depth Prediction Without the Sensors: Leveraging Structure for Unsupervised Learning from Monocular Videos](https://arxiv.org/abs/1811.06152v1)
+2. [Digging Into Self-Supervised Monocular Depth Estimation](https://openaccess.thecvf.com/content_ICCV_2019/papers/Godard_Digging_Into_Self-Supervised_Monocular_Depth_Estimation_ICCV_2019_paper.pdf)
+3. [Deeper Depth Prediction with Fully Convolutional Residual Networks](https://arxiv.org/abs/1606.00373v2)
+
+You can also find helpful implementations on the Papers with Code page for the depth estimation task.
+
+You can use the trained model hosted on [Hugging Face Hub](https://huggingface.co/spaces/keras-io/Monocular-Depth-Estimation)
+and try the demo on [Hugging Face Spaces](https://huggingface.co/keras-io/monocular-depth-estimation).
+"""
diff --git a/knowledge_base/vision/eanet.py b/knowledge_base/vision/eanet.py
new file mode 100644
index 0000000000000000000000000000000000000000..45de6b317421d1204ed372f7aac3db4316325a2c
--- /dev/null
+++ b/knowledge_base/vision/eanet.py
@@ -0,0 +1,322 @@
+"""
+Title: Image classification with EANet (External Attention Transformer)
+Author: [ZhiYong Chang](https://github.com/czy00000)
+Date created: 2021/10/19
+Last modified: 2023/07/18
+Description: Image classification with a Transformer that leverages external attention.
+Accelerator: GPU
+Converted to Keras 3: [Muhammad Anas Raza](https://anasrz.com)
+"""
+
+"""
+## Introduction
+
+This example implements the [EANet](https://arxiv.org/abs/2105.02358)
+model for image classification, and demonstrates it on the CIFAR-100 dataset.
+EANet introduces a novel attention mechanism
+named ***external attention***, based on two external, small, learnable, and
+shared memories, which can be implemented easily by simply using two cascaded
+linear layers and two normalization layers. It conveniently replaces self-attention
+as used in existing architectures. External attention has linear complexity, as it only
+implicitly considers the correlations between all samples.
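+
+As a rough, self-contained sketch of the idea (NumPy only, with illustrative sizes; the
+actual Keras implementation appears later in this example), external attention replaces
+pairwise token-to-token comparison with two small shared memories `M_k` and `M_v`:
+
+```python
+import numpy as np
+
+N, d, S = 64, 32, 8  # number of tokens, embedding dim, memory size (illustrative values)
+x = np.random.rand(N, d)  # token features
+M_k, M_v = np.random.rand(S, d), np.random.rand(S, d)  # shared, learnable memories
+
+attn = np.exp(x @ M_k.T)  # (N, S) similarities to the first memory
+attn /= attn.sum(axis=0, keepdims=True)  # softmax over the token axis
+attn /= attn.sum(axis=1, keepdims=True)  # double normalization, as in the paper
+out = attn @ M_v  # (N, d) output; cost is O(d * S * N)
+```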
+""" + +""" +## Setup +""" + +import keras +from keras import layers +from keras import ops + +import matplotlib.pyplot as plt + + +""" +## Prepare the data +""" + +num_classes = 100 +input_shape = (32, 32, 3) + +(x_train, y_train), (x_test, y_test) = keras.datasets.cifar100.load_data() +y_train = keras.utils.to_categorical(y_train, num_classes) +y_test = keras.utils.to_categorical(y_test, num_classes) +print(f"x_train shape: {x_train.shape} - y_train shape: {y_train.shape}") +print(f"x_test shape: {x_test.shape} - y_test shape: {y_test.shape}") + +""" +## Configure the hyperparameters +""" + +weight_decay = 0.0001 +learning_rate = 0.001 +label_smoothing = 0.1 +validation_split = 0.2 +batch_size = 128 +num_epochs = 50 +patch_size = 2 # Size of the patches to be extracted from the input images. +num_patches = (input_shape[0] // patch_size) ** 2 # Number of patch +embedding_dim = 64 # Number of hidden units. +mlp_dim = 64 +dim_coefficient = 4 +num_heads = 4 +attention_dropout = 0.2 +projection_dropout = 0.2 +num_transformer_blocks = 8 # Number of repetitions of the transformer layer + +print(f"Patch size: {patch_size} X {patch_size} = {patch_size ** 2} ") +print(f"Patches per image: {num_patches}") + + +""" +## Use data augmentation +""" + +data_augmentation = keras.Sequential( + [ + layers.Normalization(), + layers.RandomFlip("horizontal"), + layers.RandomRotation(factor=0.1), + layers.RandomContrast(factor=0.1), + layers.RandomZoom(height_factor=0.2, width_factor=0.2), + ], + name="data_augmentation", +) +# Compute the mean and the variance of the training data for normalization. +data_augmentation.layers[0].adapt(x_train) + +""" +## Implement the patch extraction and encoding layer +""" + + +class PatchExtract(layers.Layer): + def __init__(self, patch_size, **kwargs): + super().__init__(**kwargs) + self.patch_size = patch_size + + def call(self, x): + B, C = ops.shape(x)[0], ops.shape(x)[-1] + x = ops.image.extract_patches(x, self.patch_size) + x = ops.reshape(x, (B, -1, self.patch_size * self.patch_size * C)) + return x + + +class PatchEmbedding(layers.Layer): + def __init__(self, num_patch, embed_dim, **kwargs): + super().__init__(**kwargs) + self.num_patch = num_patch + self.proj = layers.Dense(embed_dim) + self.pos_embed = layers.Embedding(input_dim=num_patch, output_dim=embed_dim) + + def call(self, patch): + pos = ops.arange(start=0, stop=self.num_patch, step=1) + return self.proj(patch) + self.pos_embed(pos) + + +""" +## Implement the external attention block +""" + + +def external_attention( + x, + dim, + num_heads, + dim_coefficient=4, + attention_dropout=0, + projection_dropout=0, +): + _, num_patch, channel = x.shape + assert dim % num_heads == 0 + num_heads = num_heads * dim_coefficient + + x = layers.Dense(dim * dim_coefficient)(x) + # create tensor [batch_size, num_patches, num_heads, dim*dim_coefficient//num_heads] + x = ops.reshape(x, (-1, num_patch, num_heads, dim * dim_coefficient // num_heads)) + x = ops.transpose(x, axes=[0, 2, 1, 3]) + # a linear layer M_k + attn = layers.Dense(dim // dim_coefficient)(x) + # normalize attention map + attn = layers.Softmax(axis=2)(attn) + # dobule-normalization + attn = layers.Lambda( + lambda attn: ops.divide( + attn, + ops.convert_to_tensor(1e-9) + ops.sum(attn, axis=-1, keepdims=True), + ) + )(attn) + attn = layers.Dropout(attention_dropout)(attn) + # a linear layer M_v + x = layers.Dense(dim * dim_coefficient // num_heads)(attn) + x = ops.transpose(x, axes=[0, 2, 1, 3]) + x = ops.reshape(x, [-1, num_patch, dim * dim_coefficient]) + 
# a linear layer to project original dim + x = layers.Dense(dim)(x) + x = layers.Dropout(projection_dropout)(x) + return x + + +""" +## Implement the MLP block +""" + + +def mlp(x, embedding_dim, mlp_dim, drop_rate=0.2): + x = layers.Dense(mlp_dim, activation=ops.gelu)(x) + x = layers.Dropout(drop_rate)(x) + x = layers.Dense(embedding_dim)(x) + x = layers.Dropout(drop_rate)(x) + return x + + +""" +## Implement the Transformer block +""" + + +def transformer_encoder( + x, + embedding_dim, + mlp_dim, + num_heads, + dim_coefficient, + attention_dropout, + projection_dropout, + attention_type="external_attention", +): + residual_1 = x + x = layers.LayerNormalization(epsilon=1e-5)(x) + if attention_type == "external_attention": + x = external_attention( + x, + embedding_dim, + num_heads, + dim_coefficient, + attention_dropout, + projection_dropout, + ) + elif attention_type == "self_attention": + x = layers.MultiHeadAttention( + num_heads=num_heads, + key_dim=embedding_dim, + dropout=attention_dropout, + )(x, x) + x = layers.add([x, residual_1]) + residual_2 = x + x = layers.LayerNormalization(epsilon=1e-5)(x) + x = mlp(x, embedding_dim, mlp_dim) + x = layers.add([x, residual_2]) + return x + + +""" +## Implement the EANet model +""" + +""" +The EANet model leverages external attention. +The computational complexity of traditional self attention is `O(d * N ** 2)`, +where `d` is the embedding size, and `N` is the number of patch. +the authors find that most pixels are closely related to just a few other +pixels, and an `N`-to-`N` attention matrix may be redundant. +So, they propose as an alternative an external +attention module where the computational complexity of external attention is `O(d * S * N)`. +As `d` and `S` are hyper-parameters, +the proposed algorithm is linear in the number of pixels. In fact, this is equivalent +to a drop patch operation, because a lot of information contained in a patch +in an image is redundant and unimportant. +""" + + +def get_model(attention_type="external_attention"): + inputs = layers.Input(shape=input_shape) + # Image augment + x = data_augmentation(inputs) + # Extract patches. + x = PatchExtract(patch_size)(x) + # Create patch embedding. + x = PatchEmbedding(num_patches, embedding_dim)(x) + # Create Transformer block. + for _ in range(num_transformer_blocks): + x = transformer_encoder( + x, + embedding_dim, + mlp_dim, + num_heads, + dim_coefficient, + attention_dropout, + projection_dropout, + attention_type, + ) + + x = layers.GlobalAveragePooling1D()(x) + outputs = layers.Dense(num_classes, activation="softmax")(x) + model = keras.Model(inputs=inputs, outputs=outputs) + return model + + +""" +## Train on CIFAR-100 + +""" + + +model = get_model(attention_type="external_attention") + +model.compile( + loss=keras.losses.CategoricalCrossentropy(label_smoothing=label_smoothing), + optimizer=keras.optimizers.AdamW( + learning_rate=learning_rate, weight_decay=weight_decay + ), + metrics=[ + keras.metrics.CategoricalAccuracy(name="accuracy"), + keras.metrics.TopKCategoricalAccuracy(5, name="top-5-accuracy"), + ], +) + +history = model.fit( + x_train, + y_train, + batch_size=batch_size, + epochs=num_epochs, + validation_split=validation_split, +) + +""" +### Let's visualize the training progress of the model. 
+
+"""
+
+plt.plot(history.history["loss"], label="train_loss")
+plt.plot(history.history["val_loss"], label="val_loss")
+plt.xlabel("Epochs")
+plt.ylabel("Loss")
+plt.title("Train and Validation Losses Over Epochs", fontsize=14)
+plt.legend()
+plt.grid()
+plt.show()
+
+"""
+### Let's display the final results of the test on CIFAR-100.
+
+"""
+
+loss, accuracy, top_5_accuracy = model.evaluate(x_test, y_test)
+print(f"Test loss: {round(loss, 2)}")
+print(f"Test accuracy: {round(accuracy * 100, 2)}%")
+print(f"Test top 5 accuracy: {round(top_5_accuracy * 100, 2)}%")
+
+"""
+EANet simply replaces self-attention in ViT with external attention.
+A traditional ViT achieves ~73% test top-5 accuracy and ~41% top-1 accuracy after
+training for 50 epochs, while using 0.6M parameters. Under the same experimental environment
+and the same hyperparameters, the EANet model we just trained has only 0.3M parameters,
+and it reaches ~73% test top-5 accuracy and ~43% top-1 accuracy. This demonstrates the
+effectiveness of external attention.
+
+We only show the training
+process of EANet here; you can train ViT under the same experimental conditions and observe
+the test results.
+"""
diff --git a/knowledge_base/vision/edsr.py b/knowledge_base/vision/edsr.py
new file mode 100644
index 0000000000000000000000000000000000000000..87cb1ccd8d1a8f76e89b72b439c240c107a3201a
--- /dev/null
+++ b/knowledge_base/vision/edsr.py
@@ -0,0 +1,357 @@
+"""
+Title: Enhanced Deep Residual Networks for single-image super-resolution
+Author: Gitesh Chawda
+Date created: 2022/04/07
+Last modified: 2024/08/27
+Description: Training an EDSR model on the DIV2K Dataset.
+Accelerator: GPU
+"""
+
+"""
+## Introduction
+
+In this example, we implement
+[Enhanced Deep Residual Networks for Single Image Super-Resolution (EDSR)](https://arxiv.org/abs/1707.02921)
+by Bee Lim, Sanghyun Son, Heewon Kim, Seungjun Nah, and Kyoung Mu Lee.
+
+The EDSR architecture is based on the SRResNet architecture and consists of multiple
+residual blocks. It uses constant scaling layers instead of batch normalization layers to
+produce consistent results (input and output have similar distributions, thus
+normalizing intermediate features may not be desirable). Instead of using an L2 loss (mean squared error),
+the authors employed an L1 loss (mean absolute error), which performs better empirically.
+
+Our implementation only includes 16 residual blocks with 64 channels.
+
+Alternatively, as shown in the Keras example
+[Image Super-Resolution using an Efficient Sub-Pixel CNN](https://keras.io/examples/vision/super_resolution_sub_pixel/#image-superresolution-using-an-efficient-subpixel-cnn),
+you can do super-resolution using an ESPCN model. According to the survey paper, EDSR is one of the top five
+best-performing super-resolution methods based on PSNR scores. However, it has more
+parameters and requires more computational power than other approaches.
+It has a PSNR value (≈34 dB) that is slightly higher than that of ESPCN (≈32 dB).
+As per the survey paper, EDSR performs better than ESPCN.
+ +Paper: +[A comprehensive review of deep learning based single image super-resolution](https://arxiv.org/abs/2102.09351) + +Comparison Graph: + +""" + +""" +## Imports +""" +import os + +os.environ["KERAS_BACKEND"] = "tensorflow" + +import numpy as np +import tensorflow as tf +import tensorflow_datasets as tfds +import matplotlib.pyplot as plt + +import keras +from keras import layers +from keras import ops + +AUTOTUNE = tf.data.AUTOTUNE + +""" +## Download the training dataset + +We use the DIV2K Dataset, a prominent single-image super-resolution dataset with 1,000 +images of scenes with various sorts of degradations, +divided into 800 images for training, 100 images for validation, and 100 +images for testing. We use 4x bicubic downsampled images as our "low quality" reference. +""" + +# Download DIV2K from TF Datasets +# Using bicubic 4x degradation type +div2k_data = tfds.image.Div2k(config="bicubic_x4") +div2k_data.download_and_prepare() + +# Taking train data from div2k_data object +train = div2k_data.as_dataset(split="train", as_supervised=True) +train_cache = train.cache() +# Validation data +val = div2k_data.as_dataset(split="validation", as_supervised=True) +val_cache = val.cache() + +""" +## Flip, crop and resize images +""" + + +def flip_left_right(lowres_img, highres_img): + """Flips Images to left and right.""" + + # Outputs random values from a uniform distribution in between 0 to 1 + rn = keras.random.uniform(shape=(), maxval=1) + # If rn is less than 0.5 it returns original lowres_img and highres_img + # If rn is greater than 0.5 it returns flipped image + return ops.cond( + rn < 0.5, + lambda: (lowres_img, highres_img), + lambda: ( + ops.flip(lowres_img), + ops.flip(highres_img), + ), + ) + + +def random_rotate(lowres_img, highres_img): + """Rotates Images by 90 degrees.""" + + # Outputs random values from uniform distribution in between 0 to 4 + rn = ops.cast( + keras.random.uniform(shape=(), maxval=4, dtype="float32"), dtype="int32" + ) + # Here rn signifies number of times the image(s) are rotated by 90 degrees + return tf.image.rot90(lowres_img, rn), tf.image.rot90(highres_img, rn) + + +def random_crop(lowres_img, highres_img, hr_crop_size=96, scale=4): + """Crop images. + + low resolution images: 24x24 + high resolution images: 96x96 + """ + lowres_crop_size = hr_crop_size // scale # 96//4=24 + lowres_img_shape = ops.shape(lowres_img)[:2] # (height,width) + + lowres_width = ops.cast( + keras.random.uniform( + shape=(), maxval=lowres_img_shape[1] - lowres_crop_size + 1, dtype="float32" + ), + dtype="int32", + ) + lowres_height = ops.cast( + keras.random.uniform( + shape=(), maxval=lowres_img_shape[0] - lowres_crop_size + 1, dtype="float32" + ), + dtype="int32", + ) + + highres_width = lowres_width * scale + highres_height = lowres_height * scale + + lowres_img_cropped = lowres_img[ + lowres_height : lowres_height + lowres_crop_size, + lowres_width : lowres_width + lowres_crop_size, + ] # 24x24 + highres_img_cropped = highres_img[ + highres_height : highres_height + hr_crop_size, + highres_width : highres_width + hr_crop_size, + ] # 96x96 + + return lowres_img_cropped, highres_img_cropped + + +""" +## Prepare a `tf.data.Dataset` object + +We augment the training data with random horizontal flips and 90 rotations. + +As low resolution images, we use 24x24 RGB input patches. 
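+
+For instance (illustrative numbers only, mirroring the `random_crop` function above), at a
+scale factor of 4 the paired crops are related as follows:
+
+```python
+scale, hr_crop_size = 4, 96
+lr_crop_size = hr_crop_size // scale  # 24: size of the low-resolution patch
+lr_x, lr_y = 10, 7  # hypothetical top-left corner sampled in the low-res image
+hr_x, hr_y = lr_x * scale, lr_y * scale  # 40, 28: matching corner in the high-res image
+```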
+""" + + +def dataset_object(dataset_cache, training=True): + ds = dataset_cache + ds = ds.map( + lambda lowres, highres: random_crop(lowres, highres, scale=4), + num_parallel_calls=AUTOTUNE, + ) + + if training: + ds = ds.map(random_rotate, num_parallel_calls=AUTOTUNE) + ds = ds.map(flip_left_right, num_parallel_calls=AUTOTUNE) + # Batching Data + ds = ds.batch(16) + + if training: + # Repeating Data, so that cardinality if dataset becomes infinte + ds = ds.repeat() + # prefetching allows later images to be prepared while the current image is being processed + ds = ds.prefetch(buffer_size=AUTOTUNE) + return ds + + +train_ds = dataset_object(train_cache, training=True) +val_ds = dataset_object(val_cache, training=False) + +""" +## Visualize the data + +Let's visualize a few sample images: +""" + +lowres, highres = next(iter(train_ds)) + +# High Resolution Images +plt.figure(figsize=(10, 10)) +for i in range(9): + ax = plt.subplot(3, 3, i + 1) + plt.imshow(highres[i].numpy().astype("uint8")) + plt.title(highres[i].shape) + plt.axis("off") + +# Low Resolution Images +plt.figure(figsize=(10, 10)) +for i in range(9): + ax = plt.subplot(3, 3, i + 1) + plt.imshow(lowres[i].numpy().astype("uint8")) + plt.title(lowres[i].shape) + plt.axis("off") + + +def PSNR(super_resolution, high_resolution): + """Compute the peak signal-to-noise ratio, measures quality of image.""" + # Max value of pixel is 255 + psnr_value = tf.image.psnr(high_resolution, super_resolution, max_val=255)[0] + return psnr_value + + +""" +## Build the model + +In the paper, the authors train three models: EDSR, MDSR, and a baseline model. In this code example, +we only train the baseline model. + +### Comparison with model with three residual blocks + +The residual block design of EDSR differs from that of ResNet. Batch normalization +layers have been removed (together with the final ReLU activation): since batch normalization +layers normalize the features, they hurt output value range flexibility. +It is thus better to remove them. Further, it also helps reduce the +amount of GPU RAM required by the model, since the batch normalization layers consume the same amount of +memory as the preceding convolutional layers. + + +""" + + +class EDSRModel(keras.Model): + def train_step(self, data): + # Unpack the data. Its structure depends on your model and + # on what you pass to `fit()`. 
+ x, y = data + + with tf.GradientTape() as tape: + y_pred = self(x, training=True) # Forward pass + # Compute the loss value + # (the loss function is configured in `compile()`) + loss = self.compiled_loss(y, y_pred, regularization_losses=self.losses) + + # Compute gradients + trainable_vars = self.trainable_variables + gradients = tape.gradient(loss, trainable_vars) + # Update weights + self.optimizer.apply_gradients(zip(gradients, trainable_vars)) + # Update metrics (includes the metric that tracks the loss) + self.compiled_metrics.update_state(y, y_pred) + # Return a dict mapping metric names to current value + return {m.name: m.result() for m in self.metrics} + + def predict_step(self, x): + # Adding dummy dimension using tf.expand_dims and converting to float32 using tf.cast + x = ops.cast(tf.expand_dims(x, axis=0), dtype="float32") + # Passing low resolution image to model + super_resolution_img = self(x, training=False) + # Clips the tensor from min(0) to max(255) + super_resolution_img = ops.clip(super_resolution_img, 0, 255) + # Rounds the values of a tensor to the nearest integer + super_resolution_img = ops.round(super_resolution_img) + # Removes dimensions of size 1 from the shape of a tensor and converting to uint8 + super_resolution_img = ops.squeeze( + ops.cast(super_resolution_img, dtype="uint8"), axis=0 + ) + return super_resolution_img + + +# Residual Block +def ResBlock(inputs): + x = layers.Conv2D(64, 3, padding="same", activation="relu")(inputs) + x = layers.Conv2D(64, 3, padding="same")(x) + x = layers.Add()([inputs, x]) + return x + + +# Upsampling Block +def Upsampling(inputs, factor=2, **kwargs): + x = layers.Conv2D(64 * (factor**2), 3, padding="same", **kwargs)(inputs) + x = layers.Lambda(lambda x: tf.nn.depth_to_space(x, block_size=factor))(x) + x = layers.Conv2D(64 * (factor**2), 3, padding="same", **kwargs)(x) + x = layers.Lambda(lambda x: tf.nn.depth_to_space(x, block_size=factor))(x) + return x + + +def make_model(num_filters, num_of_residual_blocks): + # Flexible Inputs to input_layer + input_layer = layers.Input(shape=(None, None, 3)) + # Scaling Pixel Values + x = layers.Rescaling(scale=1.0 / 255)(input_layer) + x = x_new = layers.Conv2D(num_filters, 3, padding="same")(x) + + # 16 residual blocks + for _ in range(num_of_residual_blocks): + x_new = ResBlock(x_new) + + x_new = layers.Conv2D(num_filters, 3, padding="same")(x_new) + x = layers.Add()([x, x_new]) + + x = Upsampling(x) + x = layers.Conv2D(3, 3, padding="same")(x) + + output_layer = layers.Rescaling(scale=255)(x) + return EDSRModel(input_layer, output_layer) + + +model = make_model(num_filters=64, num_of_residual_blocks=16) + +""" +## Train the model +""" + +# Using adam optimizer with initial learning rate as 1e-4, changing learning rate after 5000 steps to 5e-5 +optim_edsr = keras.optimizers.Adam( + learning_rate=keras.optimizers.schedules.PiecewiseConstantDecay( + boundaries=[5000], values=[1e-4, 5e-5] + ) +) +# Compiling model with loss as mean absolute error(L1 Loss) and metric as psnr +model.compile(optimizer=optim_edsr, loss="mae", metrics=[PSNR]) +# Training for more epochs will improve results +model.fit(train_ds, epochs=100, steps_per_epoch=200, validation_data=val_ds) + +""" +## Run inference on new images and plot the results +""" + + +def plot_results(lowres, preds): + """ + Displays low resolution image and super resolution image + """ + plt.figure(figsize=(24, 14)) + plt.subplot(132), plt.imshow(lowres), plt.title("Low resolution") + plt.subplot(133), plt.imshow(preds), 
plt.title("Prediction") + plt.show() + + +for lowres, highres in val.take(10): + lowres = tf.image.random_crop(lowres, (150, 150, 3)) + preds = model.predict_step(lowres) + plot_results(lowres, preds) + +""" +## Final remarks + +In this example, we implemented the EDSR model (Enhanced Deep Residual Networks for Single Image +Super-Resolution). You could improve the model accuracy by training the model for more epochs, as well as +training the model with a wider variety of inputs with mixed downgrading factors, so as to +be able to handle a greater range of real-world images. + +You could also improve on the given baseline EDSR model by implementing EDSR+, +or MDSR( Multi-Scale super-resolution) and MDSR+, +which were proposed in the same paper. +""" diff --git a/knowledge_base/vision/fixres.py b/knowledge_base/vision/fixres.py new file mode 100644 index 0000000000000000000000000000000000000000..dafb2f76e1dc46b897d25211ff273b623d69c864 --- /dev/null +++ b/knowledge_base/vision/fixres.py @@ -0,0 +1,338 @@ +""" +Title: FixRes: Fixing train-test resolution discrepancy +Author: [Sayak Paul](https://twitter.com/RisingSayak) +Date created: 2021/10/08 +Last modified: 2021/10/10 +Description: Mitigating resolution discrepancy between training and test sets. +Accelerator: GPU +""" + +""" +## Introduction + +It is a common practice to use the same input image resolution while training and testing +vision models. However, as investigated in +[Fixing the train-test resolution discrepancy](https://arxiv.org/abs/1906.06423) +(Touvron et al.), this practice leads to suboptimal performance. Data augmentation +is an indispensable part of the training process of deep neural networks. For vision models, we +typically use random resized crops during training and center crops during inference. +This introduces a discrepancy in the object sizes seen during training and inference. +As shown by Touvron et al., if we can fix this discrepancy, we can significantly +boost model performance. + +In this example, we implement the **FixRes** techniques introduced by Touvron et al. +to fix this discrepancy. +""" + +""" +## Imports +""" + +import keras +from keras import layers +import tensorflow as tf # just for image processing and pipeline + +import tensorflow_datasets as tfds + +tfds.disable_progress_bar() + +import matplotlib.pyplot as plt + +""" +## Load the `tf_flowers` dataset +""" + +train_dataset, val_dataset = tfds.load( + "tf_flowers", split=["train[:90%]", "train[90%:]"], as_supervised=True +) + +num_train = train_dataset.cardinality() +num_val = val_dataset.cardinality() +print(f"Number of training examples: {num_train}") +print(f"Number of validation examples: {num_val}") + +""" +## Data preprocessing utilities +""" + +""" +We create three datasets: + +1. A dataset with a smaller resolution - 128x128. +2. Two datasets with a larger resolution - 224x224. + +We will apply different augmentation transforms to the larger-resolution datasets. + +The idea of FixRes is to first train a model on a smaller resolution dataset and then fine-tune +it on a larger resolution dataset. This simple yet effective recipe leads to non-trivial performance +improvements. Please refer to the [original paper](https://arxiv.org/abs/1906.06423) for +results. +""" + +# Reference: https://github.com/facebookresearch/FixRes/blob/main/transforms_v2.py. 
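+# The fine-tuning pipeline below resizes images to int((224 / 128) * 224) = 392x392 and then
+# center-crops them to 224x224, the FixRes recipe for keeping apparent object sizes roughly
+# consistent with the 128x128 random-crop training stage.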
+ +batch_size = 32 +auto = tf.data.AUTOTUNE +smaller_size = 128 +bigger_size = 224 + +size_for_resizing = int((bigger_size / smaller_size) * bigger_size) +central_crop_layer = layers.CenterCrop(bigger_size, bigger_size) + + +def preprocess_initial(train, image_size): + """Initial preprocessing function for training on smaller resolution. + + For training, do random_horizontal_flip -> random_crop. + For validation, just resize. + No color-jittering has been used. + """ + + def _pp(image, label, train): + if train: + channels = image.shape[-1] + begin, size, _ = tf.image.sample_distorted_bounding_box( + tf.shape(image), + tf.zeros([0, 0, 4], tf.float32), + area_range=(0.05, 1.0), + min_object_covered=0, + use_image_if_no_bounding_boxes=True, + ) + image = tf.slice(image, begin, size) + + image.set_shape([None, None, channels]) + image = tf.image.resize(image, [image_size, image_size]) + image = tf.image.random_flip_left_right(image) + else: + image = tf.image.resize(image, [image_size, image_size]) + + return image, label + + return _pp + + +def preprocess_finetune(image, label, train): + """Preprocessing function for fine-tuning on a higher resolution. + + For training, resize to a bigger resolution to maintain the ratio -> + random_horizontal_flip -> center_crop. + For validation, do the same without any horizontal flipping. + No color-jittering has been used. + """ + image = tf.image.resize(image, [size_for_resizing, size_for_resizing]) + if train: + image = tf.image.random_flip_left_right(image) + image = central_crop_layer(image[None, ...])[0] + + return image, label + + +def make_dataset( + dataset: tf.data.Dataset, + train: bool, + image_size: int = smaller_size, + fixres: bool = True, + num_parallel_calls=auto, +): + if image_size not in [smaller_size, bigger_size]: + raise ValueError(f"{image_size} resolution is not supported.") + + # Determine which preprocessing function we are using. + if image_size == smaller_size: + preprocess_func = preprocess_initial(train, image_size) + elif not fixres and image_size == bigger_size: + preprocess_func = preprocess_initial(train, image_size) + else: + preprocess_func = preprocess_finetune + + dataset = dataset.map( + lambda x, y: preprocess_func(x, y, train), + num_parallel_calls=num_parallel_calls, + ) + dataset = dataset.batch(batch_size) + + if train: + dataset = dataset.shuffle(batch_size * 10) + + return dataset.prefetch(num_parallel_calls) + + +""" +Notice how the augmentation transforms vary for the kind of dataset we are preparing. +""" + +""" +## Prepare datasets +""" + +initial_train_dataset = make_dataset(train_dataset, train=True, image_size=smaller_size) +initial_val_dataset = make_dataset(val_dataset, train=False, image_size=smaller_size) + +finetune_train_dataset = make_dataset(train_dataset, train=True, image_size=bigger_size) +finetune_val_dataset = make_dataset(val_dataset, train=False, image_size=bigger_size) + +vanilla_train_dataset = make_dataset( + train_dataset, train=True, image_size=bigger_size, fixres=False +) +vanilla_val_dataset = make_dataset( + val_dataset, train=False, image_size=bigger_size, fixres=False +) + +""" +## Visualize the datasets +""" + + +def visualize_dataset(batch_images): + plt.figure(figsize=(10, 10)) + for n in range(25): + ax = plt.subplot(5, 5, n + 1) + plt.imshow(batch_images[n].numpy().astype("int")) + plt.axis("off") + plt.show() + + print(f"Batch shape: {batch_images.shape}.") + + +# Smaller resolution. 
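+# These are the 128x128 random-resized crops used for the initial training run.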
+initial_sample_images, _ = next(iter(initial_train_dataset)) +visualize_dataset(initial_sample_images) + +# Bigger resolution, only for fine-tuning. +finetune_sample_images, _ = next(iter(finetune_train_dataset)) +visualize_dataset(finetune_sample_images) + +# Bigger resolution, with the same augmentation transforms as +# the smaller resolution dataset. +vanilla_sample_images, _ = next(iter(vanilla_train_dataset)) +visualize_dataset(vanilla_sample_images) + +""" +## Model training utilities + +We train multiple variants of ResNet50V2 +([He et al.](https://arxiv.org/abs/1603.05027)): + +1. On the smaller resolution dataset (128x128). It will be trained from scratch. +2. Then fine-tune the model from 1 on the larger resolution (224x224) dataset. +3. Train another ResNet50V2 from scratch on the larger resolution dataset. + +As a reminder, the larger resolution datasets differ in terms of their augmentation +transforms. +""" + + +def get_training_model(num_classes=5): + inputs = layers.Input((None, None, 3)) + resnet_base = keras.applications.ResNet50V2( + include_top=False, weights=None, pooling="avg" + ) + resnet_base.trainable = True + + x = layers.Rescaling(scale=1.0 / 127.5, offset=-1)(inputs) + x = resnet_base(x) + outputs = layers.Dense(num_classes, activation="softmax")(x) + return keras.Model(inputs, outputs) + + +def train_and_evaluate( + model, + train_ds, + val_ds, + epochs, + learning_rate=1e-3, + use_early_stopping=False, +): + optimizer = keras.optimizers.Adam(learning_rate=learning_rate) + model.compile( + optimizer=optimizer, + loss="sparse_categorical_crossentropy", + metrics=["accuracy"], + ) + + if use_early_stopping: + es_callback = keras.callbacks.EarlyStopping(patience=5) + callbacks = [es_callback] + else: + callbacks = None + + model.fit( + train_ds, + validation_data=val_ds, + epochs=epochs, + callbacks=callbacks, + ) + + _, accuracy = model.evaluate(val_ds) + print(f"Top-1 accuracy on the validation set: {accuracy*100:.2f}%.") + return model + + +""" +## Experiment 1: Train on 128x128 and then fine-tune on 224x224 +""" + +epochs = 30 + +smaller_res_model = get_training_model() +smaller_res_model = train_and_evaluate( + smaller_res_model, initial_train_dataset, initial_val_dataset, epochs +) + +""" +### Freeze all the layers except for the final Batch Normalization layer + +For fine-tuning, we train only two layers: + +* The final Batch Normalization ([Ioffe et al.](https://arxiv.org/abs/1502.03167)) layer. +* The classification layer. + +We are unfreezing the final Batch Normalization layer to compensate for the change in +activation statistics before the global average pooling layer. As shown in +[the paper](https://arxiv.org/abs/1906.06423), unfreezing the final Batch +Normalization layer is enough. + +For a comprehensive guide on fine-tuning models in Keras, refer to +[this tutorial](https://keras.io/guides/transfer_learning/). +""" + +for layer in smaller_res_model.layers[2].layers: + layer.trainable = False + +smaller_res_model.layers[2].get_layer("post_bn").trainable = True + +epochs = 10 + +# Use a lower learning rate during fine-tuning. +bigger_res_model = train_and_evaluate( + smaller_res_model, + finetune_train_dataset, + finetune_val_dataset, + epochs, + learning_rate=1e-4, +) + +""" +## Experiment 2: Train a model on 224x224 resolution from scratch + +Now, we train another model from scratch on the larger resolution dataset. Recall that +the augmentation transforms used in this dataset are different from before. 
+""" + +epochs = 30 + +vanilla_bigger_res_model = get_training_model() +vanilla_bigger_res_model = train_and_evaluate( + vanilla_bigger_res_model, vanilla_train_dataset, vanilla_val_dataset, epochs +) + +""" +As we can notice from the above cells, FixRes leads to a better performance. Another +advantage of FixRes is the improved total training time and reduction in GPU memory usage. +FixRes is model-agnostic, you can use it on any image classification model +to potentially boost performance. + +You can find more results +[here](https://tensorboard.dev/experiment/BQOg28w0TlmvuJYeqsVntw) +that were gathered by running the same code with different random seeds. +""" diff --git a/knowledge_base/vision/focal_modulation_network.py b/knowledge_base/vision/focal_modulation_network.py new file mode 100644 index 0000000000000000000000000000000000000000..d50fa5fd36cf5b0a369f743d86b02c004176f3d5 --- /dev/null +++ b/knowledge_base/vision/focal_modulation_network.py @@ -0,0 +1,1085 @@ +""" +Title: Focal Modulation: A replacement for Self-Attention +Author: [Aritra Roy Gosthipaty](https://twitter.com/ariG23498), [Ritwik Raha](https://twitter.com/ritwik_raha) +Date created: 2023/01/25 +Last modified: 2023/02/15 +Description: Image classification with Focal Modulation Networks. +Accelerator: GPU +""" + +""" +## Introduction + +This tutorial aims to provide a comprehensive guide to the implementation of +Focal Modulation Networks, as presented in +[Yang et al.](https://arxiv.org/abs/2203.11926). + +This tutorial will provide a formal, minimalistic approach to implementing Focal +Modulation Networks and explore its potential applications in the field of Deep Learning. + +**Problem statement** + +The Transformer architecture ([Vaswani et al.](https://arxiv.org/abs/1706.03762)), +which has become the de facto standard in most Natural Language Processing tasks, has +also been applied to the field of computer vision, e.g. Vision +Transformers ([Dosovitskiy et al.](https://arxiv.org/abs/2010.11929v2)). + +> In Transformers, the self-attention (SA) is arguably the key to its success which +enables input-dependent global interactions, in contrast to convolution operation which +constraints interactions in a local region with a shared kernel. + +The **Attention** module is mathematically written as shown in **Equation 1**. + +| ![Attention Equation](https://i.imgur.com/thdHvQx.png) | +| :--: | +| Equation 1: The mathematical equation of attention (Source: Aritra and Ritwik) | + +Where: + +- `Q` is the query +- `K` is the key +- `V` is the value +- `d_k` is the dimension of the key + +With **self-attention**, the query, key, and value are all sourced from the input +sequence. Let us rewrite the attention equation for self-attention as shown in **Equation +2**. + +| ![Self-Attention Equation](https://i.imgur.com/OFsmVdP.png) | +| :--: | +| Equation 2: The mathematical equation of self-attention (Source: Aritra and Ritwik) | + +Upon looking at the equation of self-attention, we see that it is a quadratic equation. +Therefore, as the number of tokens increase, so does the computation time (cost too). To +mitigate this problem and make Transformers more interpretable, Yang et al. +have tried to replace the Self-Attention module with better components. + +**The Solution** + +Yang et al. introduce the Focal Modulation layer to serve as a +seamless replacement for the Self-Attention Layer. The layer boasts high +interpretability, making it a valuable tool for Deep Learning practitioners. 
+ +In this tutorial, we will delve into the practical application of this layer by training +the entire model on the CIFAR-10 dataset and visually interpreting the layer's +performance. + +Note: We try to align our implementation with the +[official implementation](https://github.com/microsoft/FocalNet). +""" + +""" +## Setup and Imports + +We use tensorflow version `2.11.0` for this tutorial. +""" + +import numpy as np +import tensorflow as tf +from tensorflow import keras +from tensorflow.keras import layers +from tensorflow.keras.optimizers.experimental import AdamW +from typing import Optional, Tuple, List +from matplotlib import pyplot as plt +from random import randint + +# Set seed for reproducibility. +tf.keras.utils.set_random_seed(42) + +""" +## Global Configuration + +We do not have any strong rationale behind choosing these hyperparameters. Please feel +free to change the configuration and train the model. +""" + +# DATA +TRAIN_SLICE = 40000 +BUFFER_SIZE = 2048 +BATCH_SIZE = 1024 +AUTO = tf.data.AUTOTUNE +INPUT_SHAPE = (32, 32, 3) +IMAGE_SIZE = 48 +NUM_CLASSES = 10 + +# OPTIMIZER +LEARNING_RATE = 1e-4 +WEIGHT_DECAY = 1e-4 + +# TRAINING +EPOCHS = 25 + +""" +## Load and process the CIFAR-10 dataset +""" + +(x_train, y_train), (x_test, y_test) = keras.datasets.cifar10.load_data() +(x_train, y_train), (x_val, y_val) = ( + (x_train[:TRAIN_SLICE], y_train[:TRAIN_SLICE]), + (x_train[TRAIN_SLICE:], y_train[TRAIN_SLICE:]), +) + +""" +### Build the augmentations + +We use the `keras.Sequential` API to compose all the individual augmentation steps +into one API. +""" + +# Build the `train` augmentation pipeline. +train_aug = keras.Sequential( + [ + layers.Rescaling(1 / 255.0), + layers.Resizing(INPUT_SHAPE[0] + 20, INPUT_SHAPE[0] + 20), + layers.RandomCrop(IMAGE_SIZE, IMAGE_SIZE), + layers.RandomFlip("horizontal"), + ], + name="train_data_augmentation", +) + +# Build the `val` and `test` data pipeline. +test_aug = keras.Sequential( + [ + layers.Rescaling(1 / 255.0), + layers.Resizing(IMAGE_SIZE, IMAGE_SIZE), + ], + name="test_data_augmentation", +) + +""" +### Build `tf.data` pipeline +""" + +train_ds = tf.data.Dataset.from_tensor_slices((x_train, y_train)) +train_ds = ( + train_ds.map( + lambda image, label: (train_aug(image), label), num_parallel_calls=AUTO + ) + .shuffle(BUFFER_SIZE) + .batch(BATCH_SIZE) + .prefetch(AUTO) +) + +val_ds = tf.data.Dataset.from_tensor_slices((x_val, y_val)) +val_ds = ( + val_ds.map(lambda image, label: (test_aug(image), label), num_parallel_calls=AUTO) + .batch(BATCH_SIZE) + .prefetch(AUTO) +) + +test_ds = tf.data.Dataset.from_tensor_slices((x_test, y_test)) +test_ds = ( + test_ds.map(lambda image, label: (test_aug(image), label), num_parallel_calls=AUTO) + .batch(BATCH_SIZE) + .prefetch(AUTO) +) + +""" +## Architecture + +We pause here to take a quick look at the Architecture of the Focal Modulation Network. +**Figure 1** shows how every individual layer is compiled into a single model. This gives +us a bird's eye view of the entire architecture. + +| ![Diagram of the model](https://i.imgur.com/v5HYV5R.png) | +| :--: | +| Figure 1: A diagram of the Focal Modulation model (Source: Aritra and Ritwik) | + +We dive deep into each of these layers in the following sections. 
This is the order we +will follow: + + +- Patch Embedding Layer +- Focal Modulation Block + - Multi-Layer Perceptron + - Focal Modulation Layer + - Hierarchical Contextualization + - Gated Aggregation + - Building Focal Modulation Block +- Building the Basic Layer + +To better understand the architecture in a format we are well versed in, let us see how +the Focal Modulation Network would look when drawn like a Transformer architecture. + +**Figure 2** shows the encoder layer of a traditional Transformer architecture where Self +Attention is replaced with the Focal Modulation layer. + +The blue blocks represent the Focal Modulation block. A stack +of these blocks builds a single Basic Layer. The green blocks +represent the Focal Modulation layer. + +| ![The Entire Architecture](https://i.imgur.com/PduYD6m.png) | +| :--: | +| Figure 2: The Entire Architecture (Source: Aritra and Ritwik) | +""" + +""" +## Patch Embedding Layer + +The patch embedding layer is used to patchify the input images and project them into a +latent space. This layer is also used as the down-sampling layer in the architecture. +""" + + +class PatchEmbed(layers.Layer): + """Image patch embedding layer, also acts as the down-sampling layer. + + Args: + image_size (Tuple[int]): Input image resolution. + patch_size (Tuple[int]): Patch spatial resolution. + embed_dim (int): Embedding dimension. + """ + + def __init__( + self, + image_size: Tuple[int] = (224, 224), + patch_size: Tuple[int] = (4, 4), + embed_dim: int = 96, + **kwargs, + ): + super().__init__(**kwargs) + patch_resolution = [ + image_size[0] // patch_size[0], + image_size[1] // patch_size[1], + ] + self.image_size = image_size + self.patch_size = patch_size + self.embed_dim = embed_dim + self.patch_resolution = patch_resolution + self.num_patches = patch_resolution[0] * patch_resolution[1] + self.proj = layers.Conv2D( + filters=embed_dim, kernel_size=patch_size, strides=patch_size + ) + self.flatten = layers.Reshape(target_shape=(-1, embed_dim)) + self.norm = keras.layers.LayerNormalization(epsilon=1e-7) + + def call(self, x: tf.Tensor) -> Tuple[tf.Tensor, int, int, int]: + """Patchifies the image and converts into tokens. + + Args: + x: Tensor of shape (B, H, W, C) + + Returns: + A tuple of the processed tensor, height of the projected + feature map, width of the projected feature map, number + of channels of the projected feature map. + """ + # Project the inputs. + x = self.proj(x) + + # Obtain the shape from the projected tensor. + height = tf.shape(x)[1] + width = tf.shape(x)[2] + channels = tf.shape(x)[3] + + # B, H, W, C -> B, H*W, C + x = self.norm(self.flatten(x)) + + return x, height, width, channels + + +""" +## Focal Modulation block + +A Focal Modulation block can be considered as a single Transformer Block with the Self +Attention (SA) module being replaced with Focal Modulation module, as we saw in **Figure +2**. + +Let us recall how a focal modulation block is supposed to look like with the aid of the +**Figure 3**. 
+ + +| ![Focal Modulation Block](https://i.imgur.com/bPYTSiB.png) | +| :--: | +| Figure 3: The isolated view of the Focal Modulation Block (Source: Aritra and Ritwik) | + +The Focal Modulation Block consists of: +- Multilayer Perceptron +- Focal Modulation layer +""" + +""" +### Multilayer Perceptron +""" + + +def MLP( + in_features: int, + hidden_features: Optional[int] = None, + out_features: Optional[int] = None, + mlp_drop_rate: float = 0.0, +): + hidden_features = hidden_features or in_features + out_features = out_features or in_features + + return keras.Sequential( + [ + layers.Dense(units=hidden_features, activation=keras.activations.gelu), + layers.Dense(units=out_features), + layers.Dropout(rate=mlp_drop_rate), + ] + ) + + +""" +### Focal Modulation layer + +In a typical Transformer architecture, for each visual token (**query**) `x_i in R^C` in +an input feature map `X in R^{HxWxC}` a **generic encoding process** produces a feature +representation `y_i in R^C`. + +The encoding process consists of **interaction** (with its surroundings for e.g. a dot +product), and **aggregation** (over the contexts for e.g weighted mean). + +We will talk about two types of encoding here: +- Interaction and then Aggregation in **Self-Attention** +- Aggregation and then Interaction in **Focal Modulation** + +**Self-Attention** + +| ![Self-Attention Expression](https://i.imgur.com/heBYp0F.png) | +| :--: | +| **Figure 4**: Self-Attention module. (Source: Aritra and Ritwik) | + +| ![Aggregation and Interaction for Self-Attention](https://i.imgur.com/j1k8Xmy.png) | +| :--: | +| **Equation 3:** Aggregation and Interaction in Self-Attention(Surce: Aritra and Ritwik)| + +As shown in **Figure 4** the query and the key interact (in the interaction step) with +each other to output the attention scores. The weighted aggregation of the value comes +next, known as the aggregation step. + +**Focal Modulation** + +| ![Focal Modulation module](https://i.imgur.com/tmbLgQl.png) | +| :--: | +| **Figure 5**: Focal Modulation module. (Source: Aritra and Ritwik) | + +| ![Aggregation and Interaction in Focal Modulation](https://i.imgur.com/gsvJfWp.png) | +| :--: | +| **Equation 4:** Aggregation and Interaction in Focal Modulation (Source: Aritra and Ritwik) | + +**Figure 5** depicts the Focal Modulation layer. `q()` is the query projection +function. It is a **linear layer** that projects the query into a latent space. `m ()` is +the context aggregation function. Unlike self-attention, the +aggregation step takes place in focal modulation before the interaction step. +""" + +""" +While `q()` is pretty straightforward to understand, the context aggregation function +`m()` is more complex. Therefore, this section will focus on `m()`. + +| ![Context Aggregation](https://i.imgur.com/uqIRXI7.png)| +| :--: | +| **Figure 6**: Context Aggregation function `m()`. (Source: Aritra and Ritwik) | + +The context aggregation function `m()` consists of two parts as shown in **Figure 6**: +- Hierarchical Contextualization +- Gated Aggregation +""" + +""" +#### Hierarchical Contextualization + +| ![Hierarchical Contextualization](https://i.imgur.com/q875c83.png)| +| :--: | +| **Figure 7**: Hierarchical Contextualization (Source: Aritra and Ritwik) | + +In **Figure 7**, we see that the input is first projected linearly. This linear projection +produces `Z^0`. 
Where `Z^0` can be expressed as follows: + +| ![Linear projection of z_not](https://i.imgur.com/pd0Z2Of.png) | +| :--: | +| Equation 5: Linear projection of `Z^0` (Source: Aritra and Ritwik) | + +`Z^0` is then passed on to a series of Depth-Wise (DWConv) Conv and +[GeLU](https://www.tensorflow.org/api_docs/python/tf/keras/activations/gelu) layers. The +authors term each block of DWConv and GeLU as levels denoted by `l`. In **Figure 6** we +have two levels. Mathematically this is represented as: + +| ![Levels of modulation](https://i.imgur.com/ijGD1Df.png) | +| :--: | +| Equation 6: Levels of the modulation layer (Source: Aritra and Ritwik) | + +where `l in {1, ... , L}` + +The final feature map goes through a Global Average Pooling Layer. This can be expressed +as follows: + +| ![Avg Pool](https://i.imgur.com/MQzQhbo.png) | +| :--: | +| Equation 7: Average Pooling of the final feature (Source: Aritra and Ritwik)| +""" + +""" +#### Gated Aggregation + +| ![Gated Aggregation](https://i.imgur.com/LwrdDKo.png[/img)| +| :--: | +| **Figure 8**: Gated Aggregation (Source: Aritra and Ritwik) | + +Now that we have `L+1` intermediate feature maps by virtue of the Hierarchical +Contextualization step, we need a gating mechanism that lets some features pass and +prohibits others. This can be implemented with the attention module. +Later in the tutorial, we will visualize these gates to better understand their +usefulness. + +First, we build the weights for aggregation. Here we apply a **linear layer** on the input +feature map that projects it into `L+1` dimensions. + +| ![Gates](https://i.imgur.com/1CgEo1G.png) | +| :--: | +| Eqation 8: Gates (Source: Aritra and Ritwik) | + +Next we perform the weighted aggregation over the contexts. + +| ![z out](https://i.imgur.com/mpJ712R.png) | +| :--: | +| Eqation 9: Final feature map (Source: Aritra and Ritwik) | + +To enable communication across different channels, we use another linear layer `h()` +to obtain the modulator + +| ![Modulator](https://i.imgur.com/0EpT3Ti.png) | +| :--: | +| Eqation 10: Modulator (Source: Aritra and Ritwik) | + +To sum up the Focal Modulation layer we have: + +| ![Focal Modulation Layer](https://i.imgur.com/1QIhvYA.png) | +| :--: | +| Eqation 11: Focal Modulation Layer (Source: Aritra and Ritwik) | +""" + + +class FocalModulationLayer(layers.Layer): + """The Focal Modulation layer includes query projection & context aggregation. + + Args: + dim (int): Projection dimension. + focal_window (int): Window size for focal modulation. + focal_level (int): The current focal level. + focal_factor (int): Factor of focal modulation. + proj_drop_rate (float): Rate of dropout. + """ + + def __init__( + self, + dim: int, + focal_window: int, + focal_level: int, + focal_factor: int = 2, + proj_drop_rate: float = 0.0, + **kwargs, + ): + super().__init__(**kwargs) + self.dim = dim + self.focal_window = focal_window + self.focal_level = focal_level + self.focal_factor = focal_factor + self.proj_drop_rate = proj_drop_rate + + # Project the input feature into a new feature space using a + # linear layer. Note the `units` used. We will be projecting the input + # feature all at once and split the projection into query, context, + # and gates. 
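+        # i.e. `dim` channels for the query, `dim` for the context, and `focal_level + 1`
+        # channels that will act as the gates.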
+ self.initial_proj = layers.Dense( + units=(2 * self.dim) + (self.focal_level + 1), + use_bias=True, + ) + self.focal_layers = list() + self.kernel_sizes = list() + for idx in range(self.focal_level): + kernel_size = (self.focal_factor * idx) + self.focal_window + depth_gelu_block = keras.Sequential( + [ + layers.ZeroPadding2D(padding=(kernel_size // 2, kernel_size // 2)), + layers.Conv2D( + filters=self.dim, + kernel_size=kernel_size, + activation=keras.activations.gelu, + groups=self.dim, + use_bias=False, + ), + ] + ) + self.focal_layers.append(depth_gelu_block) + self.kernel_sizes.append(kernel_size) + self.activation = keras.activations.gelu + self.gap = layers.GlobalAveragePooling2D(keepdims=True) + self.modulator_proj = layers.Conv2D( + filters=self.dim, + kernel_size=(1, 1), + use_bias=True, + ) + self.proj = layers.Dense(units=self.dim) + self.proj_drop = layers.Dropout(self.proj_drop_rate) + + def call(self, x: tf.Tensor, training: Optional[bool] = None) -> tf.Tensor: + """Forward pass of the layer. + + Args: + x: Tensor of shape (B, H, W, C) + """ + # Apply the linear projecion to the input feature map + x_proj = self.initial_proj(x) + + # Split the projected x into query, context and gates + query, context, self.gates = tf.split( + value=x_proj, + num_or_size_splits=[self.dim, self.dim, self.focal_level + 1], + axis=-1, + ) + + # Context aggregation + context = self.focal_layers[0](context) + context_all = context * self.gates[..., 0:1] + for idx in range(1, self.focal_level): + context = self.focal_layers[idx](context) + context_all += context * self.gates[..., idx : idx + 1] + + # Build the global context + context_global = self.activation(self.gap(context)) + context_all += context_global * self.gates[..., self.focal_level :] + + # Focal Modulation + self.modulator = self.modulator_proj(context_all) + x_output = query * self.modulator + + # Project the output and apply dropout + x_output = self.proj(x_output) + x_output = self.proj_drop(x_output) + + return x_output + + +""" +### The Focal Modulation block + +Finally, we have all the components we need to build the Focal Modulation block. Here we +take the MLP and Focal Modulation layer together and build the Focal Modulation block. +""" + + +class FocalModulationBlock(layers.Layer): + """Combine FFN and Focal Modulation Layer. + + Args: + dim (int): Number of input channels. + input_resolution (Tuple[int]): Input resulotion. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + drop (float): Dropout rate. + drop_path (float): Stochastic depth rate. + focal_level (int): Number of focal levels. 
+ focal_window (int): Focal window size at first focal level + """ + + def __init__( + self, + dim: int, + input_resolution: Tuple[int], + mlp_ratio: float = 4.0, + drop: float = 0.0, + drop_path: float = 0.0, + focal_level: int = 1, + focal_window: int = 3, + **kwargs, + ): + super().__init__(**kwargs) + self.dim = dim + self.input_resolution = input_resolution + self.mlp_ratio = mlp_ratio + self.focal_level = focal_level + self.focal_window = focal_window + self.norm = layers.LayerNormalization(epsilon=1e-5) + self.modulation = FocalModulationLayer( + dim=self.dim, + focal_window=self.focal_window, + focal_level=self.focal_level, + proj_drop_rate=drop, + ) + mlp_hidden_dim = int(self.dim * self.mlp_ratio) + self.mlp = MLP( + in_features=self.dim, + hidden_features=mlp_hidden_dim, + mlp_drop_rate=drop, + ) + + def call(self, x: tf.Tensor, height: int, width: int, channels: int) -> tf.Tensor: + """Processes the input tensor through the focal modulation block. + + Args: + x (tf.Tensor): Inputs of the shape (B, L, C) + height (int): The height of the feature map + width (int): The width of the feature map + channels (int): The number of channels of the feature map + + Returns: + The processed tensor. + """ + shortcut = x + + # Focal Modulation + x = tf.reshape(x, shape=(-1, height, width, channels)) + x = self.modulation(x) + x = tf.reshape(x, shape=(-1, height * width, channels)) + + # FFN + x = shortcut + x + x = x + self.mlp(self.norm(x)) + return x + + +""" +## The Basic Layer + +The basic layer consists of a collection of Focal Modulation blocks. This is +illustrated in **Figure 9**. + +| ![Basic Layer](https://i.imgur.com/UcZV0K6.png) | +| :--: | +| **Figure 9**: Basic Layer, a collection of focal modulation blocks. (Source: Aritra and Ritwik) | + +Notice how in **Fig. 9** there are more than one focal modulation blocks denoted by `Nx`. +This shows how the Basic Layer is a collection of Focal Modulation blocks. +""" + + +class BasicLayer(layers.Layer): + """Collection of Focal Modulation Blocks. + + Args: + dim (int): Dimensions of the model. + out_dim (int): Dimension used by the Patch Embedding Layer. + input_resolution (Tuple[int]): Input image resolution. + depth (int): The number of Focal Modulation Blocks. + mlp_ratio (float): Ratio of mlp hidden dim to embedding dim. + drop (float): Dropout rate. + downsample (tf.keras.layers.Layer): Downsampling layer at the end of the layer. + focal_level (int): The current focal level. + focal_window (int): Focal window used. + """ + + def __init__( + self, + dim: int, + out_dim: int, + input_resolution: Tuple[int], + depth: int, + mlp_ratio: float = 4.0, + drop: float = 0.0, + downsample=None, + focal_level: int = 1, + focal_window: int = 1, + **kwargs, + ): + super().__init__(**kwargs) + self.dim = dim + self.input_resolution = input_resolution + self.depth = depth + self.blocks = [ + FocalModulationBlock( + dim=dim, + input_resolution=input_resolution, + mlp_ratio=mlp_ratio, + drop=drop, + focal_level=focal_level, + focal_window=focal_window, + ) + for i in range(self.depth) + ] + + # Downsample layer at the end of the layer + if downsample is not None: + self.downsample = downsample( + image_size=input_resolution, + patch_size=(2, 2), + embed_dim=out_dim, + ) + else: + self.downsample = None + + def call( + self, x: tf.Tensor, height: int, width: int, channels: int + ) -> Tuple[tf.Tensor, int, int, int]: + """Forward pass of the layer. 
+ + Args: + x (tf.Tensor): Tensor of shape (B, L, C) + height (int): Height of feature map + width (int): Width of feature map + channels (int): Embed Dim of feature map + + Returns: + A tuple of the processed tensor, changed height, width, and + dim of the tensor. + """ + # Apply Focal Modulation Blocks + for block in self.blocks: + x = block(x, height, width, channels) + + # Except the last Basic Layer, all the layers have + # downsample at the end of it. + if self.downsample is not None: + x = tf.reshape(x, shape=(-1, height, width, channels)) + x, height_o, width_o, channels_o = self.downsample(x) + else: + height_o, width_o, channels_o = height, width, channels + + return x, height_o, width_o, channels_o + + +""" +## The Focal Modulation Network model + +This is the model that ties everything together. +It consists of a collection of Basic Layers with a classification head. +For a recap of how this is structured refer to **Figure 1**. +""" + + +class FocalModulationNetwork(keras.Model): + """The Focal Modulation Network. + + Parameters: + image_size (Tuple[int]): Spatial size of images used. + patch_size (Tuple[int]): Patch size of each patch. + num_classes (int): Number of classes used for classification. + embed_dim (int): Patch embedding dimension. + depths (List[int]): Depth of each Focal Transformer block. + mlp_ratio (float): Ratio of expansion for the intermediate layer of MLP. + drop_rate (float): The dropout rate for FM and MLP layers. + focal_levels (list): How many focal levels at all stages. + Note that this excludes the finest-grain level. + focal_windows (list): The focal window size at all stages. + """ + + def __init__( + self, + image_size: Tuple[int] = (48, 48), + patch_size: Tuple[int] = (4, 4), + num_classes: int = 10, + embed_dim: int = 256, + depths: List[int] = [2, 3, 2], + mlp_ratio: float = 4.0, + drop_rate: float = 0.1, + focal_levels=[2, 2, 2], + focal_windows=[3, 3, 3], + **kwargs, + ): + super().__init__(**kwargs) + self.num_layers = len(depths) + embed_dim = [embed_dim * (2**i) for i in range(self.num_layers)] + self.num_classes = num_classes + self.embed_dim = embed_dim + self.num_features = embed_dim[-1] + self.mlp_ratio = mlp_ratio + self.patch_embed = PatchEmbed( + image_size=image_size, + patch_size=patch_size, + embed_dim=embed_dim[0], + ) + num_patches = self.patch_embed.num_patches + patches_resolution = self.patch_embed.patch_resolution + self.patches_resolution = patches_resolution + self.pos_drop = layers.Dropout(drop_rate) + self.basic_layers = list() + for i_layer in range(self.num_layers): + layer = BasicLayer( + dim=embed_dim[i_layer], + out_dim=( + embed_dim[i_layer + 1] if (i_layer < self.num_layers - 1) else None + ), + input_resolution=( + patches_resolution[0] // (2**i_layer), + patches_resolution[1] // (2**i_layer), + ), + depth=depths[i_layer], + mlp_ratio=self.mlp_ratio, + drop=drop_rate, + downsample=PatchEmbed if (i_layer < self.num_layers - 1) else None, + focal_level=focal_levels[i_layer], + focal_window=focal_windows[i_layer], + ) + self.basic_layers.append(layer) + self.norm = keras.layers.LayerNormalization(epsilon=1e-7) + self.avgpool = layers.GlobalAveragePooling1D() + self.flatten = layers.Flatten() + self.head = layers.Dense(self.num_classes, activation="softmax") + + def call(self, x: tf.Tensor) -> tf.Tensor: + """Forward pass of the layer. + + Args: + x: Tensor of shape (B, H, W, C) + + Returns: + The logits. + """ + # Patch Embed the input images. 
+ x, height, width, channels = self.patch_embed(x) + x = self.pos_drop(x) + + for idx, layer in enumerate(self.basic_layers): + x, height, width, channels = layer(x, height, width, channels) + + x = self.norm(x) + x = self.avgpool(x) + x = self.flatten(x) + x = self.head(x) + return x + + +""" +## Train the model + +Now with all the components in place and the architecture actually built, we are ready to +put it to good use. + +In this section, we train our Focal Modulation model on the CIFAR-10 dataset. +""" + +""" +### Visualization Callback + +A key feature of the Focal Modulation Network is explicit input-dependency. This means +the modulator is calculated by looking at the local features around the target location, +so it depends on the input. In very simple terms, this makes interpretation easy. We can +simply lay down the gating values and the original image, next to each other to see how +the gating mechanism works. + +The authors of the paper visualize the gates and the modulator in order to focus on the +interpretability of the Focal Modulation layer. Below is a visualization +callback that shows the gates and modulator of a specific layer in the model while the +model trains. + +We will notice later that as the model trains, the visualizations get better. + +The gates appear to selectively permit certain aspects of the input image to pass +through, while gently disregarding others, ultimately leading to improved classification +accuracy. +""" + + +def display_grid( + test_images: tf.Tensor, + gates: tf.Tensor, + modulator: tf.Tensor, +): + """Displays the image with the gates and modulator overlayed. + + Args: + test_images (tf.Tensor): A batch of test images. + gates (tf.Tensor): The gates of the Focal Modualtion Layer. + modulator (tf.Tensor): The modulator of the Focal Modulation Layer. + """ + fig, ax = plt.subplots(nrows=1, ncols=5, figsize=(25, 5)) + + # Radomly sample an image from the batch. + index = randint(0, BATCH_SIZE - 1) + orig_image = test_images[index] + gate_image = gates[index] + modulator_image = modulator[index] + + # Original Image + ax[0].imshow(orig_image) + ax[0].set_title("Original:") + ax[0].axis("off") + + for index in range(1, 5): + img = ax[index].imshow(orig_image) + if index != 4: + overlay_image = gate_image[..., index - 1] + title = f"G {index}:" + else: + overlay_image = tf.norm(modulator_image, ord=2, axis=-1) + title = f"MOD:" + + ax[index].imshow( + overlay_image, cmap="inferno", alpha=0.6, extent=img.get_extent() + ) + ax[index].set_title(title) + ax[index].axis("off") + + plt.axis("off") + plt.show() + plt.close() + + +""" +### TrainMonitor +""" + +# Taking a batch of test inputs to measure the model's progress. +test_images, test_labels = next(iter(test_ds)) +upsampler = tf.keras.layers.UpSampling2D( + size=(4, 4), + interpolation="bilinear", +) + + +class TrainMonitor(keras.callbacks.Callback): + def __init__(self, epoch_interval=None): + self.epoch_interval = epoch_interval + + def on_epoch_end(self, epoch, logs=None): + if self.epoch_interval and epoch % self.epoch_interval == 0: + _ = self.model(test_images) + + # Take the mid layer for visualization + gates = self.model.basic_layers[1].blocks[-1].modulation.gates + gates = upsampler(gates) + modulator = self.model.basic_layers[1].blocks[-1].modulation.modulator + modulator = upsampler(modulator) + + # Display the grid of gates and modulator. 
+ display_grid(test_images=test_images, gates=gates, modulator=modulator) + + +""" +### Learning Rate scheduler +""" + + +# Some code is taken from: +# https://www.kaggle.com/ashusma/training-rfcx-tensorflow-tpu-effnet-b2. +class WarmUpCosine(keras.optimizers.schedules.LearningRateSchedule): + def __init__( + self, learning_rate_base, total_steps, warmup_learning_rate, warmup_steps + ): + super().__init__() + self.learning_rate_base = learning_rate_base + self.total_steps = total_steps + self.warmup_learning_rate = warmup_learning_rate + self.warmup_steps = warmup_steps + self.pi = tf.constant(np.pi) + + def __call__(self, step): + if self.total_steps < self.warmup_steps: + raise ValueError("Total_steps must be larger or equal to warmup_steps.") + cos_annealed_lr = tf.cos( + self.pi + * (tf.cast(step, tf.float32) - self.warmup_steps) + / float(self.total_steps - self.warmup_steps) + ) + learning_rate = 0.5 * self.learning_rate_base * (1 + cos_annealed_lr) + if self.warmup_steps > 0: + if self.learning_rate_base < self.warmup_learning_rate: + raise ValueError( + "Learning_rate_base must be larger or equal to " + "warmup_learning_rate." + ) + slope = ( + self.learning_rate_base - self.warmup_learning_rate + ) / self.warmup_steps + warmup_rate = slope * tf.cast(step, tf.float32) + self.warmup_learning_rate + learning_rate = tf.where( + step < self.warmup_steps, warmup_rate, learning_rate + ) + return tf.where( + step > self.total_steps, 0.0, learning_rate, name="learning_rate" + ) + + +total_steps = int((len(x_train) / BATCH_SIZE) * EPOCHS) +warmup_epoch_percentage = 0.15 +warmup_steps = int(total_steps * warmup_epoch_percentage) +scheduled_lrs = WarmUpCosine( + learning_rate_base=LEARNING_RATE, + total_steps=total_steps, + warmup_learning_rate=0.0, + warmup_steps=warmup_steps, +) + +""" +### Initialize, compile and train the model +""" + +focal_mod_net = FocalModulationNetwork() +optimizer = AdamW(learning_rate=scheduled_lrs, weight_decay=WEIGHT_DECAY) + +# Compile and train the model. +focal_mod_net.compile( + optimizer=optimizer, + loss="sparse_categorical_crossentropy", + metrics=["accuracy"], +) +history = focal_mod_net.fit( + train_ds, + epochs=EPOCHS, + validation_data=val_ds, + callbacks=[TrainMonitor(epoch_interval=10)], +) + +""" +## Plot loss and accuracy +""" + +plt.plot(history.history["loss"], label="loss") +plt.plot(history.history["val_loss"], label="val_loss") +plt.legend() +plt.show() + +plt.plot(history.history["accuracy"], label="accuracy") +plt.plot(history.history["val_accuracy"], label="val_accuracy") +plt.legend() +plt.show() + +""" +## Test visualizations + +Let's test our model on some test images and see how the gates look like. +""" + +test_images, test_labels = next(iter(test_ds)) +_ = focal_mod_net(test_images) + +# Take the mid layer for visualization +gates = focal_mod_net.basic_layers[1].blocks[-1].modulation.gates +gates = upsampler(gates) +modulator = focal_mod_net.basic_layers[1].blocks[-1].modulation.modulator +modulator = upsampler(modulator) + +# Plot the test images with the gates and modulator overlayed. +for row in range(5): + display_grid( + test_images=test_images, + gates=gates, + modulator=modulator, + ) + +""" +## Conclusion + +The proposed architecture, the Focal Modulation Network +architecture is a mechanism that allows different +parts of an image to interact with each other in a way that depends on the image itself. 
+It works by first gathering different levels of context information around each part of
+the image (the "query token"), then using a gate to decide which context information is
+most relevant, and finally combining the chosen information in a simple but effective
+way.
+
+This is meant as a replacement for the Self-Attention mechanism of the Transformer
+architecture. The key feature that makes this research notable is not the conception of
+attention-less networks, but rather the introduction of an equally powerful architecture
+that is interpretable.
+
+The authors also mention that they created a series of Focal Modulation Networks
+(FocalNets) that significantly outperform their Self-Attention counterparts while using
+a fraction of the parameters and pretraining data.
+
+The FocalNets architecture has the potential to deliver impressive results and offers a
+simple implementation. Its promising performance and ease of use make it an attractive
+alternative to Self-Attention for researchers to explore in their own projects. It could
+potentially become widely adopted by the Deep Learning community in the near future.
+
+## Acknowledgement
+
+We would like to thank [PyImageSearch](https://pyimagesearch.com/) for providing us with
+a Colab Pro account, [JarvisLabs.ai](https://cloud.jarvislabs.ai/) for GPU credits,
+and also Microsoft Research for providing an
+[official implementation](https://github.com/microsoft/FocalNet) of their paper.
+We would also like to extend our gratitude to the first author of the
+paper [Jianwei Yang](https://twitter.com/jw2yang4ai) who reviewed this tutorial
+extensively.
+"""
diff --git a/knowledge_base/vision/forwardforward.py b/knowledge_base/vision/forwardforward.py
new file mode 100644
index 0000000000000000000000000000000000000000..ee0d8573c322974faab63b71ee4df927629255f1
--- /dev/null
+++ b/knowledge_base/vision/forwardforward.py
@@ -0,0 +1,446 @@
+"""
+Title: Using the Forward-Forward Algorithm for Image Classification
+Author: [Suvaditya Mukherjee](https://twitter.com/halcyonrayes)
+Date created: 2023/01/08
+Last modified: 2024/09/17
+Description: Training a Dense-layer model using the Forward-Forward algorithm.
+Accelerator: GPU
+"""
+
+"""
+## Introduction
+
+The following example explores how to use the Forward-Forward algorithm to perform
+training instead of the traditionally used method of backpropagation, as proposed by
+Hinton in
+[The Forward-Forward Algorithm: Some Preliminary Investigations](https://www.cs.toronto.edu/~hinton/FFA13.pdf)
+(2022).
+
+The concept was inspired by the understanding behind
+[Boltzmann Machines](http://www.cs.toronto.edu/~fritz/absps/dbm.pdf). Backpropagation
+involves calculating the difference between actual and predicted output via a cost
+function to adjust network weights. On the other hand, the FF Algorithm suggests the
+analogy of neurons which get "excited" based on looking at a certain recognized
+combination of an image and its correct corresponding label.
+
+This method takes certain inspiration from the biological learning process that occurs in
+the cortex. A significant advantage of this method is that backpropagation through the
+network no longer needs to be performed, and that weight updates are local to the layer
+itself.
+
+As this is still an experimental method, it does not yield state-of-the-art results.
+But with proper tuning, it is supposed to come close to the same.
+Through this example, we will examine a process that allows us to implement the +Forward-Forward algorithm within the layers themselves, instead of the traditional method +of relying on the global loss functions and optimizers. + +The tutorial is structured as follows: + +- Perform necessary imports +- Load the [MNIST dataset](http://yann.lecun.com/exdb/mnist/) +- Visualize Random samples from the MNIST dataset +- Define a `FFDense` Layer to override `call` and implement a custom `forwardforward` +method which performs weight updates. +- Define a `FFNetwork` Layer to override `train_step`, `predict` and implement 2 custom +functions for per-sample prediction and overlaying labels +- Convert MNIST from `NumPy` arrays to `tf.data.Dataset` +- Fit the network +- Visualize results +- Perform inference on test samples + +As this example requires the customization of certain core functions with +`keras.layers.Layer` and `keras.models.Model`, refer to the following resources for +a primer on how to do so: + +- [Customizing what happens in `model.fit()`](https://www.tensorflow.org/guide/keras/customizing_what_happens_in_fit) +- [Making new Layers and Models via subclassing](https://www.tensorflow.org/guide/keras/custom_layers_and_models) +""" + +""" +## Setup imports +""" +import os + +os.environ["KERAS_BACKEND"] = "tensorflow" + +import tensorflow as tf +import keras +from keras import ops +import numpy as np +import matplotlib.pyplot as plt +from sklearn.metrics import accuracy_score +import random +from tensorflow.compiler.tf2xla.python import xla + +""" +## Load the dataset and visualize the data + +We use the `keras.datasets.mnist.load_data()` utility to directly pull the MNIST dataset +in the form of `NumPy` arrays. We then arrange it in the form of the train and test +splits. + +Following loading the dataset, we select 4 random samples from within the training set +and visualize them using `matplotlib.pyplot`. +""" + +(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data() + +print("4 Random Training samples and labels") +idx1, idx2, idx3, idx4 = random.sample(range(0, x_train.shape[0]), 4) + +img1 = (x_train[idx1], y_train[idx1]) +img2 = (x_train[idx2], y_train[idx2]) +img3 = (x_train[idx3], y_train[idx3]) +img4 = (x_train[idx4], y_train[idx4]) + +imgs = [img1, img2, img3, img4] + +plt.figure(figsize=(10, 10)) + +for idx, item in enumerate(imgs): + image, label = item[0], item[1] + plt.subplot(2, 2, idx + 1) + plt.imshow(image, cmap="gray") + plt.title(f"Label : {label}") +plt.show() + +""" +## Define `FFDense` custom layer + +In this custom layer, we have a base `keras.layers.Dense` object which acts as the +base `Dense` layer within. Since weight updates will happen within the layer itself, we +add an `keras.optimizers.Optimizer` object that is accepted from the user. Here, we +use `Adam` as our optimizer with a rather higher learning rate of `0.03`. + +Following the algorithm's specifics, we must set a `threshold` parameter that will be +used to make the positive-negative decision in each prediction. This is set to a default +of 2.0. +As the epochs are localized to the layer itself, we also set a `num_epochs` parameter +(defaults to 50). + +We override the `call` method in order to perform a normalization over the complete +input space followed by running it through the base `Dense` layer as would happen in a +normal `Dense` layer call. 
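+
+As a quick illustration of that normalize-then-project step, here is a minimal,
+standalone sketch (the tensor shapes and values are hypothetical and are not part of
+the layer defined below):
+
+```
+import keras
+from keras import ops
+
+x = keras.random.normal((4, 784))  # a toy batch of flattened images
+dense = keras.layers.Dense(500)
+
+# Keep only the direction of each sample, then project and apply ReLU.
+x_dir = x / (ops.norm(x, ord=2, axis=1, keepdims=True) + 1e-4)
+h = keras.layers.ReLU()(dense(x_dir))
+```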
+ +We implement the Forward-Forward algorithm which accepts 2 kinds of input tensors, each +representing the positive and negative samples respectively. We write a custom training +loop here with the use of `tf.GradientTape()`, within which we calculate a loss per +sample by taking the distance of the prediction from the threshold to understand the +error and taking its mean to get a `mean_loss` metric. + +With the help of `tf.GradientTape()` we calculate the gradient updates for the trainable +base `Dense` layer and apply them using the layer's local optimizer. + +Finally, we return the `call` result as the `Dense` results of the positive and negative +samples while also returning the last `mean_loss` metric and all the loss values over a +certain all-epoch run. +""" + + +class FFDense(keras.layers.Layer): + """ + A custom ForwardForward-enabled Dense layer. It has an implementation of the + Forward-Forward network internally for use. + This layer must be used in conjunction with the `FFNetwork` model. + """ + + def __init__( + self, + units, + init_optimizer, + loss_metric, + num_epochs=50, + use_bias=True, + kernel_initializer="glorot_uniform", + bias_initializer="zeros", + kernel_regularizer=None, + bias_regularizer=None, + **kwargs, + ): + super().__init__(**kwargs) + self.dense = keras.layers.Dense( + units=units, + use_bias=use_bias, + kernel_initializer=kernel_initializer, + bias_initializer=bias_initializer, + kernel_regularizer=kernel_regularizer, + bias_regularizer=bias_regularizer, + ) + self.relu = keras.layers.ReLU() + self.optimizer = init_optimizer() + self.loss_metric = loss_metric + self.threshold = 1.5 + self.num_epochs = num_epochs + + # We perform a normalization step before we run the input through the Dense + # layer. + + def call(self, x): + x_norm = ops.norm(x, ord=2, axis=1, keepdims=True) + x_norm = x_norm + 1e-4 + x_dir = x / x_norm + res = self.dense(x_dir) + return self.relu(res) + + # The Forward-Forward algorithm is below. We first perform the Dense-layer + # operation and then get a Mean Square value for all positive and negative + # samples respectively. + # The custom loss function finds the distance between the Mean-squared + # result and the threshold value we set (a hyperparameter) that will define + # whether the prediction is positive or negative in nature. Once the loss is + # calculated, we get a mean across the entire batch combined and perform a + # gradient calculation and optimization step. This does not technically + # qualify as backpropagation since there is no gradient being + # sent to any previous layer and is completely local in nature. 
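+    # Concretely, with a per-sample goodness g = mean(h ** 2) over the layer's
+    # activations h, the loss below is softplus(threshold - g_pos) for positive
+    # samples and softplus(g_neg - threshold) for negative samples, which pushes
+    # positive goodness above the threshold and negative goodness below it.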
+ + def forward_forward(self, x_pos, x_neg): + for i in range(self.num_epochs): + with tf.GradientTape() as tape: + g_pos = ops.mean(ops.power(self.call(x_pos), 2), 1) + g_neg = ops.mean(ops.power(self.call(x_neg), 2), 1) + + loss = ops.log( + 1 + + ops.exp( + ops.concatenate( + [-g_pos + self.threshold, g_neg - self.threshold], 0 + ) + ) + ) + mean_loss = ops.cast(ops.mean(loss), dtype="float32") + self.loss_metric.update_state([mean_loss]) + gradients = tape.gradient(mean_loss, self.dense.trainable_weights) + self.optimizer.apply_gradients(zip(gradients, self.dense.trainable_weights)) + return ( + ops.stop_gradient(self.call(x_pos)), + ops.stop_gradient(self.call(x_neg)), + self.loss_metric.result(), + ) + + +""" +## Define the `FFNetwork` Custom Model + +With our custom layer defined, we also need to override the `train_step` method and +define a custom `keras.models.Model` that works with our `FFDense` layer. + +For this algorithm, we must 'embed' the labels onto the original image. To do so, we +exploit the structure of MNIST images where the top-left 10 pixels are always zeros. We +use that as a label space in order to visually one-hot-encode the labels within the image +itself. This action is performed by the `overlay_y_on_x` function. + +We break down the prediction function with a per-sample prediction function which is then +called over the entire test set by the overriden `predict()` function. The prediction is +performed here with the help of measuring the `excitation` of the neurons per layer for +each image. This is then summed over all layers to calculate a network-wide 'goodness +score'. The label with the highest 'goodness score' is then chosen as the sample +prediction. + +The `train_step` function is overriden to act as the main controlling loop for running +training on each layer as per the number of epochs per layer. +""" + + +class FFNetwork(keras.Model): + """ + A `keras.Model` that supports a `FFDense` network creation. This model + can work for any kind of classification task. It has an internal + implementation with some details specific to the MNIST dataset which can be + changed as per the use-case. + """ + + # Since each layer runs gradient-calculation and optimization locally, each + # layer has its own optimizer that we pass. As a standard choice, we pass + # the `Adam` optimizer with a default learning rate of 0.03 as that was + # found to be the best rate after experimentation. + # Loss is tracked using `loss_var` and `loss_count` variables. + + def __init__( + self, + dims, + init_layer_optimizer=lambda: keras.optimizers.Adam(learning_rate=0.03), + **kwargs, + ): + super().__init__(**kwargs) + self.init_layer_optimizer = init_layer_optimizer + self.loss_var = keras.Variable(0.0, trainable=False, dtype="float32") + self.loss_count = keras.Variable(0.0, trainable=False, dtype="float32") + self.layer_list = [keras.Input(shape=(dims[0],))] + self.metrics_built = False + for d in range(len(dims) - 1): + self.layer_list += [ + FFDense( + dims[d + 1], + init_optimizer=self.init_layer_optimizer, + loss_metric=keras.metrics.Mean(), + ) + ] + + # This function makes a dynamic change to the image wherein the labels are + # put on top of the original image (for this example, as MNIST has 10 + # unique labels, we take the top-left corner's first 10 pixels). This + # function returns the original data tensor with the first 10 pixels being + # a pixel-based one-hot representation of the labels. 
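+    # For example, if `y_sample` is 3 and the maximum pixel value of the sample
+    # is 1.0, the first ten pixels of the flattened image become
+    # [0, 0, 0, 1.0, 0, 0, 0, 0, 0, 0], while the remaining pixels are untouched.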
+ + @tf.function(reduce_retracing=True) + def overlay_y_on_x(self, data): + X_sample, y_sample = data + max_sample = ops.amax(X_sample, axis=0, keepdims=True) + max_sample = ops.cast(max_sample, dtype="float64") + X_zeros = ops.zeros([10], dtype="float64") + X_update = xla.dynamic_update_slice(X_zeros, max_sample, [y_sample]) + X_sample = xla.dynamic_update_slice(X_sample, X_update, [0]) + return X_sample, y_sample + + # A custom `predict_one_sample` performs predictions by passing the images + # through the network, measures the results produced by each layer (i.e. + # how high/low the output values are with respect to the set threshold for + # each label) and then simply finding the label with the highest values. + # In such a case, the images are tested for their 'goodness' with all + # labels. + + @tf.function(reduce_retracing=True) + def predict_one_sample(self, x): + goodness_per_label = [] + x = ops.reshape(x, [ops.shape(x)[0] * ops.shape(x)[1]]) + for label in range(10): + h, label = self.overlay_y_on_x(data=(x, label)) + h = ops.reshape(h, [-1, ops.shape(h)[0]]) + goodness = [] + for layer_idx in range(1, len(self.layer_list)): + layer = self.layer_list[layer_idx] + h = layer(h) + goodness += [ops.mean(ops.power(h, 2), 1)] + goodness_per_label += [ops.expand_dims(ops.sum(goodness, keepdims=True), 1)] + goodness_per_label = tf.concat(goodness_per_label, 1) + return ops.cast(ops.argmax(goodness_per_label, 1), dtype="float64") + + def predict(self, data): + x = data + preds = list() + preds = ops.vectorized_map(self.predict_one_sample, x) + return np.asarray(preds, dtype=int) + + # This custom `train_step` function overrides the internal `train_step` + # implementation. We take all the input image tensors, flatten them and + # subsequently produce positive and negative samples on the images. + # A positive sample is an image that has the right label encoded on it with + # the `overlay_y_on_x` function. A negative sample is an image that has an + # erroneous label present on it. + # With the samples ready, we pass them through each `FFLayer` and perform + # the Forward-Forward computation on it. The returned loss is the final + # loss value over all the layers. + + @tf.function(jit_compile=False) + def train_step(self, data): + x, y = data + + if not self.metrics_built: + # build metrics to ensure they can be queried without erroring out. + # We can't update the metrics' state, as we would usually do, since + # we do not perform predictions within the train step + for metric in self.metrics: + if hasattr(metric, "build"): + metric.build(y, y) + self.metrics_built = True + + # Flatten op + x = ops.reshape(x, [-1, ops.shape(x)[1] * ops.shape(x)[2]]) + + x_pos, y = ops.vectorized_map(self.overlay_y_on_x, (x, y)) + + random_y = tf.random.shuffle(y) + x_neg, y = tf.map_fn(self.overlay_y_on_x, (x, random_y)) + + h_pos, h_neg = x_pos, x_neg + + for idx, layer in enumerate(self.layers): + if isinstance(layer, FFDense): + print(f"Training layer {idx+1} now : ") + h_pos, h_neg, loss = layer.forward_forward(h_pos, h_neg) + self.loss_var.assign_add(loss) + self.loss_count.assign_add(1.0) + else: + print(f"Passing layer {idx+1} now : ") + x = layer(x) + mean_res = ops.divide(self.loss_var, self.loss_count) + return {"FinalLoss": mean_res} + + +""" +## Convert MNIST `NumPy` arrays to `tf.data.Dataset` + +We now perform some preliminary processing on the `NumPy` arrays and then convert them +into the `tf.data.Dataset` format which allows for optimized loading. 
+""" + +x_train = x_train.astype(float) / 255 +x_test = x_test.astype(float) / 255 +y_train = y_train.astype(int) +y_test = y_test.astype(int) + +train_dataset = tf.data.Dataset.from_tensor_slices((x_train, y_train)) +test_dataset = tf.data.Dataset.from_tensor_slices((x_test, y_test)) + +train_dataset = train_dataset.batch(60000) +test_dataset = test_dataset.batch(10000) + +""" +## Fit the network and visualize results + +Having performed all previous set-up, we are now going to run `model.fit()` and run 250 +model epochs, which will perform 50*250 epochs on each layer. We get to see the plotted loss +curve as each layer is trained. +""" + +model = FFNetwork(dims=[784, 500, 500]) + +model.compile( + optimizer=keras.optimizers.Adam(learning_rate=0.03), + loss="mse", + jit_compile=False, + metrics=[], +) + +epochs = 250 +history = model.fit(train_dataset, epochs=epochs) + +""" +## Perform inference and testing + +Having trained the model to a large extent, we now see how it performs on the +test set. We calculate the Accuracy Score to understand the results closely. +""" + +preds = model.predict(ops.convert_to_tensor(x_test)) + +preds = preds.reshape((preds.shape[0], preds.shape[1])) + +results = accuracy_score(preds, y_test) + +print(f"Test Accuracy score : {results*100}%") + +plt.plot(range(len(history.history["FinalLoss"])), history.history["FinalLoss"]) +plt.title("Loss over training") +plt.show() + +""" +## Conclusion + +This example has hereby demonstrated how the Forward-Forward algorithm works using +the TensorFlow and Keras packages. While the investigation results presented by Prof. Hinton +in their paper are currently still limited to smaller models and datasets like MNIST and +Fashion-MNIST, subsequent results on larger models like LLMs are expected in future +papers. + +Through the paper, Prof. Hinton has reported results of 1.36% test accuracy error with a +2000-units, 4 hidden-layer, fully-connected network run over 60 epochs (while mentioning +that backpropagation takes only 20 epochs to achieve similar performance). Another run of +doubling the learning rate and training for 40 epochs yields a slightly worse error rate +of 1.46% + +The current example does not yield state-of-the-art results. But with proper tuning of +the Learning Rate, model architecture (number of units in `Dense` layers, kernel +activations, initializations, regularization etc.), the results can be improved +to match the claims of the paper. +""" diff --git a/knowledge_base/vision/fully_convolutional_network.py b/knowledge_base/vision/fully_convolutional_network.py new file mode 100644 index 0000000000000000000000000000000000000000..aa792f17185c40cea8e49e7183a165e68678a210 --- /dev/null +++ b/knowledge_base/vision/fully_convolutional_network.py @@ -0,0 +1,634 @@ +""" +Title: Image Segmentation using Composable Fully-Convolutional Networks +Author: [Suvaditya Mukherjee](https://twitter.com/halcyonrayes) +Date created: 2023/06/16 +Last modified: 2023/12/25 +Description: Using the Fully-Convolutional Network for Image Segmentation. +Accelerator: GPU +""" + +""" +## Introduction + +The following example walks through the steps to implement Fully-Convolutional Networks +for Image Segmentation on the Oxford-IIIT Pets dataset. +The model was proposed in the paper, +[Fully Convolutional Networks for Semantic Segmentation by Long et. al.(2014)](https://arxiv.org/abs/1411.4038). 
+Image segmentation is one of the most common and introductory tasks when it comes to +Computer Vision, where we extend the problem of Image Classification from +one-label-per-image to a pixel-wise classification problem. +In this example, we will assemble the aforementioned Fully-Convolutional Segmentation architecture, +capable of performing Image Segmentation. +The network extends the pooling layer outputs from the VGG in order to perform upsampling +and get a final result. The intermediate outputs coming from the 3rd, 4th and 5th Max-Pooling layers from VGG19 are +extracted out and upsampled at different levels and factors to get a final output with the same shape as that +of the output, but with the class of each pixel present at each location, instead of pixel intensity values. +Different intermediate pool layers are extracted and processed upon for different versions of the network. +The FCN architecture has 3 versions of differing quality. + +- FCN-32S +- FCN-16S +- FCN-8S + +All versions of the model derive their outputs through an iterative processing of +successive intermediate pool layers of the main backbone used. +A better idea can be gained from the figure below. + +| ![FCN Architecture](https://i.imgur.com/Ttros06.png) | +| :--: | +| **Diagram 1**: Combined Architecture Versions (Source: Paper) | + +To get a better idea on Image Segmentation or find more pre-trained models, feel free to +navigate to the [Hugging Face Image Segmentation Models](https://huggingface.co/models?pipeline_tag=image-segmentation) page, +or a [PyImageSearch Blog on Semantic Segmentation](https://pyimagesearch.com/2018/09/03/semantic-segmentation-with-opencv-and-deep-learning/) + +""" + +""" +## Setup Imports +""" + +import os + +os.environ["KERAS_BACKEND"] = "tensorflow" +import keras +from keras import ops +import tensorflow as tf +import matplotlib.pyplot as plt +import tensorflow_datasets as tfds +import numpy as np + +AUTOTUNE = tf.data.AUTOTUNE + +""" +## Set configurations for notebook variables + +We set the required parameters for the experiment. +The chosen dataset has a total of 4 classes per image, with regards to the segmentation mask. +We also set our hyperparameters in this cell. + +Mixed Precision as an option is also available in systems which support it, to reduce +load. +This would make most tensors use `16-bit float` values instead of `32-bit float` +values, in places where it will not adversely affect computation. +This means, during computation, TensorFlow will use `16-bit float` Tensors to increase speed at the cost of precision, +while storing the values in their original default `32-bit float` form. +""" + +NUM_CLASSES = 4 +INPUT_HEIGHT = 224 +INPUT_WIDTH = 224 +LEARNING_RATE = 1e-3 +WEIGHT_DECAY = 1e-4 +EPOCHS = 20 +BATCH_SIZE = 32 +MIXED_PRECISION = True +SHUFFLE = True + +# Mixed-precision setting +if MIXED_PRECISION: + policy = keras.mixed_precision.Policy("mixed_float16") + keras.mixed_precision.set_global_policy(policy) + +""" +## Load dataset + +We make use of the [Oxford-IIIT Pets dataset](http://www.robots.ox.ac.uk/~vgg/data/pets/) +which contains a total of 7,349 samples and their segmentation masks. +We have 37 classes, with roughly 200 samples per class. +Our training and validation dataset has 3,128 and 552 samples respectively. +Aside from this, our test split has a total of 3,669 samples. + +We set a `batch_size` parameter that will batch our samples together, use a `shuffle` +parameter to mix our samples together. 
+""" + +(train_ds, valid_ds, test_ds) = tfds.load( + "oxford_iiit_pet", + split=["train[:85%]", "train[85%:]", "test"], + batch_size=BATCH_SIZE, + shuffle_files=SHUFFLE, +) + +""" +## Unpack and preprocess dataset + +We define a simple function that includes performs Resizing over our +training, validation and test datasets. +We do the same process on the masks as well, to make sure both are aligned in terms of shape and size. +""" + + +# Image and Mask Pre-processing +def unpack_resize_data(section): + image = section["image"] + segmentation_mask = section["segmentation_mask"] + + resize_layer = keras.layers.Resizing(INPUT_HEIGHT, INPUT_WIDTH) + + image = resize_layer(image) + segmentation_mask = resize_layer(segmentation_mask) + + return image, segmentation_mask + + +train_ds = train_ds.map(unpack_resize_data, num_parallel_calls=AUTOTUNE) +valid_ds = valid_ds.map(unpack_resize_data, num_parallel_calls=AUTOTUNE) +test_ds = test_ds.map(unpack_resize_data, num_parallel_calls=AUTOTUNE) +""" +## Visualize one random sample from the pre-processed dataset + +We visualize what a random sample in our test split of the dataset looks like, and plot +the segmentation mask on top to see the effective mask areas. +Note that we have performed pre-processing on this dataset too, +which makes the image and mask size same. +""" + +# Select random image and mask. Cast to NumPy array +# for Matplotlib visualization. + +images, masks = next(iter(test_ds)) +random_idx = keras.random.uniform([], minval=0, maxval=BATCH_SIZE, seed=10) + +test_image = images[int(random_idx)].numpy().astype("float") +test_mask = masks[int(random_idx)].numpy().astype("float") + +# Overlay segmentation mask on top of image. +fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 5)) + +ax[0].set_title("Image") +ax[0].imshow(test_image / 255.0) + +ax[1].set_title("Image with segmentation mask overlay") +ax[1].imshow(test_image / 255.0) +ax[1].imshow( + test_mask, + cmap="inferno", + alpha=0.6, +) +plt.show() + +""" +## Perform VGG-specific pre-processing + +`keras.applications.VGG19` requires the use of a `preprocess_input` function that will +pro-actively perform Image-net style Standard Deviation Normalization scheme. +""" + + +def preprocess_data(image, segmentation_mask): + image = keras.applications.vgg19.preprocess_input(image) + + return image, segmentation_mask + + +train_ds = ( + train_ds.map(preprocess_data, num_parallel_calls=AUTOTUNE) + .shuffle(buffer_size=1024) + .prefetch(buffer_size=1024) +) +valid_ds = ( + valid_ds.map(preprocess_data, num_parallel_calls=AUTOTUNE) + .shuffle(buffer_size=1024) + .prefetch(buffer_size=1024) +) +test_ds = ( + test_ds.map(preprocess_data, num_parallel_calls=AUTOTUNE) + .shuffle(buffer_size=1024) + .prefetch(buffer_size=1024) +) +""" +## Model Definition + +The Fully-Convolutional Network boasts a simple architecture composed of only +`keras.layers.Conv2D` Layers, `keras.layers.Dense` layers and `keras.layers.Dropout` +layers. + +| ![FCN Architecture](https://i.imgur.com/PerTKjf.png) | +| :--: | +| **Diagram 2**: Generic FCN Forward Pass (Source: Paper)| + +Pixel-wise prediction is performed by having a Softmax Convolutional layer with the same +size of the image, such that we can perform direct comparison +We can find several important metrics such as Accuracy and Mean-Intersection-over-Union on the network. 
+""" + +""" +### Backbone (VGG-19) + +We use the [VGG-19 network](https://keras.io/api/applications/vgg/) as the backbone, as +the paper suggests it to be one of the most effective backbones for this network. +We extract different outputs from the network by making use of `keras.models.Model`. +Following this, we add layers on top to make a network perfectly simulating that of +Diagram 1. +The backbone's `keras.layers.Dense` layers will be converted to `keras.layers.Conv2D` +layers based on the [original Caffe code present here.](https://github.com/linxi159/FCN-caffe/blob/master/pascalcontext-fcn16s/net.py) +All 3 networks will share the same backbone weights, but will have differing results +based on their extensions. +We make the backbone non-trainable to improve training time requirements. +It is also noted in the paper that making the network trainable does not yield major benefits. +""" + +input_layer = keras.Input(shape=(INPUT_HEIGHT, INPUT_WIDTH, 3)) + +# VGG Model backbone with pre-trained ImageNet weights. +vgg_model = keras.applications.vgg19.VGG19(include_top=True, weights="imagenet") + +# Extracting different outputs from same model +fcn_backbone = keras.models.Model( + inputs=vgg_model.layers[1].input, + outputs=[ + vgg_model.get_layer(block_name).output + for block_name in ["block3_pool", "block4_pool", "block5_pool"] + ], +) + +# Setting backbone to be non-trainable +fcn_backbone.trainable = False + +x = fcn_backbone(input_layer) + +# Converting Dense layers to Conv2D layers +units = [4096, 4096] +dense_convs = [] + +for filter_idx in range(len(units)): + dense_conv = keras.layers.Conv2D( + filters=units[filter_idx], + kernel_size=(7, 7) if filter_idx == 0 else (1, 1), + strides=(1, 1), + activation="relu", + padding="same", + use_bias=False, + kernel_initializer=keras.initializers.Constant(1.0), + ) + dense_convs.append(dense_conv) + dropout_layer = keras.layers.Dropout(0.5) + dense_convs.append(dropout_layer) + +dense_convs = keras.Sequential(dense_convs) +dense_convs.trainable = False + +x[-1] = dense_convs(x[-1]) + +pool3_output, pool4_output, pool5_output = x + +""" +### FCN-32S + +We extend the last output, perform a `1x1 Convolution` and perform 2D Bilinear Upsampling +by a factor of 32 to get an image of the same size as that of our input. +We use a simple `keras.layers.UpSampling2D` layer over a `keras.layers.Conv2DTranspose` +since it yields performance benefits from being a deterministic mathematical operation +over a Convolutional operation +It is also noted in the paper that making the Up-sampling parameters trainable does not yield benefits. +Original experiments of the paper used Upsampling as well. 
+""" + +# 1x1 convolution to set channels = number of classes +pool5 = keras.layers.Conv2D( + filters=NUM_CLASSES, + kernel_size=(1, 1), + padding="same", + strides=(1, 1), + activation="relu", +) + +# Get Softmax outputs for all classes +fcn32s_conv_layer = keras.layers.Conv2D( + filters=NUM_CLASSES, + kernel_size=(1, 1), + activation="softmax", + padding="same", + strides=(1, 1), +) + +# Up-sample to original image size +fcn32s_upsampling = keras.layers.UpSampling2D( + size=(32, 32), + data_format=keras.backend.image_data_format(), + interpolation="bilinear", +) + +final_fcn32s_pool = pool5(pool5_output) +final_fcn32s_output = fcn32s_conv_layer(final_fcn32s_pool) +final_fcn32s_output = fcn32s_upsampling(final_fcn32s_output) + +fcn32s_model = keras.Model(inputs=input_layer, outputs=final_fcn32s_output) + +""" +### FCN-16S + +The pooling output from the FCN-32S is extended and added to the 4th-level Pooling output +of our backbone. +Following this, we upsample by a factor of 16 to get image of the same +size as that of our input. +""" + +# 1x1 convolution to set channels = number of classes +# Followed from the original Caffe implementation +pool4 = keras.layers.Conv2D( + filters=NUM_CLASSES, + kernel_size=(1, 1), + padding="same", + strides=(1, 1), + activation="linear", + kernel_initializer=keras.initializers.Zeros(), +)(pool4_output) + +# Intermediate up-sample +pool5 = keras.layers.UpSampling2D( + size=(2, 2), + data_format=keras.backend.image_data_format(), + interpolation="bilinear", +)(final_fcn32s_pool) + +# Get Softmax outputs for all classes +fcn16s_conv_layer = keras.layers.Conv2D( + filters=NUM_CLASSES, + kernel_size=(1, 1), + activation="softmax", + padding="same", + strides=(1, 1), +) + +# Up-sample to original image size +fcn16s_upsample_layer = keras.layers.UpSampling2D( + size=(16, 16), + data_format=keras.backend.image_data_format(), + interpolation="bilinear", +) + +# Add intermediate outputs +final_fcn16s_pool = keras.layers.Add()([pool4, pool5]) +final_fcn16s_output = fcn16s_conv_layer(final_fcn16s_pool) +final_fcn16s_output = fcn16s_upsample_layer(final_fcn16s_output) + +fcn16s_model = keras.models.Model(inputs=input_layer, outputs=final_fcn16s_output) + +""" +### FCN-8S + +The pooling output from the FCN-16S is extended once more, and added from the 3rd-level +Pooling output of our backbone. +This result is upsampled by a factor of 8 to get an image of the same size as that of our input. 
+""" + +# 1x1 convolution to set channels = number of classes +# Followed from the original Caffe implementation +pool3 = keras.layers.Conv2D( + filters=NUM_CLASSES, + kernel_size=(1, 1), + padding="same", + strides=(1, 1), + activation="linear", + kernel_initializer=keras.initializers.Zeros(), +)(pool3_output) + +# Intermediate up-sample +intermediate_pool_output = keras.layers.UpSampling2D( + size=(2, 2), + data_format=keras.backend.image_data_format(), + interpolation="bilinear", +)(final_fcn16s_pool) + +# Get Softmax outputs for all classes +fcn8s_conv_layer = keras.layers.Conv2D( + filters=NUM_CLASSES, + kernel_size=(1, 1), + activation="softmax", + padding="same", + strides=(1, 1), +) + +# Up-sample to original image size +fcn8s_upsample_layer = keras.layers.UpSampling2D( + size=(8, 8), + data_format=keras.backend.image_data_format(), + interpolation="bilinear", +) + +# Add intermediate outputs +final_fcn8s_pool = keras.layers.Add()([pool3, intermediate_pool_output]) +final_fcn8s_output = fcn8s_conv_layer(final_fcn8s_pool) +final_fcn8s_output = fcn8s_upsample_layer(final_fcn8s_output) + +fcn8s_model = keras.models.Model(inputs=input_layer, outputs=final_fcn8s_output) + +""" +### Load weights into backbone + +It was noted in the paper, as well as through experimentation that extracting the weights +of the last 2 Fully-connected Dense layers from the backbone, reshaping the weights to +fit that of the `keras.layers.Dense` layers we had previously converted into +`keras.layers.Conv2D`, and setting them to it yields far better results and a significant +increase in mIOU performance. +""" + +# VGG's last 2 layers +weights1 = vgg_model.get_layer("fc1").get_weights()[0] +weights2 = vgg_model.get_layer("fc2").get_weights()[0] + +weights1 = weights1.reshape(7, 7, 512, 4096) +weights2 = weights2.reshape(1, 1, 4096, 4096) + +dense_convs.layers[0].set_weights([weights1]) +dense_convs.layers[2].set_weights([weights2]) + +""" +## Training + +The original paper talks about making use of [SGD with Momentum](https://keras.io/api/optimizers/sgd/) as the optimizer of choice. +But it was noticed during experimentation that +[AdamW](https://keras.io/api/optimizers/adamw/) +yielded better results in terms of mIOU and Pixel-wise Accuracy. 
+""" + +""" +### FCN-32S +""" + +fcn32s_optimizer = keras.optimizers.AdamW( + learning_rate=LEARNING_RATE, weight_decay=WEIGHT_DECAY +) + +fcn32s_loss = keras.losses.SparseCategoricalCrossentropy() + +# Maintain mIOU and Pixel-wise Accuracy as metrics +fcn32s_model.compile( + optimizer=fcn32s_optimizer, + loss=fcn32s_loss, + metrics=[ + keras.metrics.MeanIoU(num_classes=NUM_CLASSES, sparse_y_pred=False), + keras.metrics.SparseCategoricalAccuracy(), + ], +) + +fcn32s_history = fcn32s_model.fit(train_ds, epochs=EPOCHS, validation_data=valid_ds) + +""" +### FCN-16S +""" + +fcn16s_optimizer = keras.optimizers.AdamW( + learning_rate=LEARNING_RATE, weight_decay=WEIGHT_DECAY +) + +fcn16s_loss = keras.losses.SparseCategoricalCrossentropy() + +# Maintain mIOU and Pixel-wise Accuracy as metrics +fcn16s_model.compile( + optimizer=fcn16s_optimizer, + loss=fcn16s_loss, + metrics=[ + keras.metrics.MeanIoU(num_classes=NUM_CLASSES, sparse_y_pred=False), + keras.metrics.SparseCategoricalAccuracy(), + ], +) + +fcn16s_history = fcn16s_model.fit(train_ds, epochs=EPOCHS, validation_data=valid_ds) + +""" +### FCN-8S +""" + +fcn8s_optimizer = keras.optimizers.AdamW( + learning_rate=LEARNING_RATE, weight_decay=WEIGHT_DECAY +) + +fcn8s_loss = keras.losses.SparseCategoricalCrossentropy() + +# Maintain mIOU and Pixel-wise Accuracy as metrics +fcn8s_model.compile( + optimizer=fcn8s_optimizer, + loss=fcn8s_loss, + metrics=[ + keras.metrics.MeanIoU(num_classes=NUM_CLASSES, sparse_y_pred=False), + keras.metrics.SparseCategoricalAccuracy(), + ], +) + +fcn8s_history = fcn8s_model.fit(train_ds, epochs=EPOCHS, validation_data=valid_ds) +""" +## Visualizations +""" + +""" +### Plotting metrics for training run + +We perform a comparative study between all 3 versions of the model by tracking training +and validation metrics of Accuracy, Loss and Mean IoU. +""" + +total_plots = len(fcn32s_history.history) +cols = total_plots // 2 + +rows = total_plots // cols + +if total_plots % cols != 0: + rows += 1 + +# Set all history dictionary objects +fcn32s_dict = fcn32s_history.history +fcn16s_dict = fcn16s_history.history +fcn8s_dict = fcn8s_history.history + +pos = range(1, total_plots + 1) +plt.figure(figsize=(15, 10)) + +for i, ((key_32s, value_32s), (key_16s, value_16s), (key_8s, value_8s)) in enumerate( + zip(fcn32s_dict.items(), fcn16s_dict.items(), fcn8s_dict.items()) +): + plt.subplot(rows, cols, pos[i]) + plt.plot(range(len(value_32s)), value_32s) + plt.plot(range(len(value_16s)), value_16s) + plt.plot(range(len(value_8s)), value_8s) + plt.title(str(key_32s) + " (combined)") + plt.legend(["FCN-32S", "FCN-16S", "FCN-8S"]) + +plt.show() + +""" +### Visualizing predicted segmentation masks + +To understand the results and see them better, we pick a random image from the test +dataset and perform inference on it to see the masks generated by each model. +Note: For better results, the model must be trained for a higher number of epochs. +""" + +images, masks = next(iter(test_ds)) +random_idx = keras.random.uniform([], minval=0, maxval=BATCH_SIZE, seed=10) + +# Get random test image and mask +test_image = images[int(random_idx)].numpy().astype("float") +test_mask = masks[int(random_idx)].numpy().astype("float") + +pred_image = ops.expand_dims(test_image, axis=0) +pred_image = keras.applications.vgg19.preprocess_input(pred_image) + +# Perform inference on FCN-32S +pred_mask_32s = fcn32s_model.predict(pred_image, verbose=0).astype("float") +pred_mask_32s = np.argmax(pred_mask_32s, axis=-1) +pred_mask_32s = pred_mask_32s[0, ...] 
+ +# Perform inference on FCN-16S +pred_mask_16s = fcn16s_model.predict(pred_image, verbose=0).astype("float") +pred_mask_16s = np.argmax(pred_mask_16s, axis=-1) +pred_mask_16s = pred_mask_16s[0, ...] + +# Perform inference on FCN-8S +pred_mask_8s = fcn8s_model.predict(pred_image, verbose=0).astype("float") +pred_mask_8s = np.argmax(pred_mask_8s, axis=-1) +pred_mask_8s = pred_mask_8s[0, ...] + +# Plot all results +fig, ax = plt.subplots(nrows=2, ncols=3, figsize=(15, 8)) + +fig.delaxes(ax[0, 2]) + +ax[0, 0].set_title("Image") +ax[0, 0].imshow(test_image / 255.0) + +ax[0, 1].set_title("Image with ground truth overlay") +ax[0, 1].imshow(test_image / 255.0) +ax[0, 1].imshow( + test_mask, + cmap="inferno", + alpha=0.6, +) + +ax[1, 0].set_title("Image with FCN-32S mask overlay") +ax[1, 0].imshow(test_image / 255.0) +ax[1, 0].imshow(pred_mask_32s, cmap="inferno", alpha=0.6) + +ax[1, 1].set_title("Image with FCN-16S mask overlay") +ax[1, 1].imshow(test_image / 255.0) +ax[1, 1].imshow(pred_mask_16s, cmap="inferno", alpha=0.6) + +ax[1, 2].set_title("Image with FCN-8S mask overlay") +ax[1, 2].imshow(test_image / 255.0) +ax[1, 2].imshow(pred_mask_8s, cmap="inferno", alpha=0.6) + +plt.show() + +""" +## Conclusion + +The Fully-Convolutional Network is an exceptionally simple network that has yielded +strong results in Image Segmentation tasks across different benchmarks. +With the advent of better mechanisms like [Attention](https://arxiv.org/abs/1706.03762) as used in +[SegFormer](https://arxiv.org/abs/2105.15203) and +[DeTR](https://arxiv.org/abs/2005.12872), this model serves as a quick way to iterate and +find baselines for this task on unknown data. +""" + +""" +## Acknowledgements + +I thank [Aritra Roy Gosthipaty](https://twitter.com/ariG23498), [Ayush +Thakur](https://twitter.com/ayushthakur0) and [Ritwik +Raha](https://twitter.com/ritwik_raha) for giving a preliminary review of the example. +I also thank the [Google Developer +Experts](https://developers.google.com/community/experts) program. + +""" diff --git a/knowledge_base/vision/grad_cam.py b/knowledge_base/vision/grad_cam.py new file mode 100644 index 0000000000000000000000000000000000000000..b6b6531c216ca330a1dbc35ab511e7077840a0b7 --- /dev/null +++ b/knowledge_base/vision/grad_cam.py @@ -0,0 +1,203 @@ +""" +Title: Grad-CAM class activation visualization +Author: [fchollet](https://twitter.com/fchollet) +Date created: 2020/04/26 +Last modified: 2021/03/07 +Description: How to obtain a class activation heatmap for an image classification model. +Accelerator: GPU +""" + +""" +Adapted from Deep Learning with Python (2017). + +## Setup +""" + +import os + +os.environ["KERAS_BACKEND"] = "tensorflow" + +import numpy as np +import tensorflow as tf +import keras + +# Display +from IPython.display import Image, display +import matplotlib as mpl +import matplotlib.pyplot as plt + + +""" +## Configurable parameters + +You can change these to another model. + +To get the values for `last_conv_layer_name` use `model.summary()` +to see the names of all layers in the model. 
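+
+For instance, one quick way to inspect the layer names of the Xception architecture used
+below (using random weights here just to avoid downloading the pretrained ones) is:
+
+```
+keras.applications.xception.Xception(weights=None).summary()
+```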
+""" + +model_builder = keras.applications.xception.Xception +img_size = (299, 299) +preprocess_input = keras.applications.xception.preprocess_input +decode_predictions = keras.applications.xception.decode_predictions + +last_conv_layer_name = "block14_sepconv2_act" + +# The local path to our target image +img_path = keras.utils.get_file( + "african_elephant.jpg", "https://i.imgur.com/Bvro0YD.png" +) + +display(Image(img_path)) + + +""" +## The Grad-CAM algorithm +""" + + +def get_img_array(img_path, size): + # `img` is a PIL image of size 299x299 + img = keras.utils.load_img(img_path, target_size=size) + # `array` is a float32 Numpy array of shape (299, 299, 3) + array = keras.utils.img_to_array(img) + # We add a dimension to transform our array into a "batch" + # of size (1, 299, 299, 3) + array = np.expand_dims(array, axis=0) + return array + + +def make_gradcam_heatmap(img_array, model, last_conv_layer_name, pred_index=None): + # First, we create a model that maps the input image to the activations + # of the last conv layer as well as the output predictions + grad_model = keras.models.Model( + model.inputs, [model.get_layer(last_conv_layer_name).output, model.output] + ) + + # Then, we compute the gradient of the top predicted class for our input image + # with respect to the activations of the last conv layer + with tf.GradientTape() as tape: + last_conv_layer_output, preds = grad_model(img_array) + if pred_index is None: + pred_index = tf.argmax(preds[0]) + class_channel = preds[:, pred_index] + + # This is the gradient of the output neuron (top predicted or chosen) + # with regard to the output feature map of the last conv layer + grads = tape.gradient(class_channel, last_conv_layer_output) + + # This is a vector where each entry is the mean intensity of the gradient + # over a specific feature map channel + pooled_grads = tf.reduce_mean(grads, axis=(0, 1, 2)) + + # We multiply each channel in the feature map array + # by "how important this channel is" with regard to the top predicted class + # then sum all the channels to obtain the heatmap class activation + last_conv_layer_output = last_conv_layer_output[0] + heatmap = last_conv_layer_output @ pooled_grads[..., tf.newaxis] + heatmap = tf.squeeze(heatmap) + + # For visualization purpose, we will also normalize the heatmap between 0 & 1 + heatmap = tf.maximum(heatmap, 0) / tf.math.reduce_max(heatmap) + return heatmap.numpy() + + +""" +## Let's test-drive it +""" + +# Prepare image +img_array = preprocess_input(get_img_array(img_path, size=img_size)) + +# Make model +model = model_builder(weights="imagenet") + +# Remove last layer's softmax +model.layers[-1].activation = None + +# Print what the top predicted class is +preds = model.predict(img_array) +print("Predicted:", decode_predictions(preds, top=1)[0]) + +# Generate class activation heatmap +heatmap = make_gradcam_heatmap(img_array, model, last_conv_layer_name) + +# Display heatmap +plt.matshow(heatmap) +plt.show() + + +""" +## Create a superimposed visualization +""" + + +def save_and_display_gradcam(img_path, heatmap, cam_path="cam.jpg", alpha=0.4): + # Load the original image + img = keras.utils.load_img(img_path) + img = keras.utils.img_to_array(img) + + # Rescale heatmap to a range 0-255 + heatmap = np.uint8(255 * heatmap) + + # Use jet colormap to colorize heatmap + jet = mpl.colormaps["jet"] + + # Use RGB values of the colormap + jet_colors = jet(np.arange(256))[:, :3] + jet_heatmap = jet_colors[heatmap] + + # Create an image with RGB colorized heatmap + jet_heatmap = 
keras.utils.array_to_img(jet_heatmap) + jet_heatmap = jet_heatmap.resize((img.shape[1], img.shape[0])) + jet_heatmap = keras.utils.img_to_array(jet_heatmap) + + # Superimpose the heatmap on original image + superimposed_img = jet_heatmap * alpha + img + superimposed_img = keras.utils.array_to_img(superimposed_img) + + # Save the superimposed image + superimposed_img.save(cam_path) + + # Display Grad CAM + display(Image(cam_path)) + + +save_and_display_gradcam(img_path, heatmap) + +""" +## Let's try another image + +We will see how the grad cam explains the model's outputs for a multi-label image. Let's +try an image with a cat and a dog together, and see how the grad cam behaves. +""" + +img_path = keras.utils.get_file( + "cat_and_dog.jpg", + "https://storage.googleapis.com/petbacker/images/blog/2017/dog-and-cat-cover.jpg", +) + +display(Image(img_path)) + +# Prepare image +img_array = preprocess_input(get_img_array(img_path, size=img_size)) + +# Print what the two top predicted classes are +preds = model.predict(img_array) +print("Predicted:", decode_predictions(preds, top=2)[0]) + +""" +We generate class activation heatmap for "chow," the class index is 260 +""" + +heatmap = make_gradcam_heatmap(img_array, model, last_conv_layer_name, pred_index=260) + +save_and_display_gradcam(img_path, heatmap) + +""" +We generate class activation heatmap for "egyptian cat," the class index is 285 +""" + +heatmap = make_gradcam_heatmap(img_array, model, last_conv_layer_name, pred_index=285) + +save_and_display_gradcam(img_path, heatmap) diff --git a/knowledge_base/vision/gradient_centralization.py b/knowledge_base/vision/gradient_centralization.py new file mode 100644 index 0000000000000000000000000000000000000000..aefc5e3e9ff1e00fe353c29b467926bacc4cd7aa --- /dev/null +++ b/knowledge_base/vision/gradient_centralization.py @@ -0,0 +1,273 @@ +""" +Title: Gradient Centralization for Better Training Performance +Author: [Rishit Dagli](https://github.com/Rishit-dagli) +Date created: 06/18/21 +Last modified: 07/25/23 +Description: Implement Gradient Centralization to improve training performance of DNNs. +Accelerator: GPU +Converted to Keras 3 by: [Muhammad Anas Raza](https://anasrz.com) +""" + +""" +## Introduction + +This example implements [Gradient Centralization](https://arxiv.org/abs/2004.01461), a +new optimization technique for Deep Neural Networks by Yong et al., and demonstrates it +on Laurence Moroney's [Horses or Humans +Dataset](https://www.tensorflow.org/datasets/catalog/horses_or_humans). Gradient +Centralization can both speedup training process and improve the final generalization +performance of DNNs. It operates directly on gradients by centralizing the gradient +vectors to have zero mean. Gradient Centralization morever improves the Lipschitzness of +the loss function and its gradient so that the training process becomes more efficient +and stable. + +This example requires `tensorflow_datasets` which can be installed with this command: + +``` +pip install tensorflow-datasets +``` +""" + +""" +## Setup +""" + +from time import time + +import keras +from keras import layers +from keras.optimizers import RMSprop +from keras import ops + +from tensorflow import data as tf_data +import tensorflow_datasets as tfds + + +""" +## Prepare the data + +For this example, we will be using the [Horses or Humans +dataset](https://www.tensorflow.org/datasets/catalog/horses_or_humans). 
+""" + +num_classes = 2 +input_shape = (300, 300, 3) +dataset_name = "horses_or_humans" +batch_size = 128 +AUTOTUNE = tf_data.AUTOTUNE + +(train_ds, test_ds), metadata = tfds.load( + name=dataset_name, + split=[tfds.Split.TRAIN, tfds.Split.TEST], + with_info=True, + as_supervised=True, +) + +print(f"Image shape: {metadata.features['image'].shape}") +print(f"Training images: {metadata.splits['train'].num_examples}") +print(f"Test images: {metadata.splits['test'].num_examples}") + +""" +## Use Data Augmentation + +We will rescale the data to `[0, 1]` and perform simple augmentations to our data. +""" + +rescale = layers.Rescaling(1.0 / 255) + +data_augmentation = [ + layers.RandomFlip("horizontal_and_vertical"), + layers.RandomRotation(0.3), + layers.RandomZoom(0.2), +] + + +# Helper to apply augmentation +def apply_aug(x): + for aug in data_augmentation: + x = aug(x) + return x + + +def prepare(ds, shuffle=False, augment=False): + # Rescale dataset + ds = ds.map(lambda x, y: (rescale(x), y), num_parallel_calls=AUTOTUNE) + + if shuffle: + ds = ds.shuffle(1024) + + # Batch dataset + ds = ds.batch(batch_size) + + # Use data augmentation only on the training set + if augment: + ds = ds.map( + lambda x, y: (apply_aug(x), y), + num_parallel_calls=AUTOTUNE, + ) + + # Use buffered prefecting + return ds.prefetch(buffer_size=AUTOTUNE) + + +""" +Rescale and augment the data +""" + +train_ds = prepare(train_ds, shuffle=True, augment=True) +test_ds = prepare(test_ds) +""" +## Define a model + +In this section we will define a Convolutional neural network. +""" + +model = keras.Sequential( + [ + layers.Input(shape=input_shape), + layers.Conv2D(16, (3, 3), activation="relu"), + layers.MaxPooling2D(2, 2), + layers.Conv2D(32, (3, 3), activation="relu"), + layers.Dropout(0.5), + layers.MaxPooling2D(2, 2), + layers.Conv2D(64, (3, 3), activation="relu"), + layers.Dropout(0.5), + layers.MaxPooling2D(2, 2), + layers.Conv2D(64, (3, 3), activation="relu"), + layers.MaxPooling2D(2, 2), + layers.Conv2D(64, (3, 3), activation="relu"), + layers.MaxPooling2D(2, 2), + layers.Flatten(), + layers.Dropout(0.5), + layers.Dense(512, activation="relu"), + layers.Dense(1, activation="sigmoid"), + ] +) + +""" +## Implement Gradient Centralization + +We will now +subclass the `RMSProp` optimizer class modifying the +`keras.optimizers.Optimizer.get_gradients()` method where we now implement Gradient +Centralization. On a high level the idea is that let us say we obtain our gradients +through back propagation for a Dense or Convolution layer we then compute the mean of the +column vectors of the weight matrix, and then remove the mean from each column vector. + +The experiments in [this paper](https://arxiv.org/abs/2004.01461) on various +applications, including general image classification, fine-grained image classification, +detection and segmentation and Person ReID demonstrate that GC can consistently improve +the performance of DNN learning. + +Also, for simplicity at the moment we are not implementing gradient cliiping functionality, +however this quite easy to implement. + +At the moment we are just creating a subclass for the `RMSProp` optimizer +however you could easily reproduce this for any other optimizer or on a custom +optimizer in the same way. We will be using this class in the later section when +we train a model with Gradient Centralization. 
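+
+As a side note, the centralization step itself is easy to sanity-check in
+isolation. The short sketch below is not part of the original example; the
+helper name `centralize_gradient` and the random tensor are only illustrative.
+It applies the operation to a single gradient tensor with `keras.ops`: for any
+tensor with more than one dimension, we subtract the mean taken over every axis
+except the last one.
+
+```python
+import numpy as np
+from keras import ops
+
+
+def centralize_gradient(grad):
+    # Subtract, for each slice along the last axis, the mean taken over
+    # all remaining axes, so every slice ends up with zero mean.
+    grad = ops.convert_to_tensor(grad)
+    if len(grad.shape) > 1:
+        axes = list(range(len(grad.shape) - 1))
+        grad = grad - ops.mean(grad, axis=axes, keepdims=True)
+    return grad
+
+
+# A (3, 3, 16, 32) conv-kernel gradient keeps its shape, but each of the
+# 32 output-channel slices of the result has (numerically) zero mean.
+g = np.random.randn(3, 3, 16, 32).astype("float32")
+print(ops.mean(centralize_gradient(g), axis=(0, 1, 2)))
+```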
+""" + + +class GCRMSprop(RMSprop): + def get_gradients(self, loss, params): + # We here just provide a modified get_gradients() function since we are + # trying to just compute the centralized gradients. + + grads = [] + gradients = super().get_gradients() + for grad in gradients: + grad_len = len(grad.shape) + if grad_len > 1: + axis = list(range(grad_len - 1)) + grad -= ops.mean(grad, axis=axis, keep_dims=True) + grads.append(grad) + + return grads + + +optimizer = GCRMSprop(learning_rate=1e-4) + +""" +## Training utilities + +We will also create a callback which allows us to easily measure the total training time +and the time taken for each epoch since we are interested in comparing the effect of +Gradient Centralization on the model we built above. +""" + + +class TimeHistory(keras.callbacks.Callback): + def on_train_begin(self, logs={}): + self.times = [] + + def on_epoch_begin(self, batch, logs={}): + self.epoch_time_start = time() + + def on_epoch_end(self, batch, logs={}): + self.times.append(time() - self.epoch_time_start) + + +""" +## Train the model without GC + +We now train the model we built earlier without Gradient Centralization which we can +compare to the training performance of the model trained with Gradient Centralization. +""" + +time_callback_no_gc = TimeHistory() +model.compile( + loss="binary_crossentropy", + optimizer=RMSprop(learning_rate=1e-4), + metrics=["accuracy"], +) + +model.summary() + +""" +We also save the history since we later want to compare our model trained with and not +trained with Gradient Centralization +""" + +history_no_gc = model.fit( + train_ds, epochs=10, verbose=1, callbacks=[time_callback_no_gc] +) + +""" +## Train the model with GC + +We will now train the same model, this time using Gradient Centralization, +notice our optimizer is the one using Gradient Centralization this time. +""" + +time_callback_gc = TimeHistory() +model.compile(loss="binary_crossentropy", optimizer=optimizer, metrics=["accuracy"]) + +model.summary() + +history_gc = model.fit(train_ds, epochs=10, verbose=1, callbacks=[time_callback_gc]) + +""" +## Comparing performance +""" + +print("Not using Gradient Centralization") +print(f"Loss: {history_no_gc.history['loss'][-1]}") +print(f"Accuracy: {history_no_gc.history['accuracy'][-1]}") +print(f"Training Time: {sum(time_callback_no_gc.times)}") + +print("Using Gradient Centralization") +print(f"Loss: {history_gc.history['loss'][-1]}") +print(f"Accuracy: {history_gc.history['accuracy'][-1]}") +print(f"Training Time: {sum(time_callback_gc.times)}") + +""" +Readers are encouraged to try out Gradient Centralization on different datasets from +different domains and experiment with it's effect. You are strongly advised to check out +the [original paper](https://arxiv.org/abs/2004.01461) as well - the authors present +several studies on Gradient Centralization showing how it can improve general +performance, generalization, training time as well as more efficient. + +Many thanks to [Ali Mustufa Shaikh](https://github.com/ialimustufa) for reviewing this +implementation. 
+""" diff --git a/knowledge_base/vision/handwriting_recognition.py b/knowledge_base/vision/handwriting_recognition.py new file mode 100644 index 0000000000000000000000000000000000000000..b0fc453170df7da2e379db8f7cfbd664fa9f410d --- /dev/null +++ b/knowledge_base/vision/handwriting_recognition.py @@ -0,0 +1,580 @@ +""" +Title: Handwriting recognition +Authors: [A_K_Nain](https://twitter.com/A_K_Nain), [Sayak Paul](https://twitter.com/RisingSayak) +Date created: 2021/08/16 +Last modified: 2025/09/29 +Description: Training a handwriting recognition model with variable-length sequences. +Accelerator: GPU +""" + +""" +## Introduction + +This example shows how the [Captcha OCR](https://keras.io/examples/vision/captcha_ocr/) +example can be extended to the +[IAM Dataset](https://fki.tic.heia-fr.ch/databases/iam-handwriting-database), +which has variable length ground-truth targets. Each sample in the dataset is an image of some +handwritten text, and its corresponding target is the string present in the image. +The IAM Dataset is widely used across many OCR benchmarks, so we hope this example can serve as a +good starting point for building OCR systems. +""" + +""" +## Data collection +""" + +"""shell +wget -q https://github.com/sayakpaul/Handwriting-Recognizer-in-Keras/releases/download/v1.0.0/IAM_Words.zip +unzip -qq IAM_Words.zip + +mkdir data +mkdir data/words +tar -xf IAM_Words/words.tgz -C data/words +mv IAM_Words/words.txt data +""" + +""" +Preview how the dataset is organized. Lines prepended by "#" are just metadata information. +""" + +"""shell +head -20 data/words.txt +""" + +""" +## Imports +""" + +import keras +from keras.layers import StringLookup +from keras import ops +import matplotlib.pyplot as plt +import tensorflow as tf +import numpy as np +import os + +np.random.seed(42) +keras.utils.set_random_seed(42) + +""" +## Dataset splitting +""" + +base_path = "data" +words_list = [] + +words = open(f"{base_path}/words.txt", "r").readlines() +for line in words: + if line[0] == "#": + continue + if line.split(" ")[1] != "err": # We don't need to deal with errored entries. + words_list.append(line) + +len(words_list) + +np.random.shuffle(words_list) + +""" +We will split the dataset into three subsets with a 90:5:5 ratio (train:validation:test). +""" + +split_idx = int(0.9 * len(words_list)) +train_samples = words_list[:split_idx] +test_samples = words_list[split_idx:] + +val_split_idx = int(0.5 * len(test_samples)) +validation_samples = test_samples[:val_split_idx] +test_samples = test_samples[val_split_idx:] + +assert len(words_list) == len(train_samples) + len(validation_samples) + len( + test_samples +) + +print(f"Total training samples: {len(train_samples)}") +print(f"Total validation samples: {len(validation_samples)}") +print(f"Total test samples: {len(test_samples)}") + +""" +## Data input pipeline + +We start building our data input pipeline by first preparing the image paths. 
+""" + +base_image_path = os.path.join(base_path, "words") + + +def get_image_paths_and_labels(samples): + paths = [] + corrected_samples = [] + for i, file_line in enumerate(samples): + line_split = file_line.strip() + line_split = line_split.split(" ") + + # Each line split will have this format for the corresponding image: + # part1/part1-part2/part1-part2-part3.png + image_name = line_split[0] + partI = image_name.split("-")[0] + partII = image_name.split("-")[1] + img_path = os.path.join( + base_image_path, partI, partI + "-" + partII, image_name + ".png" + ) + if os.path.getsize(img_path): + paths.append(img_path) + corrected_samples.append(file_line.split("\n")[0]) + + return paths, corrected_samples + + +train_img_paths, train_labels = get_image_paths_and_labels(train_samples) +validation_img_paths, validation_labels = get_image_paths_and_labels(validation_samples) +test_img_paths, test_labels = get_image_paths_and_labels(test_samples) + +""" +Then we prepare the ground-truth labels. +""" + +# Find maximum length and the size of the vocabulary in the training data. +train_labels_cleaned = [] +characters = set() +max_len = 0 + +for label in train_labels: + label = label.split(" ")[-1].strip() + for char in label: + characters.add(char) + + max_len = max(max_len, len(label)) + train_labels_cleaned.append(label) + +characters = sorted(list(characters)) + +print("Maximum length: ", max_len) +print("Vocab size: ", len(characters)) + +# Check some label samples. +train_labels_cleaned[:10] + +""" +Now we clean the validation and the test labels as well. +""" + + +def clean_labels(labels): + cleaned_labels = [] + for label in labels: + label = label.split(" ")[-1].strip() + cleaned_labels.append(label) + return cleaned_labels + + +validation_labels_cleaned = clean_labels(validation_labels) +test_labels_cleaned = clean_labels(test_labels) + +""" +### Building the character vocabulary + +Keras provides different preprocessing layers to deal with different modalities of data. +[This guide](https://keras.io/api/layers/preprocessing_layers/) provides a comprehensive introduction. +Our example involves preprocessing labels at the character +level. This means that if there are two labels, e.g. "cat" and "dog", then our character +vocabulary should be {a, c, d, g, o, t} (without any special tokens). We use the +[`StringLookup`](https://keras.io/api/layers/preprocessing_layers/categorical/string_lookup/) +layer for this purpose. +""" + + +AUTOTUNE = tf.data.AUTOTUNE + +# Mapping characters to integers. +char_to_num = StringLookup(vocabulary=list(characters), mask_token=None) + +# Mapping integers back to original characters. +num_to_char = StringLookup( + vocabulary=char_to_num.get_vocabulary(), mask_token=None, invert=True +) + +""" +### Resizing images without distortion + +Instead of square images, many OCR models work with rectangular images. This will become +clearer in a moment when we will visualize a few samples from the dataset. While +aspect-unaware resizing square images does not introduce a significant amount of +distortion this is not the case for rectangular images. But resizing images to a uniform +size is a requirement for mini-batching. So we need to perform our resizing such that +the following criteria are met: + +* Aspect ratio is preserved. +* Content of the images is not affected. +""" + + +def distortion_free_resize(image, img_size): + w, h = img_size + image = tf.image.resize(image, size=(h, w), preserve_aspect_ratio=True) + + # Check tha amount of padding needed to be done. 
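+    # The resize above fits the image inside (h, w) while preserving its aspect
+    # ratio, so the leftover space on each axis is filled with padding below.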
+ pad_height = h - ops.shape(image)[0] + pad_width = w - ops.shape(image)[1] + + # Only necessary if you want to do same amount of padding on both sides. + if pad_height % 2 != 0: + height = pad_height // 2 + pad_height_top = height + 1 + pad_height_bottom = height + else: + pad_height_top = pad_height_bottom = pad_height // 2 + + if pad_width % 2 != 0: + width = pad_width // 2 + pad_width_left = width + 1 + pad_width_right = width + else: + pad_width_left = pad_width_right = pad_width // 2 + + image = tf.pad( + image, + paddings=[ + [pad_height_top, pad_height_bottom], + [pad_width_left, pad_width_right], + [0, 0], + ], + ) + + image = ops.transpose(image, (1, 0, 2)) + image = tf.image.flip_left_right(image) + return image + + +""" +If we just go with the plain resizing then the images would look like so: + +![](https://i.imgur.com/eqq3s4N.png) + +Notice how this resizing would have introduced unnecessary stretching. +""" + +""" +### Putting the utilities together +""" + +batch_size = 64 +padding_token = 99 +image_width = 128 +image_height = 32 + + +def preprocess_image(image_path, img_size=(image_width, image_height)): + image = tf.io.read_file(image_path) + image = tf.image.decode_png(image, 1) + image = distortion_free_resize(image, img_size) + image = ops.cast(image, tf.float32) / 255.0 + return image + + +def vectorize_label(label): + label = char_to_num(tf.strings.unicode_split(label, input_encoding="UTF-8")) + length = ops.shape(label)[0] + pad_amount = max_len - length + label = tf.pad(label, paddings=[[0, pad_amount]], constant_values=padding_token) + return label + + +def process_images_labels(image_path, label): + image = preprocess_image(image_path) + label = vectorize_label(label) + return {"image": image, "label": label} + + +def prepare_dataset(image_paths, labels): + dataset = tf.data.Dataset.from_tensor_slices((image_paths, labels)).map( + process_images_labels, num_parallel_calls=AUTOTUNE + ) + return dataset.batch(batch_size).cache().prefetch(AUTOTUNE) + + +""" +## Prepare `tf.data.Dataset` objects +""" + +train_ds = prepare_dataset(train_img_paths, train_labels_cleaned) +validation_ds = prepare_dataset(validation_img_paths, validation_labels_cleaned) +test_ds = prepare_dataset(test_img_paths, test_labels_cleaned) + +""" +## Visualize a few samples +""" + +for data in train_ds.take(1): + images, labels = data["image"], data["label"] + + _, ax = plt.subplots(4, 4, figsize=(15, 8)) + + for i in range(16): + img = images[i] + img = tf.image.flip_left_right(img) + img = ops.transpose(img, (1, 0, 2)) + img = (img * 255.0).numpy().clip(0, 255).astype(np.uint8) + img = img[:, :, 0] + + # Gather indices where label!= padding_token. + label = labels[i] + indices = tf.gather(label, tf.where(tf.math.not_equal(label, padding_token))) + # Convert to string. + label = tf.strings.reduce_join(num_to_char(indices)) + label = label.numpy().decode("utf-8") + + ax[i // 4, i % 4].imshow(img, cmap="gray") + ax[i // 4, i % 4].set_title(label) + ax[i // 4, i % 4].axis("off") + + +plt.show() + +""" +You will notice that the content of original image is kept as faithful as possible and has +been padded accordingly. +""" + +""" +## Model + +Our model will use the CTC loss as an endpoint layer. For a detailed understanding of the +CTC loss, refer to [this post](https://distill.pub/2017/ctc/). 
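+
+For intuition: CTC sums the probability of every frame-by-frame path that
+"collapses" to the target string, where collapsing means merging consecutive
+repeated symbols and then dropping a special blank token. The toy function
+below is purely illustrative (the indices and the blank id are arbitrary, not
+the ones Keras uses internally) and shows just that collapsing rule:
+
+```python
+def collapse_ctc_path(frame_ids, blank_id):
+    # Merge consecutive repeats, then drop the blank token.
+    collapsed, previous = [], None
+    for idx in frame_ids:
+        if idx != previous and idx != blank_id:
+            collapsed.append(idx)
+        previous = idx
+    return collapsed
+
+
+# Frames "c c _ a a _ t" collapse to "c a t".
+print(collapse_ctc_path([3, 3, 0, 1, 1, 0, 20], blank_id=0))  # [3, 1, 20]
+```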
+""" + + +class CTCLayer(keras.layers.Layer): + def __init__(self, name=None): + super().__init__(name=name) + self.loss_fn = tf.keras.backend.ctc_batch_cost + + def call(self, y_true, y_pred): + batch_len = ops.cast(ops.shape(y_true)[0], dtype="int64") + input_length = ops.cast(ops.shape(y_pred)[1], dtype="int64") + label_length = ops.cast(ops.shape(y_true)[1], dtype="int64") + + input_length = input_length * ops.ones(shape=(batch_len, 1), dtype="int64") + label_length = label_length * ops.ones(shape=(batch_len, 1), dtype="int64") + loss = self.loss_fn(y_true, y_pred, input_length, label_length) + self.add_loss(loss) + + # At test time, just return the computed predictions. + return y_pred + + +def build_model(): + # Inputs to the model + input_img = keras.Input(shape=(image_width, image_height, 1), name="image") + labels = keras.layers.Input(name="label", shape=(None,)) + + # First conv block. + x = keras.layers.Conv2D( + 32, + (3, 3), + activation="relu", + kernel_initializer="he_normal", + padding="same", + name="Conv1", + )(input_img) + x = keras.layers.MaxPooling2D((2, 2), name="pool1")(x) + + # Second conv block. + x = keras.layers.Conv2D( + 64, + (3, 3), + activation="relu", + kernel_initializer="he_normal", + padding="same", + name="Conv2", + )(x) + x = keras.layers.MaxPooling2D((2, 2), name="pool2")(x) + + # We have used two max pool with pool size and strides 2. + # Hence, downsampled feature maps are 4x smaller. The number of + # filters in the last layer is 64. Reshape accordingly before + # passing the output to the RNN part of the model. + new_shape = ((image_width // 4), (image_height // 4) * 64) + x = keras.layers.Reshape(target_shape=new_shape, name="reshape")(x) + x = keras.layers.Dense(64, activation="relu", name="dense1")(x) + x = keras.layers.Dropout(0.2)(x) + + # RNNs. + x = keras.layers.Bidirectional( + keras.layers.LSTM(128, return_sequences=True, dropout=0.25) + )(x) + x = keras.layers.Bidirectional( + keras.layers.LSTM(64, return_sequences=True, dropout=0.25) + )(x) + + # +2 is to account for the two special tokens introduced by the CTC loss. + # The recommendation comes here: https://git.io/J0eXP. + x = keras.layers.Dense( + len(char_to_num.get_vocabulary()) + 2, activation="softmax", name="dense2" + )(x) + + # Add CTC layer for calculating CTC loss at each step. + output = CTCLayer(name="ctc_loss")(labels, x) + + # Define the model. + model = keras.models.Model( + inputs=[input_img, labels], outputs=output, name="handwriting_recognizer" + ) + # Optimizer. + opt = keras.optimizers.Adam() + # Compile the model and return. + model.compile(optimizer=opt) + return model + + +# Get the model. +model = build_model() +model.summary() + +""" +## Evaluation metric + +[Edit Distance](https://en.wikipedia.org/wiki/Edit_distance) +is the most widely used metric for evaluating OCR models. In this section, we will +implement it and use it as a callback to monitor our model. +""" + +""" +We first segregate the validation images and their labels for convenience. +""" +validation_images = [] +validation_labels = [] + +for batch in validation_ds: + validation_images.append(batch["image"]) + validation_labels.append(batch["label"]) + +""" +Now, we create a callback to monitor the edit distances. +""" + + +def calculate_edit_distance(labels, predictions): + # Get a single batch and convert its labels to sparse tensors. + saprse_labels = ops.cast(tf.sparse.from_dense(labels), dtype=tf.int64) + + # Make predictions and convert them to sparse tensors. 
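+    # Every sample is decoded over the full time axis, so the sequence length
+    # handed to the decoder is simply the number of prediction frames.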
+ input_len = np.ones(predictions.shape[0]) * predictions.shape[1] + predictions_decoded = keras.ops.nn.ctc_decode( + predictions, sequence_lengths=input_len + )[0][0][:, :max_len] + sparse_predictions = ops.cast( + tf.sparse.from_dense(predictions_decoded), dtype=tf.int64 + ) + + # Compute individual edit distances and average them out. + edit_distances = tf.edit_distance( + sparse_predictions, saprse_labels, normalize=False + ) + return tf.reduce_mean(edit_distances) + + +class EditDistanceCallback(keras.callbacks.Callback): + def __init__(self, pred_model): + super().__init__() + self.prediction_model = pred_model + + def on_epoch_end(self, epoch, logs=None): + edit_distances = [] + + for i in range(len(validation_images)): + labels = validation_labels[i] + predictions = self.prediction_model.predict(validation_images[i]) + edit_distances.append(calculate_edit_distance(labels, predictions).numpy()) + + print( + f"Mean edit distance for epoch {epoch + 1}: {np.mean(edit_distances):.4f}" + ) + + +""" +## Training + +Now we are ready to kick off model training. +""" + +epochs = 10 # To get good results this should be at least 50. + +model = build_model() +prediction_model = keras.models.Model( + model.get_layer(name="image").output, model.get_layer(name="dense2").output +) +edit_distance_callback = EditDistanceCallback(prediction_model) + +# Train the model. +history = model.fit( + train_ds, + validation_data=validation_ds, + epochs=epochs, + callbacks=[edit_distance_callback], +) + + +""" +## Inference +""" + + +# A utility function to decode the output of the network. +def decode_batch_predictions(pred): + input_len = np.ones(pred.shape[0]) * pred.shape[1] + # Use greedy search. For complex tasks, you can use beam search. + results = keras.ops.nn.ctc_decode(pred, sequence_lengths=input_len)[0][0][ + :, :max_len + ] + # Iterate over the results and get back the text. + output_text = [] + for res in results: + res = tf.gather(res, tf.where(tf.math.not_equal(res, -1))) + res = ( + tf.strings.reduce_join(num_to_char(res)) + .numpy() + .decode("utf-8") + .replace("[UNK]", "") + ) + output_text.append(res) + return output_text + + +# Let's check results on some test samples. +for batch in test_ds.take(1): + batch_images = batch["image"] + _, ax = plt.subplots(4, 4, figsize=(15, 8)) + + preds = prediction_model.predict(batch_images) + pred_texts = decode_batch_predictions(preds) + + for i in range(16): + img = batch_images[i] + img = tf.image.flip_left_right(img) + img = ops.transpose(img, (1, 0, 2)) + img = (img * 255.0).numpy().clip(0, 255).astype(np.uint8) + img = img[:, :, 0] + + title = f"Prediction: {pred_texts[i]}" + ax[i // 4, i % 4].imshow(img, cmap="gray") + ax[i // 4, i % 4].set_title(title) + ax[i // 4, i % 4].axis("off") + +plt.show() + +""" +To get better results the model should be trained for at least 50 epochs. +""" + +""" +## Final remarks + +* The `prediction_model` is fully compatible with TensorFlow Lite. If you are interested, +you can use it inside a mobile application. You may find +[this notebook](https://github.com/tulasiram58827/ocr_tflite/blob/main/colabs/captcha_ocr_tflite.ipynb) +to be useful in this regard. +* Not all the training examples are perfectly aligned as observed in this example. This +can hurt model performance for complex sequences. To this end, we can leverage +Spatial Transformer Networks ([Jaderberg et al.](https://arxiv.org/abs/1506.02025)) +that can help the model learn affine transformations that maximize its performance. 
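+
+As a rough sketch of the TensorFlow Lite point above (not executed here, and
+the output filename is only an example), the usual conversion path looks like
+this:
+
+```python
+converter = tf.lite.TFLiteConverter.from_keras_model(prediction_model)
+tflite_model = converter.convert()
+
+with open("handwriting_recognizer.tflite", "wb") as f:
+    f.write(tflite_model)
+```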
+""" diff --git a/knowledge_base/vision/image_captioning.py b/knowledge_base/vision/image_captioning.py new file mode 100644 index 0000000000000000000000000000000000000000..3b82821b23780ce579841bb2c5faa3ba600b0cc8 --- /dev/null +++ b/knowledge_base/vision/image_captioning.py @@ -0,0 +1,654 @@ +""" +Title: Image Captioning +Author: [A_K_Nain](https://twitter.com/A_K_Nain) +Date created: 2021/05/29 +Last modified: 2021/10/31 +Description: Implement an image captioning model using a CNN and a Transformer. +Accelerator: GPU +""" + +""" +## Setup +""" + +import os + +os.environ["KERAS_BACKEND"] = "tensorflow" + +import re +import numpy as np +import matplotlib.pyplot as plt + +import tensorflow as tf +import keras +from keras import layers +from keras.applications import efficientnet +from keras.layers import TextVectorization + +keras.utils.set_random_seed(111) + +""" +## Download the dataset + +We will be using the Flickr8K dataset for this tutorial. This dataset comprises over +8,000 images, that are each paired with five different captions. +""" + + +"""shell +wget -q https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_Dataset.zip +wget -q https://github.com/jbrownlee/Datasets/releases/download/Flickr8k/Flickr8k_text.zip +unzip -qq Flickr8k_Dataset.zip +unzip -qq Flickr8k_text.zip +rm Flickr8k_Dataset.zip Flickr8k_text.zip +""" + + +# Path to the images +IMAGES_PATH = "Flicker8k_Dataset" + +# Desired image dimensions +IMAGE_SIZE = (299, 299) + +# Vocabulary size +VOCAB_SIZE = 10000 + +# Fixed length allowed for any sequence +SEQ_LENGTH = 25 + +# Dimension for the image embeddings and token embeddings +EMBED_DIM = 512 + +# Per-layer units in the feed-forward network +FF_DIM = 512 + +# Other training parameters +BATCH_SIZE = 64 +EPOCHS = 30 +AUTOTUNE = tf.data.AUTOTUNE + +""" +## Preparing the dataset +""" + + +def load_captions_data(filename): + """Loads captions (text) data and maps them to corresponding images. + + Args: + filename: Path to the text file containing caption data. + + Returns: + caption_mapping: Dictionary mapping image names and the corresponding captions + text_data: List containing all the available captions + """ + + with open(filename) as caption_file: + caption_data = caption_file.readlines() + caption_mapping = {} + text_data = [] + images_to_skip = set() + + for line in caption_data: + line = line.rstrip("\n") + # Image name and captions are separated using a tab + img_name, caption = line.split("\t") + + # Each image is repeated five times for the five different captions. + # Each image name has a suffix `#(caption_number)` + img_name = img_name.split("#")[0] + img_name = os.path.join(IMAGES_PATH, img_name.strip()) + + # We will remove caption that are either too short to too long + tokens = caption.strip().split() + + if len(tokens) < 5 or len(tokens) > SEQ_LENGTH: + images_to_skip.add(img_name) + continue + + if img_name.endswith("jpg") and img_name not in images_to_skip: + # We will add a start and an end token to each caption + caption = " " + caption.strip() + " " + text_data.append(caption) + + if img_name in caption_mapping: + caption_mapping[img_name].append(caption) + else: + caption_mapping[img_name] = [caption] + + for img_name in images_to_skip: + if img_name in caption_mapping: + del caption_mapping[img_name] + + return caption_mapping, text_data + + +def train_val_split(caption_data, train_size=0.8, shuffle=True): + """Split the captioning dataset into train and validation sets. 
+ + Args: + caption_data (dict): Dictionary containing the mapped caption data + train_size (float): Fraction of all the full dataset to use as training data + shuffle (bool): Whether to shuffle the dataset before splitting + + Returns: + Traning and validation datasets as two separated dicts + """ + + # 1. Get the list of all image names + all_images = list(caption_data.keys()) + + # 2. Shuffle if necessary + if shuffle: + np.random.shuffle(all_images) + + # 3. Split into training and validation sets + train_size = int(len(caption_data) * train_size) + + training_data = { + img_name: caption_data[img_name] for img_name in all_images[:train_size] + } + validation_data = { + img_name: caption_data[img_name] for img_name in all_images[train_size:] + } + + # 4. Return the splits + return training_data, validation_data + + +# Load the dataset +captions_mapping, text_data = load_captions_data("Flickr8k.token.txt") + +# Split the dataset into training and validation sets +train_data, valid_data = train_val_split(captions_mapping) +print("Number of training samples: ", len(train_data)) +print("Number of validation samples: ", len(valid_data)) + +""" +## Vectorizing the text data + +We'll use the `TextVectorization` layer to vectorize the text data, +that is to say, to turn the +original strings into integer sequences where each integer represents the index of +a word in a vocabulary. We will use a custom string standardization scheme +(strip punctuation characters except `<` and `>`) and the default +splitting scheme (split on whitespace). +""" + + +def custom_standardization(input_string): + lowercase = tf.strings.lower(input_string) + return tf.strings.regex_replace(lowercase, "[%s]" % re.escape(strip_chars), "") + + +strip_chars = "!\"#$%&'()*+,-./:;<=>?@[\]^_`{|}~" +strip_chars = strip_chars.replace("<", "") +strip_chars = strip_chars.replace(">", "") + +vectorization = TextVectorization( + max_tokens=VOCAB_SIZE, + output_mode="int", + output_sequence_length=SEQ_LENGTH, + standardize=custom_standardization, +) +vectorization.adapt(text_data) + +# Data augmentation for image data +image_augmentation = keras.Sequential( + [ + layers.RandomFlip("horizontal"), + layers.RandomRotation(0.2), + layers.RandomContrast(0.3), + ] +) + + +""" +## Building a `tf.data.Dataset` pipeline for training + +We will generate pairs of images and corresponding captions using a `tf.data.Dataset` object. +The pipeline consists of two steps: + +1. Read the image from the disk +2. Tokenize all the five captions corresponding to the image +""" + + +def decode_and_resize(img_path): + img = tf.io.read_file(img_path) + img = tf.image.decode_jpeg(img, channels=3) + img = tf.image.resize(img, IMAGE_SIZE) + img = tf.image.convert_image_dtype(img, tf.float32) + return img + + +def process_input(img_path, captions): + return decode_and_resize(img_path), vectorization(captions) + + +def make_dataset(images, captions): + dataset = tf.data.Dataset.from_tensor_slices((images, captions)) + dataset = dataset.shuffle(BATCH_SIZE * 8) + dataset = dataset.map(process_input, num_parallel_calls=AUTOTUNE) + dataset = dataset.batch(BATCH_SIZE).prefetch(AUTOTUNE) + + return dataset + + +# Pass the list of images and the list of corresponding captions +train_dataset = make_dataset(list(train_data.keys()), list(train_data.values())) + +valid_dataset = make_dataset(list(valid_data.keys()), list(valid_data.values())) + + +""" +## Building the model + +Our image captioning architecture consists of three models: + +1. 
A CNN: used to extract the image features +2. A TransformerEncoder: The extracted image features are then passed to a Transformer + based encoder that generates a new representation of the inputs +3. A TransformerDecoder: This model takes the encoder output and the text data + (sequences) as inputs and tries to learn to generate the caption. +""" + + +def get_cnn_model(): + base_model = efficientnet.EfficientNetB0( + input_shape=(*IMAGE_SIZE, 3), + include_top=False, + weights="imagenet", + ) + # We freeze our feature extractor + base_model.trainable = False + base_model_out = base_model.output + base_model_out = layers.Reshape((-1, base_model_out.shape[-1]))(base_model_out) + cnn_model = keras.models.Model(base_model.input, base_model_out) + return cnn_model + + +class TransformerEncoderBlock(layers.Layer): + def __init__(self, embed_dim, dense_dim, num_heads, **kwargs): + super().__init__(**kwargs) + self.embed_dim = embed_dim + self.dense_dim = dense_dim + self.num_heads = num_heads + self.attention_1 = layers.MultiHeadAttention( + num_heads=num_heads, key_dim=embed_dim, dropout=0.0 + ) + self.layernorm_1 = layers.LayerNormalization() + self.layernorm_2 = layers.LayerNormalization() + self.dense_1 = layers.Dense(embed_dim, activation="relu") + + def call(self, inputs, training, mask=None): + inputs = self.layernorm_1(inputs) + inputs = self.dense_1(inputs) + + attention_output_1 = self.attention_1( + query=inputs, + value=inputs, + key=inputs, + attention_mask=None, + training=training, + ) + out_1 = self.layernorm_2(inputs + attention_output_1) + return out_1 + + +class PositionalEmbedding(layers.Layer): + def __init__(self, sequence_length, vocab_size, embed_dim, **kwargs): + super().__init__(**kwargs) + self.token_embeddings = layers.Embedding( + input_dim=vocab_size, output_dim=embed_dim + ) + self.position_embeddings = layers.Embedding( + input_dim=sequence_length, output_dim=embed_dim + ) + self.sequence_length = sequence_length + self.vocab_size = vocab_size + self.embed_dim = embed_dim + self.embed_scale = tf.math.sqrt(tf.cast(embed_dim, tf.float32)) + + def call(self, inputs): + length = tf.shape(inputs)[-1] + positions = tf.range(start=0, limit=length, delta=1) + embedded_tokens = self.token_embeddings(inputs) + embedded_tokens = embedded_tokens * self.embed_scale + embedded_positions = self.position_embeddings(positions) + return embedded_tokens + embedded_positions + + def compute_mask(self, inputs, mask=None): + return tf.math.not_equal(inputs, 0) + + +class TransformerDecoderBlock(layers.Layer): + def __init__(self, embed_dim, ff_dim, num_heads, **kwargs): + super().__init__(**kwargs) + self.embed_dim = embed_dim + self.ff_dim = ff_dim + self.num_heads = num_heads + self.attention_1 = layers.MultiHeadAttention( + num_heads=num_heads, key_dim=embed_dim, dropout=0.1 + ) + self.attention_2 = layers.MultiHeadAttention( + num_heads=num_heads, key_dim=embed_dim, dropout=0.1 + ) + self.ffn_layer_1 = layers.Dense(ff_dim, activation="relu") + self.ffn_layer_2 = layers.Dense(embed_dim) + + self.layernorm_1 = layers.LayerNormalization() + self.layernorm_2 = layers.LayerNormalization() + self.layernorm_3 = layers.LayerNormalization() + + self.embedding = PositionalEmbedding( + embed_dim=EMBED_DIM, + sequence_length=SEQ_LENGTH, + vocab_size=VOCAB_SIZE, + ) + self.out = layers.Dense(VOCAB_SIZE, activation="softmax") + + self.dropout_1 = layers.Dropout(0.3) + self.dropout_2 = layers.Dropout(0.5) + self.supports_masking = True + + def call(self, inputs, encoder_outputs, training, 
mask=None): + inputs = self.embedding(inputs) + causal_mask = self.get_causal_attention_mask(inputs) + + if mask is not None: + padding_mask = tf.cast(mask[:, :, tf.newaxis], dtype=tf.int32) + combined_mask = tf.cast(mask[:, tf.newaxis, :], dtype=tf.int32) + combined_mask = tf.minimum(combined_mask, causal_mask) + + attention_output_1 = self.attention_1( + query=inputs, + value=inputs, + key=inputs, + attention_mask=combined_mask, + training=training, + ) + out_1 = self.layernorm_1(inputs + attention_output_1) + + attention_output_2 = self.attention_2( + query=out_1, + value=encoder_outputs, + key=encoder_outputs, + attention_mask=padding_mask, + training=training, + ) + out_2 = self.layernorm_2(out_1 + attention_output_2) + + ffn_out = self.ffn_layer_1(out_2) + ffn_out = self.dropout_1(ffn_out, training=training) + ffn_out = self.ffn_layer_2(ffn_out) + + ffn_out = self.layernorm_3(ffn_out + out_2, training=training) + ffn_out = self.dropout_2(ffn_out, training=training) + preds = self.out(ffn_out) + return preds + + def get_causal_attention_mask(self, inputs): + input_shape = tf.shape(inputs) + batch_size, sequence_length = input_shape[0], input_shape[1] + i = tf.range(sequence_length)[:, tf.newaxis] + j = tf.range(sequence_length) + mask = tf.cast(i >= j, dtype="int32") + mask = tf.reshape(mask, (1, input_shape[1], input_shape[1])) + mult = tf.concat( + [ + tf.expand_dims(batch_size, -1), + tf.constant([1, 1], dtype=tf.int32), + ], + axis=0, + ) + return tf.tile(mask, mult) + + +class ImageCaptioningModel(keras.Model): + def __init__( + self, + cnn_model, + encoder, + decoder, + num_captions_per_image=5, + image_aug=None, + ): + super().__init__() + self.cnn_model = cnn_model + self.encoder = encoder + self.decoder = decoder + self.loss_tracker = keras.metrics.Mean(name="loss") + self.acc_tracker = keras.metrics.Mean(name="accuracy") + self.num_captions_per_image = num_captions_per_image + self.image_aug = image_aug + + def calculate_loss(self, y_true, y_pred, mask): + loss = self.loss(y_true, y_pred) + mask = tf.cast(mask, dtype=loss.dtype) + loss *= mask + return tf.reduce_sum(loss) / tf.reduce_sum(mask) + + def calculate_accuracy(self, y_true, y_pred, mask): + accuracy = tf.equal(y_true, tf.argmax(y_pred, axis=2)) + accuracy = tf.math.logical_and(mask, accuracy) + accuracy = tf.cast(accuracy, dtype=tf.float32) + mask = tf.cast(mask, dtype=tf.float32) + return tf.reduce_sum(accuracy) / tf.reduce_sum(mask) + + def _compute_caption_loss_and_acc(self, img_embed, batch_seq, training=True): + encoder_out = self.encoder(img_embed, training=training) + batch_seq_inp = batch_seq[:, :-1] + batch_seq_true = batch_seq[:, 1:] + mask = tf.math.not_equal(batch_seq_true, 0) + batch_seq_pred = self.decoder( + batch_seq_inp, encoder_out, training=training, mask=mask + ) + loss = self.calculate_loss(batch_seq_true, batch_seq_pred, mask) + acc = self.calculate_accuracy(batch_seq_true, batch_seq_pred, mask) + return loss, acc + + def train_step(self, batch_data): + batch_img, batch_seq = batch_data + batch_loss = 0 + batch_acc = 0 + + if self.image_aug: + batch_img = self.image_aug(batch_img) + + # 1. Get image embeddings + img_embed = self.cnn_model(batch_img) + + # 2. Pass each of the five captions one by one to the decoder + # along with the encoder outputs and compute the loss as well as accuracy + # for each caption. 
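+        # Note that the tape and the weight update both live inside the loop,
+        # so the encoder/decoder weights are updated once per caption.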
+ for i in range(self.num_captions_per_image): + with tf.GradientTape() as tape: + loss, acc = self._compute_caption_loss_and_acc( + img_embed, batch_seq[:, i, :], training=True + ) + + # 3. Update loss and accuracy + batch_loss += loss + batch_acc += acc + + # 4. Get the list of all the trainable weights + train_vars = ( + self.encoder.trainable_variables + self.decoder.trainable_variables + ) + + # 5. Get the gradients + grads = tape.gradient(loss, train_vars) + + # 6. Update the trainable weights + self.optimizer.apply_gradients(zip(grads, train_vars)) + + # 7. Update the trackers + batch_acc /= float(self.num_captions_per_image) + self.loss_tracker.update_state(batch_loss) + self.acc_tracker.update_state(batch_acc) + + # 8. Return the loss and accuracy values + return { + "loss": self.loss_tracker.result(), + "acc": self.acc_tracker.result(), + } + + def test_step(self, batch_data): + batch_img, batch_seq = batch_data + batch_loss = 0 + batch_acc = 0 + + # 1. Get image embeddings + img_embed = self.cnn_model(batch_img) + + # 2. Pass each of the five captions one by one to the decoder + # along with the encoder outputs and compute the loss as well as accuracy + # for each caption. + for i in range(self.num_captions_per_image): + loss, acc = self._compute_caption_loss_and_acc( + img_embed, batch_seq[:, i, :], training=False + ) + + # 3. Update batch loss and batch accuracy + batch_loss += loss + batch_acc += acc + + batch_acc /= float(self.num_captions_per_image) + + # 4. Update the trackers + self.loss_tracker.update_state(batch_loss) + self.acc_tracker.update_state(batch_acc) + + # 5. Return the loss and accuracy values + return { + "loss": self.loss_tracker.result(), + "acc": self.acc_tracker.result(), + } + + @property + def metrics(self): + # We need to list our metrics here so the `reset_states()` can be + # called automatically. 
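+        # Keras resets every metric listed here at the start of each epoch and
+        # of each evaluation run.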
+ return [self.loss_tracker, self.acc_tracker] + + +cnn_model = get_cnn_model() +encoder = TransformerEncoderBlock(embed_dim=EMBED_DIM, dense_dim=FF_DIM, num_heads=1) +decoder = TransformerDecoderBlock(embed_dim=EMBED_DIM, ff_dim=FF_DIM, num_heads=2) +caption_model = ImageCaptioningModel( + cnn_model=cnn_model, + encoder=encoder, + decoder=decoder, + image_aug=image_augmentation, +) + +""" +## Model training +""" + + +# Define the loss function +cross_entropy = keras.losses.SparseCategoricalCrossentropy( + from_logits=False, + reduction=None, +) + +# EarlyStopping criteria +early_stopping = keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True) + + +# Learning Rate Scheduler for the optimizer +class LRSchedule(keras.optimizers.schedules.LearningRateSchedule): + def __init__(self, post_warmup_learning_rate, warmup_steps): + super().__init__() + self.post_warmup_learning_rate = post_warmup_learning_rate + self.warmup_steps = warmup_steps + + def __call__(self, step): + global_step = tf.cast(step, tf.float32) + warmup_steps = tf.cast(self.warmup_steps, tf.float32) + warmup_progress = global_step / warmup_steps + warmup_learning_rate = self.post_warmup_learning_rate * warmup_progress + return tf.cond( + global_step < warmup_steps, + lambda: warmup_learning_rate, + lambda: self.post_warmup_learning_rate, + ) + + +# Create a learning rate schedule +num_train_steps = len(train_dataset) * EPOCHS +num_warmup_steps = num_train_steps // 15 +lr_schedule = LRSchedule(post_warmup_learning_rate=1e-4, warmup_steps=num_warmup_steps) + +# Compile the model +caption_model.compile(optimizer=keras.optimizers.Adam(lr_schedule), loss=cross_entropy) + +# Fit the model +caption_model.fit( + train_dataset, + epochs=EPOCHS, + validation_data=valid_dataset, + callbacks=[early_stopping], +) + +""" +## Check sample predictions +""" + +vocab = vectorization.get_vocabulary() +index_lookup = dict(zip(range(len(vocab)), vocab)) +max_decoded_sentence_length = SEQ_LENGTH - 1 +valid_images = list(valid_data.keys()) + + +def generate_caption(): + # Select a random image from the validation dataset + sample_img = np.random.choice(valid_images) + + # Read the image from the disk + sample_img = decode_and_resize(sample_img) + img = sample_img.numpy().clip(0, 255).astype(np.uint8) + plt.imshow(img) + plt.show() + + # Pass the image to the CNN + img = tf.expand_dims(sample_img, 0) + img = caption_model.cnn_model(img) + + # Pass the image features to the Transformer encoder + encoded_img = caption_model.encoder(img, training=False) + + # Generate the caption using the Transformer decoder + decoded_caption = " " + for i in range(max_decoded_sentence_length): + tokenized_caption = vectorization([decoded_caption])[:, :-1] + mask = tf.math.not_equal(tokenized_caption, 0) + predictions = caption_model.decoder( + tokenized_caption, encoded_img, training=False, mask=mask + ) + sampled_token_index = np.argmax(predictions[0, i, :]) + sampled_token = index_lookup[sampled_token_index] + if sampled_token == "": + break + decoded_caption += " " + sampled_token + + decoded_caption = decoded_caption.replace(" ", "") + decoded_caption = decoded_caption.replace(" ", "").strip() + print("Predicted Caption: ", decoded_caption) + + +# Check predictions for a few samples +generate_caption() +generate_caption() +generate_caption() + +""" +## End Notes + +We saw that the model starts to generate reasonable captions after a few epochs. 
To keep +this example easily runnable, we have trained it with a few constraints, like a minimal +number of attention heads. To improve the predictions, you can try changing these training +settings and find a good model for your use case. +""" diff --git a/knowledge_base/vision/image_classification_efficientnet_fine_tuning.py b/knowledge_base/vision/image_classification_efficientnet_fine_tuning.py new file mode 100644 index 0000000000000000000000000000000000000000..2073af2210a62ce82988cb305fd9a62d6473eeaf --- /dev/null +++ b/knowledge_base/vision/image_classification_efficientnet_fine_tuning.py @@ -0,0 +1,442 @@ +""" +Title: Image classification via fine-tuning with EfficientNet +Author: [Yixing Fu](https://github.com/yixingfu) +Date created: 2020/06/30 +Last modified: 2023/07/10 +Description: Use EfficientNet with weights pre-trained on imagenet for Stanford Dogs classification. +Accelerator: GPU +""" + +""" + +## Introduction: what is EfficientNet + +EfficientNet, first introduced in [Tan and Le, 2019](https://arxiv.org/abs/1905.11946) +is among the most efficient models (i.e. requiring least FLOPS for inference) +that reaches State-of-the-Art accuracy on both +imagenet and common image classification transfer learning tasks. + +The smallest base model is similar to [MnasNet](https://arxiv.org/abs/1807.11626), which +reached near-SOTA with a significantly smaller model. By introducing a heuristic way to +scale the model, EfficientNet provides a family of models (B0 to B7) that represents a +good combination of efficiency and accuracy on a variety of scales. Such a scaling +heuristics (compound-scaling, details see +[Tan and Le, 2019](https://arxiv.org/abs/1905.11946)) allows the +efficiency-oriented base model (B0) to surpass models at every scale, while avoiding +extensive grid-search of hyperparameters. + +A summary of the latest updates on the model is available at +[here](https://github.com/tensorflow/tpu/tree/master/models/official/efficientnet), where various +augmentation schemes and semi-supervised learning approaches are applied to further +improve the imagenet performance of the models. These extensions of the model can be used +by updating weights without changing model architecture. + +## B0 to B7 variants of EfficientNet + +*(This section provides some details on "compound scaling", and can be skipped +if you're only interested in using the models)* + +Based on the [original paper](https://arxiv.org/abs/1905.11946) people may have the +impression that EfficientNet is a continuous family of models created by arbitrarily +choosing scaling factor in as Eq.(3) of the paper. However, choice of resolution, +depth and width are also restricted by many factors: + +- Resolution: Resolutions not divisible by 8, 16, etc. cause zero-padding near boundaries +of some layers which wastes computational resources. This especially applies to smaller +variants of the model, hence the input resolution for B0 and B1 are chosen as 224 and +240. + +- Depth and width: The building blocks of EfficientNet demands channel size to be +multiples of 8. + +- Resource limit: Memory limitation may bottleneck resolution when depth +and width can still increase. In such a situation, increasing depth and/or +width but keep resolution can still improve performance. + +As a result, the depth, width and resolution of each variant of the EfficientNet models +are hand-picked and proven to produce good results, though they may be significantly +off from the compound scaling formula. 
+Therefore, the keras implementation (detailed below) only provide these 8 models, B0 to B7, +instead of allowing arbitray choice of width / depth / resolution parameters. + +## Keras implementation of EfficientNet + +An implementation of EfficientNet B0 to B7 has been shipped with Keras since v2.3. To +use EfficientNetB0 for classifying 1000 classes of images from ImageNet, run: + +```python +from tensorflow.keras.applications import EfficientNetB0 +model = EfficientNetB0(weights='imagenet') +``` + +This model takes input images of shape `(224, 224, 3)`, and the input data should be in the +range `[0, 255]`. Normalization is included as part of the model. + +Because training EfficientNet on ImageNet takes a tremendous amount of resources and +several techniques that are not a part of the model architecture itself. Hence the Keras +implementation by default loads pre-trained weights obtained via training with +[AutoAugment](https://arxiv.org/abs/1805.09501). + +For B0 to B7 base models, the input shapes are different. Here is a list of input shape +expected for each model: + +| Base model | resolution| +|----------------|-----| +| EfficientNetB0 | 224 | +| EfficientNetB1 | 240 | +| EfficientNetB2 | 260 | +| EfficientNetB3 | 300 | +| EfficientNetB4 | 380 | +| EfficientNetB5 | 456 | +| EfficientNetB6 | 528 | +| EfficientNetB7 | 600 | + +When the model is intended for transfer learning, the Keras implementation +provides a option to remove the top layers: +``` +model = EfficientNetB0(include_top=False, weights='imagenet') +``` +This option excludes the final `Dense` layer that turns 1280 features on the penultimate +layer into prediction of the 1000 ImageNet classes. Replacing the top layer with custom +layers allows using EfficientNet as a feature extractor in a transfer learning workflow. + +Another argument in the model constructor worth noticing is `drop_connect_rate` which controls +the dropout rate responsible for [stochastic depth](https://arxiv.org/abs/1603.09382). +This parameter serves as a toggle for extra regularization in finetuning, but does not +affect loaded weights. For example, when stronger regularization is desired, try: + +```python +model = EfficientNetB0(weights='imagenet', drop_connect_rate=0.4) +``` +The default value is 0.2. + +## Example: EfficientNetB0 for Stanford Dogs. + +EfficientNet is capable of a wide range of image classification tasks. +This makes it a good model for transfer learning. +As an end-to-end example, we will show using pre-trained EfficientNetB0 on +[Stanford Dogs](http://vision.stanford.edu/aditya86/ImageNetDogs/main.html) dataset. + +""" + +""" +## Setup and data loading +""" + +import numpy as np +import tensorflow_datasets as tfds +import tensorflow as tf # For tf.data +import matplotlib.pyplot as plt +import keras +from keras import layers +from keras.applications import EfficientNetB0 + +# IMG_SIZE is determined by EfficientNet model choice +IMG_SIZE = 224 +BATCH_SIZE = 64 + + +""" +### Loading data + +Here we load data from [tensorflow_datasets](https://www.tensorflow.org/datasets) +(hereafter TFDS). +Stanford Dogs dataset is provided in +TFDS as [stanford_dogs](https://www.tensorflow.org/datasets/catalog/stanford_dogs). +It features 20,580 images that belong to 120 classes of dog breeds +(12,000 for training and 8,580 for testing). 
+ +By simply changing `dataset_name` below, you may also try this notebook for +other datasets in TFDS such as +[cifar10](https://www.tensorflow.org/datasets/catalog/cifar10), +[cifar100](https://www.tensorflow.org/datasets/catalog/cifar100), +[food101](https://www.tensorflow.org/datasets/catalog/food101), +etc. When the images are much smaller than the size of EfficientNet input, +we can simply upsample the input images. It has been shown in +[Tan and Le, 2019](https://arxiv.org/abs/1905.11946) that transfer learning +result is better for increased resolution even if input images remain small. +""" + +dataset_name = "stanford_dogs" +(ds_train, ds_test), ds_info = tfds.load( + dataset_name, split=["train", "test"], with_info=True, as_supervised=True +) +NUM_CLASSES = ds_info.features["label"].num_classes + + +""" +When the dataset include images with various size, we need to resize them into a +shared size. The Stanford Dogs dataset includes only images at least 200x200 +pixels in size. Here we resize the images to the input size needed for EfficientNet. +""" + +size = (IMG_SIZE, IMG_SIZE) +ds_train = ds_train.map(lambda image, label: (tf.image.resize(image, size), label)) +ds_test = ds_test.map(lambda image, label: (tf.image.resize(image, size), label)) + +""" +### Visualizing the data + +The following code shows the first 9 images with their labels. +""" + + +def format_label(label): + string_label = label_info.int2str(label) + return string_label.split("-")[1] + + +label_info = ds_info.features["label"] +for i, (image, label) in enumerate(ds_train.take(9)): + ax = plt.subplot(3, 3, i + 1) + plt.imshow(image.numpy().astype("uint8")) + plt.title("{}".format(format_label(label))) + plt.axis("off") + + +""" +### Data augmentation + +We can use the preprocessing layers APIs for image augmentation. +""" + +img_augmentation_layers = [ + layers.RandomRotation(factor=0.15), + layers.RandomTranslation(height_factor=0.1, width_factor=0.1), + layers.RandomFlip(), + layers.RandomContrast(factor=0.1), +] + + +def img_augmentation(images): + for layer in img_augmentation_layers: + images = layer(images) + return images + + +""" +This `Sequential` model object can be used both as a part of +the model we later build, and as a function to preprocess +data before feeding into the model. Using them as function makes +it easy to visualize the augmented images. Here we plot 9 examples +of augmentation result of a given figure. +""" + +for image, label in ds_train.take(1): + for i in range(9): + ax = plt.subplot(3, 3, i + 1) + aug_img = img_augmentation(np.expand_dims(image.numpy(), axis=0)) + aug_img = np.array(aug_img) + plt.imshow(aug_img[0].astype("uint8")) + plt.title("{}".format(format_label(label))) + plt.axis("off") + + +""" +### Prepare inputs + +Once we verify the input data and augmentation are working correctly, +we prepare dataset for training. The input data are resized to uniform +`IMG_SIZE`. The labels are put into one-hot +(a.k.a. categorical) encoding. The dataset is batched. + +Note: `prefetch` and `AUTOTUNE` may in some situation improve +performance, but depends on environment and the specific dataset used. +See this [guide](https://www.tensorflow.org/guide/data_performance) +for more information on data pipeline performance. 
+""" + + +# One-hot / categorical encoding +def input_preprocess_train(image, label): + image = img_augmentation(image) + label = tf.one_hot(label, NUM_CLASSES) + return image, label + + +def input_preprocess_test(image, label): + label = tf.one_hot(label, NUM_CLASSES) + return image, label + + +ds_train = ds_train.map(input_preprocess_train, num_parallel_calls=tf.data.AUTOTUNE) +ds_train = ds_train.batch(batch_size=BATCH_SIZE, drop_remainder=True) +ds_train = ds_train.prefetch(tf.data.AUTOTUNE) + +ds_test = ds_test.map(input_preprocess_test, num_parallel_calls=tf.data.AUTOTUNE) +ds_test = ds_test.batch(batch_size=BATCH_SIZE, drop_remainder=True) + + +""" +## Training a model from scratch + +We build an EfficientNetB0 with 120 output classes, that is initialized from scratch: + +Note: the accuracy will increase very slowly and may overfit. +""" + +model = EfficientNetB0( + include_top=True, + weights=None, + classes=NUM_CLASSES, + input_shape=(IMG_SIZE, IMG_SIZE, 3), +) +model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"]) + +model.summary() + +epochs = 40 # @param {type: "slider", min:10, max:100} +hist = model.fit(ds_train, epochs=epochs, validation_data=ds_test) + + +""" +Training the model is relatively fast. This might make it sounds easy to simply train EfficientNet on any +dataset wanted from scratch. However, training EfficientNet on smaller datasets, +especially those with lower resolution like CIFAR-100, faces the significant challenge of +overfitting. + +Hence training from scratch requires very careful choice of hyperparameters and is +difficult to find suitable regularization. It would also be much more demanding in resources. +Plotting the training and validation accuracy +makes it clear that validation accuracy stagnates at a low value. +""" + +import matplotlib.pyplot as plt + + +def plot_hist(hist): + plt.plot(hist.history["accuracy"]) + plt.plot(hist.history["val_accuracy"]) + plt.title("model accuracy") + plt.ylabel("accuracy") + plt.xlabel("epoch") + plt.legend(["train", "validation"], loc="upper left") + plt.show() + + +plot_hist(hist) + +""" +## Transfer learning from pre-trained weights + +Here we initialize the model with pre-trained ImageNet weights, +and we fine-tune it on our own dataset. +""" + + +def build_model(num_classes): + inputs = layers.Input(shape=(IMG_SIZE, IMG_SIZE, 3)) + model = EfficientNetB0(include_top=False, input_tensor=inputs, weights="imagenet") + + # Freeze the pretrained weights + model.trainable = False + + # Rebuild top + x = layers.GlobalAveragePooling2D(name="avg_pool")(model.output) + x = layers.BatchNormalization()(x) + + top_dropout_rate = 0.2 + x = layers.Dropout(top_dropout_rate, name="top_dropout")(x) + outputs = layers.Dense(num_classes, activation="softmax", name="pred")(x) + + # Compile + model = keras.Model(inputs, outputs, name="EfficientNet") + optimizer = keras.optimizers.Adam(learning_rate=1e-2) + model.compile( + optimizer=optimizer, loss="categorical_crossentropy", metrics=["accuracy"] + ) + return model + + +""" +The first step to transfer learning is to freeze all layers and train only the top +layers. For this step, a relatively large learning rate (1e-2) can be used. +Note that validation accuracy and loss will usually be better than training +accuracy and loss. This is because the regularization is strong, which only +suppresses training-time metrics. + +Note that the convergence may take up to 50 epochs depending on choice of learning rate. 
+If image augmentation layers were not +applied, the validation accuracy may only reach ~60%. +""" + +model = build_model(num_classes=NUM_CLASSES) + +epochs = 25 # @param {type: "slider", min:8, max:80} +hist = model.fit(ds_train, epochs=epochs, validation_data=ds_test) +plot_hist(hist) + +""" +The second step is to unfreeze a number of layers and fit the model using smaller +learning rate. In this example we show unfreezing all layers, but depending on +specific dataset it may be desireble to only unfreeze a fraction of all layers. + +When the feature extraction with +pretrained model works good enough, this step would give a very limited gain on +validation accuracy. In our case we only see a small improvement, +as ImageNet pretraining already exposed the model to a good amount of dogs. + +On the other hand, when we use pretrained weights on a dataset that is more different +from ImageNet, this fine-tuning step can be crucial as the feature extractor also +needs to be adjusted by a considerable amount. Such a situation can be demonstrated +if choosing CIFAR-100 dataset instead, where fine-tuning boosts validation accuracy +by about 10% to pass 80% on `EfficientNetB0`. + +A side note on freezing/unfreezing models: setting `trainable` of a `Model` will +simultaneously set all layers belonging to the `Model` to the same `trainable` +attribute. Each layer is trainable only if both the layer itself and the model +containing it are trainable. Hence when we need to partially freeze/unfreeze +a model, we need to make sure the `trainable` attribute of the model is set +to `True`. +""" + + +def unfreeze_model(model): + # We unfreeze the top 20 layers while leaving BatchNorm layers frozen + for layer in model.layers[-20:]: + if not isinstance(layer, layers.BatchNormalization): + layer.trainable = True + + optimizer = keras.optimizers.Adam(learning_rate=1e-5) + model.compile( + optimizer=optimizer, loss="categorical_crossentropy", metrics=["accuracy"] + ) + + +unfreeze_model(model) + +epochs = 4 # @param {type: "slider", min:4, max:10} +hist = model.fit(ds_train, epochs=epochs, validation_data=ds_test) +plot_hist(hist) + +""" +### Tips for fine tuning EfficientNet + +On unfreezing layers: + +- The `BatchNormalization` layers need to be kept frozen +([more details](https://keras.io/guides/transfer_learning/)). +If they are also turned to trainable, the +first epoch after unfreezing will significantly reduce accuracy. +- In some cases it may be beneficial to open up only a portion of layers instead of +unfreezing all. This will make fine tuning much faster when going to larger models like +B7. +- Each block needs to be all turned on or off. This is because the architecture includes +a shortcut from the first layer to the last layer for each block. Not respecting blocks +also significantly harms the final performance. + +Some other tips for utilizing EfficientNet: + +- Larger variants of EfficientNet do not guarantee improved performance, especially for +tasks with less data or fewer classes. In such a case, the larger variant of EfficientNet +chosen, the harder it is to tune hyperparameters. +- EMA (Exponential Moving Average) is very helpful in training EfficientNet from scratch, +but not so much for transfer learning. +- Do not use the RMSprop setup as in the original paper for transfer learning. The +momentum and learning rate are too high for transfer learning. It will easily corrupt the +pretrained weight and blow up the loss. 
A quick check is to see if loss (as categorical +cross entropy) is getting significantly larger than log(NUM_CLASSES) after the same +epoch. If so, the initial learning rate/momentum is too high. +- Smaller batch size benefit validation accuracy, possibly due to effectively providing +regularization. +""" diff --git a/knowledge_base/vision/image_classification_from_scratch.py b/knowledge_base/vision/image_classification_from_scratch.py new file mode 100644 index 0000000000000000000000000000000000000000..12266b54596ff5a703b0f21099ee03bf69d99a73 --- /dev/null +++ b/knowledge_base/vision/image_classification_from_scratch.py @@ -0,0 +1,325 @@ +""" +Title: Image classification from scratch +Author: [fchollet](https://twitter.com/fchollet) +Date created: 2020/04/27 +Last modified: 2023/11/09 +Description: Training an image classifier from scratch on the Kaggle Cats vs Dogs dataset. +Accelerator: GPU +""" + +""" +## Introduction + +This example shows how to do image classification from scratch, starting from JPEG +image files on disk, without leveraging pre-trained weights or a pre-made Keras +Application model. We demonstrate the workflow on the Kaggle Cats vs Dogs binary +classification dataset. + +We use the `image_dataset_from_directory` utility to generate the datasets, and +we use Keras image preprocessing layers for image standardization and data augmentation. +""" + +""" +## Setup +""" + +import os +import numpy as np +import keras +from keras import layers +from tensorflow import data as tf_data +import matplotlib.pyplot as plt + +""" +## Load the data: the Cats vs Dogs dataset + +### Raw data download + +First, let's download the 786M ZIP archive of the raw data: +""" + +"""shell +curl -O https://download.microsoft.com/download/3/E/1/3E1C3F21-ECDB-4869-8368-6DEBA77B919F/kagglecatsanddogs_5340.zip +""" + +"""shell +unzip -q kagglecatsanddogs_5340.zip +ls +""" + +""" +Now we have a `PetImages` folder which contain two subfolders, `Cat` and `Dog`. Each +subfolder contains image files for each category. +""" + +"""shell +ls PetImages +""" + +""" +### Filter out corrupted images + +When working with lots of real-world image data, corrupted images are a common +occurence. Let's filter out badly-encoded images that do not feature the string "JFIF" +in their header. +""" + +num_skipped = 0 +for folder_name in ("Cat", "Dog"): + folder_path = os.path.join("PetImages", folder_name) + for fname in os.listdir(folder_path): + fpath = os.path.join(folder_path, fname) + try: + fobj = open(fpath, "rb") + is_jfif = b"JFIF" in fobj.peek(10) + finally: + fobj.close() + + if not is_jfif: + num_skipped += 1 + # Delete corrupted image + os.remove(fpath) + +print(f"Deleted {num_skipped} images.") + +""" +## Generate a `Dataset` +""" + +image_size = (180, 180) +batch_size = 128 + +train_ds, val_ds = keras.utils.image_dataset_from_directory( + "PetImages", + validation_split=0.2, + subset="both", + seed=1337, + image_size=image_size, + batch_size=batch_size, +) + +""" +## Visualize the data + +Here are the first 9 images in the training dataset. 
+""" + + +plt.figure(figsize=(10, 10)) +for images, labels in train_ds.take(1): + for i in range(9): + ax = plt.subplot(3, 3, i + 1) + plt.imshow(np.array(images[i]).astype("uint8")) + plt.title(int(labels[i])) + plt.axis("off") + +""" +## Using image data augmentation + +When you don't have a large image dataset, it's a good practice to artificially +introduce sample diversity by applying random yet realistic transformations to the +training images, such as random horizontal flipping or small random rotations. This +helps expose the model to different aspects of the training data while slowing down +overfitting. +""" + +data_augmentation_layers = [ + layers.RandomFlip("horizontal"), + layers.RandomRotation(0.1), +] + + +def data_augmentation(images): + for layer in data_augmentation_layers: + images = layer(images) + return images + + +""" +Let's visualize what the augmented samples look like, by applying `data_augmentation` +repeatedly to the first few images in the dataset: +""" + +plt.figure(figsize=(10, 10)) +for images, _ in train_ds.take(1): + for i in range(9): + augmented_images = data_augmentation(images) + ax = plt.subplot(3, 3, i + 1) + plt.imshow(np.array(augmented_images[0]).astype("uint8")) + plt.axis("off") + + +""" +## Standardizing the data + +Our image are already in a standard size (180x180), as they are being yielded as +contiguous `float32` batches by our dataset. However, their RGB channel values are in +the `[0, 255]` range. This is not ideal for a neural network; +in general you should seek to make your input values small. Here, we will +standardize values to be in the `[0, 1]` by using a `Rescaling` layer at the start of +our model. +""" + +""" +## Two options to preprocess the data + +There are two ways you could be using the `data_augmentation` preprocessor: + +**Option 1: Make it part of the model**, like this: + +```python +inputs = keras.Input(shape=input_shape) +x = data_augmentation(inputs) +x = layers.Rescaling(1./255)(x) +... # Rest of the model +``` + +With this option, your data augmentation will happen *on device*, synchronously +with the rest of the model execution, meaning that it will benefit from GPU +acceleration. + +Note that data augmentation is inactive at test time, so the input samples will only be +augmented during `fit()`, not when calling `evaluate()` or `predict()`. + +If you're training on GPU, this may be a good option. + +**Option 2: apply it to the dataset**, so as to obtain a dataset that yields batches of +augmented images, like this: + +```python +augmented_train_ds = train_ds.map( + lambda x, y: (data_augmentation(x, training=True), y)) +``` + +With this option, your data augmentation will happen **on CPU**, asynchronously, and will +be buffered before going into the model. + +If you're training on CPU, this is the better option, since it makes data augmentation +asynchronous and non-blocking. + +In our case, we'll go with the second option. If you're not sure +which one to pick, this second option (asynchronous preprocessing) is always a solid choice. +""" + +""" +## Configure the dataset for performance + +Let's apply data augmentation to our training dataset, +and let's make sure to use buffered prefetching so we can yield data from disk without +having I/O becoming blocking: +""" + +# Apply `data_augmentation` to the training images. +train_ds = train_ds.map( + lambda img, label: (data_augmentation(img), label), + num_parallel_calls=tf_data.AUTOTUNE, +) +# Prefetching samples in GPU memory helps maximize GPU utilization. 
+train_ds = train_ds.prefetch(tf_data.AUTOTUNE) +val_ds = val_ds.prefetch(tf_data.AUTOTUNE) + +""" +## Build a model + +We'll build a small version of the Xception network. We haven't particularly tried to +optimize the architecture; if you want to do a systematic search for the best model +configuration, consider using +[KerasTuner](https://github.com/keras-team/keras-tuner). + +Note that: + +- We start the model with the `data_augmentation` preprocessor, followed by a + `Rescaling` layer. +- We include a `Dropout` layer before the final classification layer. +""" + + +def make_model(input_shape, num_classes): + inputs = keras.Input(shape=input_shape) + + # Entry block + x = layers.Rescaling(1.0 / 255)(inputs) + x = layers.Conv2D(128, 3, strides=2, padding="same")(x) + x = layers.BatchNormalization()(x) + x = layers.Activation("relu")(x) + + previous_block_activation = x # Set aside residual + + for size in [256, 512, 728]: + x = layers.Activation("relu")(x) + x = layers.SeparableConv2D(size, 3, padding="same")(x) + x = layers.BatchNormalization()(x) + + x = layers.Activation("relu")(x) + x = layers.SeparableConv2D(size, 3, padding="same")(x) + x = layers.BatchNormalization()(x) + + x = layers.MaxPooling2D(3, strides=2, padding="same")(x) + + # Project residual + residual = layers.Conv2D(size, 1, strides=2, padding="same")( + previous_block_activation + ) + x = layers.add([x, residual]) # Add back residual + previous_block_activation = x # Set aside next residual + + x = layers.SeparableConv2D(1024, 3, padding="same")(x) + x = layers.BatchNormalization()(x) + x = layers.Activation("relu")(x) + + x = layers.GlobalAveragePooling2D()(x) + if num_classes == 2: + units = 1 + else: + units = num_classes + + x = layers.Dropout(0.25)(x) + # We specify activation=None so as to return logits + outputs = layers.Dense(units, activation=None)(x) + return keras.Model(inputs, outputs) + + +model = make_model(input_shape=image_size + (3,), num_classes=2) +keras.utils.plot_model(model, show_shapes=True) + +""" +## Train the model +""" + +epochs = 25 + +callbacks = [ + keras.callbacks.ModelCheckpoint("save_at_{epoch}.keras"), +] +model.compile( + optimizer=keras.optimizers.Adam(3e-4), + loss=keras.losses.BinaryCrossentropy(from_logits=True), + metrics=[keras.metrics.BinaryAccuracy(name="acc")], +) +model.fit( + train_ds, + epochs=epochs, + callbacks=callbacks, + validation_data=val_ds, +) + +""" +We get to >90% validation accuracy after training for 25 epochs on the full dataset +(in practice, you can train for 50+ epochs before validation performance starts degrading). +""" + +""" +## Run inference on new data + +Note that data augmentation and dropout are inactive at inference time. 
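+
+Since the `ModelCheckpoint` callback above wrote one `.keras` file per epoch, you could
+also reload one of those checkpoints before predicting, rather than using the model left
+in memory. A minimal sketch (the checkpoint filename is illustrative; use one that was
+actually written during training):
+
+```python
+model = keras.models.load_model("save_at_25.keras")  # hypothetical checkpoint from epoch 25
+```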
+""" + +img = keras.utils.load_img("PetImages/Cat/6779.jpg", target_size=image_size) +plt.imshow(img) + +img_array = keras.utils.img_to_array(img) +img_array = keras.ops.expand_dims(img_array, 0) # Create batch axis + +predictions = model.predict(img_array) +score = float(keras.ops.sigmoid(predictions[0][0])) +print(f"This image is {100 * (1 - score):.2f}% cat and {100 * score:.2f}% dog.") diff --git a/knowledge_base/vision/image_classification_using_global_context_vision_transformer.py b/knowledge_base/vision/image_classification_using_global_context_vision_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..6d0a3b78af2968aa38df2c612cd76578ac8b093f --- /dev/null +++ b/knowledge_base/vision/image_classification_using_global_context_vision_transformer.py @@ -0,0 +1,1187 @@ +""" +Title: Image Classification using Global Context Vision Transformer +Author: Md Awsafur Rahman +Date created: 2023/10/30 +Last modified: 2023/10/30 +Description: Implementation and fine-tuning of Global Context Vision Transformer for image classification. +Accelerator: GPU +""" + +""" +# Setup +""" + +"""shell +pip install --upgrade keras_cv tensorflow +pip install --upgrade keras +""" + +import keras +from keras_cv.layers import DropPath +from keras import ops +from keras import layers + +import tensorflow as tf # only for dataloader +import tensorflow_datasets as tfds # for flower dataset + +from skimage.data import chelsea +import matplotlib.pyplot as plt +import numpy as np + +""" +## Introduction + +In this notebook, we will utilize multi-backend Keras 3.0 to implement the +[**GCViT: Global Context Vision Transformer**](https://arxiv.org/abs/2206.09959) paper, +presented at ICML 2023 by A Hatamizadeh et al. The, we will fine-tune the model on the +Flower dataset for image classification task, leveraging the official ImageNet pre-trained +weights. A highlight of this notebook is its compatibility with multiple backends: +TensorFlow, PyTorch, and JAX, showcasing the true potential of multi-backend Keras. +""" + +""" +## Motivation + +> **Note:** In this section we'll learn about the backstory of GCViT and try to +understand why it is proposed. + +* During recent years, **Transformers** have achieved dominance in **Natural Language +Processing (NLP)** tasks and with the **self-attention** mechanism which allows for +capturing both long and short-range information. +* Following this trend, **Vision Transformer (ViT)** proposed to utilize image patches as +tokens in a gigantic architecture similar to encoder of the original Transformer. +* Despite the historic dominance of **Convolutional Neural Network (CNN)** in computer +vision, **ViT-based** models have shown **SOTA or competitive performance** in various +computer vision tasks. + + +* However, **quadratic [`O(n^2)`] computational complexity** of self-attention and **lack +of multi-scale information** makes it difficult for **ViT** to be considered as +general-purpose architecture for Compute Vision tasks like **segmentation and object +detection** where it requires **dense prediction at the pixel level**. +* Swin Transformer has attempted to address the issues of **ViT** by proposing +**multi-resolution/hierarchical** architectures in which the self-attention is computed +in **local windows** and cross-window connections such as **window shifting** are used +for modeling the interactions across different regions. 
But the **limited receptive field
+of local windows** cannot capture long-range information, and cross-window-connection
+schemes such as **window-shifting only cover a small neighborhood** in the vicinity of
+each window. Also, it lacks the **inductive-bias** that encourages a certain translation
+invariance, which is still preferable for general-purpose visual modeling, particularly for the
+dense prediction tasks of object detection and semantic segmentation.
+
+
+
+
+* To address the above limitations, the **Global Context (GC) ViT** network is proposed.
+"""
+
+"""
+## Architecture
+
+Let's have a quick **overview** of our key components,
+1. `Stem/PatchEmbed:` A stem/patchify layer processes images at the network's beginning.
+For this network, it creates **patches/tokens** and converts them into **embeddings**.
+2. `Level:` It is the repetitive building block that extracts features using different
+blocks.
+3. `Global Token Gen./FeatureExtraction:` It generates **global tokens/patches** with
+**Depthwise-CNN**, **SqueezeAndExcitation (Squeeze-Excitation)**, **CNN** and
+**MaxPooling**. So basically
+it's a Feature Extractor.
+4. `Block:` It is the repetitive module that applies attention to the features and
+projects them to a certain dimension.
+    1. `Local-MSA:` Local Multi head Self Attention.
+    2. `Global-MSA:` Global Multi head Self Attention.
+    3. `MLP:` Linear layer that projects a vector to another dimension.
+5. `Downsample/ReduceSize:` It is very similar to the **Global Token Gen.** module, except it
+uses **CNN** instead of **MaxPooling** to downsample, with additional **Layer
+Normalization** modules.
+6. `Head:` It is the module responsible for the classification task.
+    1. `Pooling:` It converts `N x 2D` features to `N x 1D` features.
+    2. `Classifier:` It processes `N x 1D` features to make a decision about class.
+
+I've annotated the architecture figure to make it easier to digest.
+
+"""
+
+"""
+### Unit Blocks
+
+> **Note:** These blocks are used to build other modules throughout the example. Most of the
+blocks are either borrowed from other work or are modified versions of older work.
+
+1. `SqueezeAndExcitation`: The **Squeeze-Excitation (SE)** aka **Bottleneck** module acts as
+a kind of **channel
+attention**. It consists of **AvgPooling**, **Dense/FullyConnected (FC)/Linear**,
+**GELU** and **Sigmoid** modules.
+
+
+2. `Fused-MBConv:` This is similar to the one used in **EfficientNetV2**. It uses
+**Depthwise-Conv**, **GELU**, **SqueezeAndExcitation** and **Conv** to extract features, with
+a residual
+connection. Note that no new module is declared for this one; we simply apply the
+corresponding modules directly.
+
+
+3. `ReduceSize`: It is a **CNN** based **downsample** module which uses the above-mentioned
+`Fused-MBConv` module to extract features, a **strided Conv** to simultaneously reduce the
+spatial dimension and increase the channel-wise dimension of the features, and finally a
+**LayerNormalization** module to normalize the features. In the paper/figure this module is
+referred to as the **downsample** module. It is worth mentioning that **SwinTransformer**
+used a `PatchMerging` module instead of `ReduceSize` to reduce the spatial dimension and
+increase the channel-wise dimension, which uses a **fully-connected/dense/linear** module.
+According to the **GCViT** paper, one of the purposes of using `ReduceSize` is to add
+inductive bias through the **CNN** module.
+
+
+4. `MLP:` This is our very own **Multi Layer Perceptron** module. It is a
+feed-forward/fully-connected/linear module which simply projects the input to an arbitrary
+dimension.
+""" + + +class SqueezeAndExcitation(layers.Layer): + """Squeeze and excitation block. + + Args: + output_dim: output features dimension, if `None` use same dim as input. + expansion: expansion ratio. + """ + + def __init__(self, output_dim=None, expansion=0.25, **kwargs): + super().__init__(**kwargs) + self.expansion = expansion + self.output_dim = output_dim + + def build(self, input_shape): + inp = input_shape[-1] + self.output_dim = self.output_dim or inp + self.avg_pool = layers.GlobalAvgPool2D(keepdims=True, name="avg_pool") + self.fc = [ + layers.Dense(int(inp * self.expansion), use_bias=False, name="fc_0"), + layers.Activation("gelu", name="fc_1"), + layers.Dense(self.output_dim, use_bias=False, name="fc_2"), + layers.Activation("sigmoid", name="fc_3"), + ] + super().build(input_shape) + + def call(self, inputs, **kwargs): + x = self.avg_pool(inputs) + for layer in self.fc: + x = layer(x) + return x * inputs + + +class ReduceSize(layers.Layer): + """Down-sampling block. + + Args: + keepdims: if False spatial dim is reduced and channel dim is increased + """ + + def __init__(self, keepdims=False, **kwargs): + super().__init__(**kwargs) + self.keepdims = keepdims + + def build(self, input_shape): + embed_dim = input_shape[-1] + dim_out = embed_dim if self.keepdims else 2 * embed_dim + self.pad1 = layers.ZeroPadding2D(1, name="pad1") + self.pad2 = layers.ZeroPadding2D(1, name="pad2") + self.conv = [ + layers.DepthwiseConv2D( + kernel_size=3, strides=1, padding="valid", use_bias=False, name="conv_0" + ), + layers.Activation("gelu", name="conv_1"), + SqueezeAndExcitation(name="conv_2"), + layers.Conv2D( + embed_dim, + kernel_size=1, + strides=1, + padding="valid", + use_bias=False, + name="conv_3", + ), + ] + self.reduction = layers.Conv2D( + dim_out, + kernel_size=3, + strides=2, + padding="valid", + use_bias=False, + name="reduction", + ) + self.norm1 = layers.LayerNormalization( + -1, 1e-05, name="norm1" + ) # eps like PyTorch + self.norm2 = layers.LayerNormalization(-1, 1e-05, name="norm2") + + def call(self, inputs, **kwargs): + x = self.norm1(inputs) + xr = self.pad1(x) + for layer in self.conv: + xr = layer(xr) + x = x + xr + x = self.pad2(x) + x = self.reduction(x) + x = self.norm2(x) + return x + + +class MLP(layers.Layer): + """Multi-Layer Perceptron (MLP) block. + + Args: + hidden_features: hidden features dimension. + out_features: output features dimension. + activation: activation function. + dropout: dropout rate. + """ + + def __init__( + self, + hidden_features=None, + out_features=None, + activation="gelu", + dropout=0.0, + **kwargs, + ): + super().__init__(**kwargs) + self.hidden_features = hidden_features + self.out_features = out_features + self.activation = activation + self.dropout = dropout + + def build(self, input_shape): + self.in_features = input_shape[-1] + self.hidden_features = self.hidden_features or self.in_features + self.out_features = self.out_features or self.in_features + self.fc1 = layers.Dense(self.hidden_features, name="fc1") + self.act = layers.Activation(self.activation, name="act") + self.fc2 = layers.Dense(self.out_features, name="fc2") + self.drop1 = layers.Dropout(self.dropout, name="drop1") + self.drop2 = layers.Dropout(self.dropout, name="drop2") + + def call(self, inputs, **kwargs): + x = self.fc1(inputs) + x = self.act(x) + x = self.drop1(x) + x = self.fc2(x) + x = self.drop2(x) + return x + + +""" +### Stem + +> **Notes**: In the code, this module is referred to as **PatchEmbed** but on paper, it +is referred to as **Stem**. 
+
+In the model, we have first used the `patch_embed` module. Let's try to understand this
+module. As we can see from the `call` method,
+1. This module first **pads** the input.
+2. Then it uses **convolutions** to extract patches with embeddings.
+3. Finally, it uses the `ReduceSize` module to extract further features with **convolution**,
+without increasing the channel dimension (`keepdims=True`).
+4. One important point to notice: unlike **ViT** or **SwinTransformer**, **GCViT**
+creates **overlapping patches**. We can notice that from the code,
+`Conv2D(self.embed_dim, kernel_size=3, strides=2, name='proj')`. If we wanted
+**non-overlapping** patches then we would've used the same `kernel_size` and `stride`.
+5. This module reduces the spatial dimension of the input by `4x`.
+> Summary: image → padding → convolution → (feature_extract + downsample)
+"""
+
+
+class PatchEmbed(layers.Layer):
+    """Patch embedding block.
+
+    Args:
+        embed_dim: feature size dimension.
+    """
+
+    def __init__(self, embed_dim, **kwargs):
+        super().__init__(**kwargs)
+        self.embed_dim = embed_dim
+
+    def build(self, input_shape):
+        self.pad = layers.ZeroPadding2D(1, name="pad")
+        self.proj = layers.Conv2D(self.embed_dim, 3, 2, name="proj")
+        self.conv_down = ReduceSize(keepdims=True, name="conv_down")
+
+    def call(self, inputs, **kwargs):
+        x = self.pad(inputs)
+        x = self.proj(x)
+        x = self.conv_down(x)
+        return x
+
+
+"""
+### Global Token Gen.
+
+> **Notes:** It is one of the two **CNN** modules that are used to impose inductive bias.
+
+As we can see from the cell above, in the `level` we have first used `to_q_global/Global
+Token Gen./FeatureExtraction`. Let's try to understand how it works,
+
+* This module is a series of `FeatureExtraction` modules; according to the paper, we need to
+repeat this module `K` times, where `K = log2(H/h)`, `H = feature_map_height`,
+`W = feature_map_width`, and `h, w` are the target (window) height and width.
+* `FeatureExtraction:` This layer is very similar to the `ReduceSize` module, except that it
+uses a **MaxPooling** module to reduce the dimension, it doesn't increase the feature
+dimension (channel-wise), and it doesn't use **LayerNormalization**. This module is used
+repeatedly in the `Global Token Gen.` module to generate **global tokens** for
+**global-context-attention**.
+* One important point to notice from the figure is that the **global tokens** are shared
+across the whole image, which means we use only **one global window** for **all local
+tokens** in an image. This makes the computation very efficient.
+* For an input feature map with shape `(B, H, W, C)`, we'll get output shape `(B, h, w, C)`.
+If we copy these global tokens for all `M` local windows in an image, where
+`M = (H x W)/(h x w) = num_window`, then the output shape is `(B * M, h, w, C)`.
+
+> Summary: This module is used to `resize` the image to fit the window.
+
+
+"""
+
+
+class FeatureExtraction(layers.Layer):
+    """Feature extraction block.
+
+    Args:
+        keepdims: bool argument for maintaining the resolution.
+ """ + + def __init__(self, keepdims=False, **kwargs): + super().__init__(**kwargs) + self.keepdims = keepdims + + def build(self, input_shape): + embed_dim = input_shape[-1] + self.pad1 = layers.ZeroPadding2D(1, name="pad1") + self.pad2 = layers.ZeroPadding2D(1, name="pad2") + self.conv = [ + layers.DepthwiseConv2D(3, 1, use_bias=False, name="conv_0"), + layers.Activation("gelu", name="conv_1"), + SqueezeAndExcitation(name="conv_2"), + layers.Conv2D(embed_dim, 1, 1, use_bias=False, name="conv_3"), + ] + if not self.keepdims: + self.pool = layers.MaxPool2D(3, 2, name="pool") + super().build(input_shape) + + def call(self, inputs, **kwargs): + x = inputs + xr = self.pad1(x) + for layer in self.conv: + xr = layer(xr) + x = x + xr + if not self.keepdims: + x = self.pool(self.pad2(x)) + return x + + +class GlobalQueryGenerator(layers.Layer): + """Global query generator. + + Args: + keepdims: to keep the dimension of FeatureExtraction layer. + For instance, repeating log(56/7) = 3 blocks, with input + window dimension 56 and output window dimension 7 at down-sampling + ratio 2. Please check Fig.5 of GC ViT paper for details. + """ + + def __init__(self, keepdims=False, **kwargs): + super().__init__(**kwargs) + self.keepdims = keepdims + + def build(self, input_shape): + self.to_q_global = [ + FeatureExtraction(keepdims, name=f"to_q_global_{i}") + for i, keepdims in enumerate(self.keepdims) + ] + super().build(input_shape) + + def call(self, inputs, **kwargs): + x = inputs + for layer in self.to_q_global: + x = layer(x) + return x + + +""" +### Attention + +> **Notes:** This is the core contribution of the paper. + +As we can see from the `call` method, +1. `WindowAttention` module applies both **local** and **global** window attention +depending on `global_query` parameter. + +2. First it converts input features into `query, key, value` for local attention and +`key, value` for global attention. For global attention, it takes global query from +`Global Token Gen.`. One thing to notice from the code is that we divide the **features +or embed_dim** among all the **heads of Transformer** to reduce the computation. +`qkv = tf.reshape(qkv, [B_, N, self.qkv_size, self.num_heads, C // self.num_heads])` +3. Before sending query, key and value for attention, **global token** goes through an +important process. Same global tokens or one global window gets copied for all the local +windows to increase efficiency. +`q_global = tf.repeat(q_global, repeats=B_//B, axis=0)`, here `B_//B` means `num_windows` +in a image. +4. Then simply applies `local-window-self-attention` or `global-window-attention` +depending on `global_query` parameter. One thing to notice from the code is that we are +adding **relative-positional-embedding** with the **attention mask** instead of the +**patch embedding**. +`attn = attn + relative_position_bias[tf.newaxis,]` + +5. Now, let's think for a bit and try to understand what is happening here. Let's focus +on the figure below. We can see from the left, that in the **local-attention** the +**query is local** and it's **limited to the local window** (red square border) hence we +don't have access to long-range information. But on the right that due to **global +query** we're now **not limited to local-windows** (blue square border) and we have +access to long-range information. + +6. In **ViT** we compare (attention) image-tokens with image-tokens, in +**SwinTransformer** we compare window-tokens with window-tokens but in **GCViT** we +compare image-tokens with window-tokens. 
But now you may ask, how can compare(attention) +image-tokens with window-tokens even after image-tokens have larger dimensions than +window-tokens? (from above figure image-tokens have shape `(1, 8, 8, 3)` and +window-tokens have shape `(1, 4, 4, 3)`). Yes, you are right we can't directly compare +them hence we resize image-tokens to fit window-tokens with `Global Token +Gen./FeatureExtraction` **CNN** module. The following table should give you a clear +comparison, + +| Model | Query Tokens | Key-Value Tokens | Attention Type | Attention Coverage | +|------------------|-----------------|-------------------|---------------------------|--------------------| +| ViT | image | image | self-attention | global | +| SwinTransformer | window | window | self-attention | local | +| **GCViT** | **resized-image** | **window** | **image-window attention** | **global** | + +""" + + +class WindowAttention(layers.Layer): + """Local window attention. + + This implementation was proposed by + [Liu et al., 2021](https://arxiv.org/abs/2103.14030) in SwinTransformer. + + Args: + window_size: window size. + num_heads: number of attention head. + global_query: if the input contains global_query + qkv_bias: bool argument for query, key, value learnable bias. + qk_scale: bool argument to scaling query, key. + attention_dropout: attention dropout rate. + projection_dropout: output dropout rate. + """ + + def __init__( + self, + window_size, + num_heads, + global_query, + qkv_bias=True, + qk_scale=None, + attention_dropout=0.0, + projection_dropout=0.0, + **kwargs, + ): + super().__init__(**kwargs) + window_size = (window_size, window_size) + self.window_size = window_size + self.num_heads = num_heads + self.global_query = global_query + self.qkv_bias = qkv_bias + self.qk_scale = qk_scale + self.attention_dropout = attention_dropout + self.projection_dropout = projection_dropout + + def build(self, input_shape): + embed_dim = input_shape[0][-1] + head_dim = embed_dim // self.num_heads + self.scale = self.qk_scale or head_dim**-0.5 + self.qkv_size = 3 - int(self.global_query) + self.qkv = layers.Dense( + embed_dim * self.qkv_size, use_bias=self.qkv_bias, name="qkv" + ) + self.relative_position_bias_table = self.add_weight( + name="relative_position_bias_table", + shape=[ + (2 * self.window_size[0] - 1) * (2 * self.window_size[1] - 1), + self.num_heads, + ], + initializer=keras.initializers.TruncatedNormal(stddev=0.02), + trainable=True, + dtype=self.dtype, + ) + self.attn_drop = layers.Dropout(self.attention_dropout, name="attn_drop") + self.proj = layers.Dense(embed_dim, name="proj") + self.proj_drop = layers.Dropout(self.projection_dropout, name="proj_drop") + self.softmax = layers.Activation("softmax", name="softmax") + super().build(input_shape) + + def get_relative_position_index(self): + coords_h = ops.arange(self.window_size[0]) + coords_w = ops.arange(self.window_size[1]) + coords = ops.stack(ops.meshgrid(coords_h, coords_w, indexing="ij"), axis=0) + coords_flatten = ops.reshape(coords, [2, -1]) + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] + relative_coords = ops.transpose(relative_coords, axes=[1, 2, 0]) + relative_coords_xx = relative_coords[:, :, 0] + self.window_size[0] - 1 + relative_coords_yy = relative_coords[:, :, 1] + self.window_size[1] - 1 + relative_coords_xx = relative_coords_xx * (2 * self.window_size[1] - 1) + relative_position_index = relative_coords_xx + relative_coords_yy + return relative_position_index + + def call(self, inputs, **kwargs): + if 
self.global_query: + inputs, q_global = inputs + B = ops.shape(q_global)[0] # B, N, C + else: + inputs = inputs[0] + B_, N, C = ops.shape(inputs) # B*num_window, num_tokens, channels + qkv = self.qkv(inputs) + qkv = ops.reshape( + qkv, [B_, N, self.qkv_size, self.num_heads, C // self.num_heads] + ) + qkv = ops.transpose(qkv, [2, 0, 3, 1, 4]) + if self.global_query: + k, v = ops.split( + qkv, indices_or_sections=2, axis=0 + ) # for unknown shame num=None will throw error + q_global = ops.repeat( + q_global, repeats=B_ // B, axis=0 + ) # num_windows = B_//B => q_global same for all windows in a img + q = ops.reshape(q_global, [B_, N, self.num_heads, C // self.num_heads]) + q = ops.transpose(q, axes=[0, 2, 1, 3]) + else: + q, k, v = ops.split(qkv, indices_or_sections=3, axis=0) + q = ops.squeeze(q, axis=0) + + k = ops.squeeze(k, axis=0) + v = ops.squeeze(v, axis=0) + + q = q * self.scale + attn = q @ ops.transpose(k, axes=[0, 1, 3, 2]) + relative_position_bias = ops.take( + self.relative_position_bias_table, + ops.reshape(self.get_relative_position_index(), [-1]), + ) + relative_position_bias = ops.reshape( + relative_position_bias, + [ + self.window_size[0] * self.window_size[1], + self.window_size[0] * self.window_size[1], + -1, + ], + ) + relative_position_bias = ops.transpose(relative_position_bias, axes=[2, 0, 1]) + attn = attn + relative_position_bias[None,] + attn = self.softmax(attn) + attn = self.attn_drop(attn) + + x = ops.transpose((attn @ v), axes=[0, 2, 1, 3]) + x = ops.reshape(x, [B_, N, C]) + x = self.proj_drop(self.proj(x)) + return x + + +""" +### Block + +> **Notes:** This module doesn't have any Convolutional module. + +In the `level` second module that we have used is `block`. Let's try to understand how it +works. As we can see from the `call` method, +1. `Block` module takes either only feature_maps for local attention or additional global +query for global attention. +2. Before sending feature maps for attention, this module converts **batch feature maps** +to **batch windows** as we'll be applying **Window Attention**. +3. Then we send batch **batch windows** for attention. +4. After attention has been applied we revert **batch windows** to **batch feature maps**. +5. Before sending the attention to applied features for output, this module applies +**Stochastic Depth** regularization in the residual connection. Also, before applying +**Stochastic Depth** it rescales the input with trainable parameters. Note that, this +**Stochastic Depth** block hasn't been shown in the figure of the paper. + + + + +### Window +In the `block` module, we have created **windows** before and after applying attention. +Let's try to understand how we're creating windows, +* Following module converts feature maps `(B, H, W, C)` to stacked windows +`(B x H/h x W/w, h, w, C)` โ†’ `(num_windows_batch, window_size, window_size, channel)` +* This module uses `reshape` & `transpose` to create these windows out of image instead +of iterating over them. +""" + + +class Block(layers.Layer): + """GCViT block. + + Args: + window_size: window size. + num_heads: number of attention head. + global_query: apply global window attention + mlp_ratio: MLP ratio. + qkv_bias: bool argument for query, key, value learnable bias. + qk_scale: bool argument to scaling query, key. + drop: dropout rate. + attention_dropout: attention dropout rate. + path_drop: drop path rate. + activation: activation function. + layer_scale: layer scaling coefficient. 
+ """ + + def __init__( + self, + window_size, + num_heads, + global_query, + mlp_ratio=4.0, + qkv_bias=True, + qk_scale=None, + dropout=0.0, + attention_dropout=0.0, + path_drop=0.0, + activation="gelu", + layer_scale=None, + **kwargs, + ): + super().__init__(**kwargs) + self.window_size = window_size + self.num_heads = num_heads + self.global_query = global_query + self.mlp_ratio = mlp_ratio + self.qkv_bias = qkv_bias + self.qk_scale = qk_scale + self.dropout = dropout + self.attention_dropout = attention_dropout + self.path_drop = path_drop + self.activation = activation + self.layer_scale = layer_scale + + def build(self, input_shape): + B, H, W, C = input_shape[0] + self.norm1 = layers.LayerNormalization(-1, 1e-05, name="norm1") + self.attn = WindowAttention( + window_size=self.window_size, + num_heads=self.num_heads, + global_query=self.global_query, + qkv_bias=self.qkv_bias, + qk_scale=self.qk_scale, + attention_dropout=self.attention_dropout, + projection_dropout=self.dropout, + name="attn", + ) + self.drop_path1 = DropPath(self.path_drop) + self.drop_path2 = DropPath(self.path_drop) + self.norm2 = layers.LayerNormalization(-1, 1e-05, name="norm2") + self.mlp = MLP( + hidden_features=int(C * self.mlp_ratio), + dropout=self.dropout, + activation=self.activation, + name="mlp", + ) + if self.layer_scale is not None: + self.gamma1 = self.add_weight( + name="gamma1", + shape=[C], + initializer=keras.initializers.Constant(self.layer_scale), + trainable=True, + dtype=self.dtype, + ) + self.gamma2 = self.add_weight( + name="gamma2", + shape=[C], + initializer=keras.initializers.Constant(self.layer_scale), + trainable=True, + dtype=self.dtype, + ) + else: + self.gamma1 = 1.0 + self.gamma2 = 1.0 + self.num_windows = int(H // self.window_size) * int(W // self.window_size) + super().build(input_shape) + + def call(self, inputs, **kwargs): + if self.global_query: + inputs, q_global = inputs + else: + inputs = inputs[0] + B, H, W, C = ops.shape(inputs) + x = self.norm1(inputs) + # create windows and concat them in batch axis + x = self.window_partition(x, self.window_size) # (B_, win_h, win_w, C) + # flatten patch + x = ops.reshape(x, [-1, self.window_size * self.window_size, C]) + # attention + if self.global_query: + x = self.attn([x, q_global]) + else: + x = self.attn([x]) + # reverse window partition + x = self.window_reverse(x, self.window_size, H, W, C) + # FFN + x = inputs + self.drop_path1(x * self.gamma1) + x = x + self.drop_path2(self.gamma2 * self.mlp(self.norm2(x))) + return x + + def window_partition(self, x, window_size): + """ + Args: + x: (B, H, W, C) + window_size: window size + Returns: + local window features (num_windows*B, window_size, window_size, C) + """ + B, H, W, C = ops.shape(x) + x = ops.reshape( + x, + [ + -1, + H // window_size, + window_size, + W // window_size, + window_size, + C, + ], + ) + x = ops.transpose(x, axes=[0, 1, 3, 2, 4, 5]) + windows = ops.reshape(x, [-1, window_size, window_size, C]) + return windows + + def window_reverse(self, windows, window_size, H, W, C): + """ + Args: + windows: local window features (num_windows*B, window_size, window_size, C) + window_size: Window size + H: Height of image + W: Width of image + C: Channel of image + Returns: + x: (B, H, W, C) + """ + x = ops.reshape( + windows, + [ + -1, + H // window_size, + W // window_size, + window_size, + window_size, + C, + ], + ) + x = ops.transpose(x, axes=[0, 1, 3, 2, 4, 5]) + x = ops.reshape(x, [-1, H, W, C]) + return x + + +""" +### Level + +> **Note:** This module has both 
Transformer and CNN modules. + +In the model, the second module that we have used is `level`. Let's try to understand +this module. As we can see from the `call` method, +1. First it creates **global_token** with a series of `FeatureExtraction` modules. As +we'll see +later that `FeatureExtraction` is nothing but a simple **CNN** based module. +2. Then it uses series of`Block` modules to apply **local or global window attention** +depending on depth level. +3. Finally, it uses `ReduceSize` to reduce the dimension of **contextualized features**. + +> Summary: feature_map โ†’ global_token โ†’ local/global window +attention โ†’ dowsample + + +""" + + +class Level(layers.Layer): + """GCViT level. + + Args: + depth: number of layers in each stage. + num_heads: number of heads in each stage. + window_size: window size in each stage. + keepdims: dims to keep in FeatureExtraction. + downsample: bool argument for down-sampling. + mlp_ratio: MLP ratio. + qkv_bias: bool argument for query, key, value learnable bias. + qk_scale: bool argument to scaling query, key. + drop: dropout rate. + attention_dropout: attention dropout rate. + path_drop: drop path rate. + layer_scale: layer scaling coefficient. + """ + + def __init__( + self, + depth, + num_heads, + window_size, + keepdims, + downsample=True, + mlp_ratio=4.0, + qkv_bias=True, + qk_scale=None, + dropout=0.0, + attention_dropout=0.0, + path_drop=0.0, + layer_scale=None, + **kwargs, + ): + super().__init__(**kwargs) + self.depth = depth + self.num_heads = num_heads + self.window_size = window_size + self.keepdims = keepdims + self.downsample = downsample + self.mlp_ratio = mlp_ratio + self.qkv_bias = qkv_bias + self.qk_scale = qk_scale + self.dropout = dropout + self.attention_dropout = attention_dropout + self.path_drop = path_drop + self.layer_scale = layer_scale + + def build(self, input_shape): + path_drop = ( + [self.path_drop] * self.depth + if not isinstance(self.path_drop, list) + else self.path_drop + ) + self.blocks = [ + Block( + window_size=self.window_size, + num_heads=self.num_heads, + global_query=bool(i % 2), + mlp_ratio=self.mlp_ratio, + qkv_bias=self.qkv_bias, + qk_scale=self.qk_scale, + dropout=self.dropout, + attention_dropout=self.attention_dropout, + path_drop=path_drop[i], + layer_scale=self.layer_scale, + name=f"blocks_{i}", + ) + for i in range(self.depth) + ] + self.down = ReduceSize(keepdims=False, name="downsample") + self.q_global_gen = GlobalQueryGenerator(self.keepdims, name="q_global_gen") + super().build(input_shape) + + def call(self, inputs, **kwargs): + x = inputs + q_global = self.q_global_gen(x) # shape: (B, win_size, win_size, C) + for i, blk in enumerate(self.blocks): + if i % 2: + x = blk([x, q_global]) # shape: (B, H, W, C) + else: + x = blk([x]) # shape: (B, H, W, C) + if self.downsample: + x = self.down(x) # shape: (B, H//2, W//2, 2*C) + return x + + +""" +### Model + +Let's directly jump to the model. As we can see from the `call` method, +1. It creates patch embeddings from an image. This layer doesn't flattens these +embeddings which means output of this module will be +`(batch, height/window_size, width/window_size, embed_dim)` instead of +`(batch, height x width/window_size^2, embed_dim)`. +2. Then it applies `Dropout` module which randomly sets input units to 0. +3. It passes these embeddings to series of `Level` modules which we are calling `level` +where, + 1. Global token is generated + 1. Both local & global attention is applied + 1. Finally downsample is applied. +4. 
So, output after `n` number of **levels**, shape: `(batch, width/window_size x 2^{n-1}, +width/window_size x 2^{n-1}, embed_dim x 2^{n-1})`. In the last layer, +paper doesn't use **downsample** and increase **channels**. +5. Output of above layer is normalized using `LayerNormalization` module. +6. In the head, 2D features are converted to 1D features with `Pooling` module. Output +shape after this module is `(batch, embed_dim x 2^{n-1})` +7. Finally, pooled features are sent to `Dense/Linear` module for classification. + +> Sumamry: image โ†’ (patchs + embedding) โ†’ dropout +โ†’ (attention + feature extraction) โ†’ normalizaion โ†’ +pooling โ†’ classify +""" + + +class GCViT(keras.Model): + """GCViT model. + + Args: + window_size: window size in each stage. + embed_dim: feature size dimension. + depths: number of layers in each stage. + num_heads: number of heads in each stage. + drop_rate: dropout rate. + mlp_ratio: MLP ratio. + qkv_bias: bool argument for query, key, value learnable bias. + qk_scale: bool argument to scaling query, key. + attention_dropout: attention dropout rate. + path_drop: drop path rate. + layer_scale: layer scaling coefficient. + num_classes: number of classes. + head_activation: activation function for head. + """ + + def __init__( + self, + window_size, + embed_dim, + depths, + num_heads, + drop_rate=0.0, + mlp_ratio=3.0, + qkv_bias=True, + qk_scale=None, + attention_dropout=0.0, + path_drop=0.1, + layer_scale=None, + num_classes=1000, + head_activation="softmax", + **kwargs, + ): + super().__init__(**kwargs) + self.window_size = window_size + self.embed_dim = embed_dim + self.depths = depths + self.num_heads = num_heads + self.drop_rate = drop_rate + self.mlp_ratio = mlp_ratio + self.qkv_bias = qkv_bias + self.qk_scale = qk_scale + self.attention_dropout = attention_dropout + self.path_drop = path_drop + self.layer_scale = layer_scale + self.num_classes = num_classes + self.head_activation = head_activation + + self.patch_embed = PatchEmbed(embed_dim=embed_dim, name="patch_embed") + self.pos_drop = layers.Dropout(drop_rate, name="pos_drop") + path_drops = np.linspace(0.0, path_drop, sum(depths)) + keepdims = [(0, 0, 0), (0, 0), (1,), (1,)] + self.levels = [] + for i in range(len(depths)): + path_drop = path_drops[sum(depths[:i]) : sum(depths[: i + 1])].tolist() + level = Level( + depth=depths[i], + num_heads=num_heads[i], + window_size=window_size[i], + keepdims=keepdims[i], + downsample=(i < len(depths) - 1), + mlp_ratio=mlp_ratio, + qkv_bias=qkv_bias, + qk_scale=qk_scale, + dropout=drop_rate, + attention_dropout=attention_dropout, + path_drop=path_drop, + layer_scale=layer_scale, + name=f"levels_{i}", + ) + self.levels.append(level) + self.norm = layers.LayerNormalization(axis=-1, epsilon=1e-05, name="norm") + self.pool = layers.GlobalAvgPool2D(name="pool") + self.head = layers.Dense(num_classes, name="head", activation=head_activation) + + def build(self, input_shape): + super().build(input_shape) + self.built = True + + def call(self, inputs, **kwargs): + x = self.patch_embed(inputs) # shape: (B, H, W, C) + x = self.pos_drop(x) + for level in self.levels: + x = level(x) # shape: (B, H_, W_, C_) + x = self.norm(x) + x = self.pool(x) # shape: (B, C__) + x = self.head(x) + return x + + def build_graph(self, input_shape=(224, 224, 3)): + """ + ref: https://www.kaggle.com/code/ipythonx/tf-hybrid-efficientnet-swin-transformer-gradcam + """ + x = keras.Input(shape=input_shape) + return keras.Model(inputs=[x], outputs=self.call(x), name=self.name) + + def 
summary(self, input_shape=(224, 224, 3)): + return self.build_graph(input_shape).summary() + + +""" +## Build Model + +* Let's build a complete model with all the modules that we've explained above. We'll +build **GCViT-XXTiny** model with the configuration mentioned in the paper. +* Also we'll load the ported official **pre-trained** weights and try for some +predictions. +""" + +# Model Configs +config = { + "window_size": (7, 7, 14, 7), + "embed_dim": 64, + "depths": (2, 2, 6, 2), + "num_heads": (2, 4, 8, 16), + "mlp_ratio": 3.0, + "path_drop": 0.2, +} +ckpt_link = ( + "https://github.com/awsaf49/gcvit-tf/releases/download/v1.1.6/gcvitxxtiny.keras" +) + +# Build Model +model = GCViT(**config) +inp = ops.array(np.random.uniform(size=(1, 224, 224, 3))) +out = model(inp) + +# Load Weights +ckpt_path = keras.utils.get_file(ckpt_link.split("/")[-1], ckpt_link) +model.load_weights(ckpt_path) + +# Summary +model.summary((224, 224, 3)) + +""" +## Sanity check for Pre-Trained Weights +""" + +img = keras.applications.imagenet_utils.preprocess_input( + chelsea(), mode="torch" +) # Chelsea the cat +img = ops.image.resize(img, (224, 224))[None,] # resize & create batch +pred = model(img) +pred_dec = keras.applications.imagenet_utils.decode_predictions(pred)[0] + +print("\n# Image:") +plt.figure(figsize=(6, 6)) +plt.imshow(chelsea()) +plt.show() +print() + +print("# Prediction (Top 5):") +for i in range(5): + print("{:<12} : {:0.2f}".format(pred_dec[i][1], pred_dec[i][2])) + +""" +# Fine-tune **GCViT** Model + +In the following cells, we will fine-tune **GCViT** model on Flower Dataset which +consists `104` classes. +""" + +""" +### Configs +""" + +# Model +IMAGE_SIZE = (224, 224) + +# Hyper Params +BATCH_SIZE = 32 +EPOCHS = 5 + +# Dataset +CLASSES = [ + "dandelion", + "daisy", + "tulips", + "sunflowers", + "roses", +] # don't change the order + +# Other constants +MEAN = 255 * np.array([0.485, 0.456, 0.406], dtype="float32") # imagenet mean +STD = 255 * np.array([0.229, 0.224, 0.225], dtype="float32") # imagenet std +AUTO = tf.data.AUTOTUNE + +""" +## Data Loader +""" + + +def make_dataset(dataset: tf.data.Dataset, train: bool, image_size: int = IMAGE_SIZE): + def preprocess(image, label): + # for training, do augmentation + if train: + if tf.random.uniform(shape=[]) > 0.5: + image = tf.image.flip_left_right(image) + image = tf.image.resize(image, size=image_size, method="bicubic") + image = (image - MEAN) / STD # normalization + return image, label + + if train: + dataset = dataset.shuffle(BATCH_SIZE * 10) + + return dataset.map(preprocess, AUTO).batch(BATCH_SIZE).prefetch(AUTO) + + +""" +### Flower Dataset +""" + +train_dataset, val_dataset = tfds.load( + "tf_flowers", + split=["train[:90%]", "train[90%:]"], + as_supervised=True, + try_gcs=False, # gcs_path is necessary for tpu, +) + +train_dataset = make_dataset(train_dataset, True) +val_dataset = make_dataset(val_dataset, False) + +""" +### Re-Build Model for Flower Dataset +""" + +# Re-Build Model +model = GCViT(**config, num_classes=104) +inp = ops.array(np.random.uniform(size=(1, 224, 224, 3))) +out = model(inp) + +# Load Weights +ckpt_path = keras.utils.get_file(ckpt_link.split("/")[-1], ckpt_link) +model.load_weights(ckpt_path, skip_mismatch=True) + +model.compile( + loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"] +) + +""" +### Training +""" + +history = model.fit( + train_dataset, validation_data=val_dataset, epochs=EPOCHS, verbose=1 +) + +""" +## Reference + +* [gcvit-tf - A Python library for GCViT with 
TF2.0](https://github.com/awsaf49/gcvit-tf) +* [gcvit - Official codebase for GCViT](https://github.com/NVlabs/GCVit) +""" diff --git a/knowledge_base/vision/image_classification_with_vision_transformer.py b/knowledge_base/vision/image_classification_with_vision_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..d25edd9ce86a33de40d9a0e4b77b1162f9b68263 --- /dev/null +++ b/knowledge_base/vision/image_classification_with_vision_transformer.py @@ -0,0 +1,332 @@ +""" +Title: Image classification with Vision Transformer +Author: [Khalid Salama](https://www.linkedin.com/in/khalid-salama-24403144/) +Date created: 2021/01/18 +Last modified: 2021/01/18 +Description: Implementing the Vision Transformer (ViT) model for image classification. +Accelerator: GPU +""" + +""" +## Introduction + +This example implements the [Vision Transformer (ViT)](https://arxiv.org/abs/2010.11929) +model by Alexey Dosovitskiy et al. for image classification, +and demonstrates it on the CIFAR-100 dataset. +The ViT model applies the Transformer architecture with self-attention to sequences of +image patches, without using convolution layers. + +""" + +""" +## Setup +""" + +import os + +os.environ["KERAS_BACKEND"] = "jax" # @param ["tensorflow", "jax", "torch"] + +import keras +from keras import layers +from keras import ops + +import numpy as np +import matplotlib.pyplot as plt + +""" +## Prepare the data +""" + +num_classes = 100 +input_shape = (32, 32, 3) + +(x_train, y_train), (x_test, y_test) = keras.datasets.cifar100.load_data() + +print(f"x_train shape: {x_train.shape} - y_train shape: {y_train.shape}") +print(f"x_test shape: {x_test.shape} - y_test shape: {y_test.shape}") + + +""" +## Configure the hyperparameters +""" + +learning_rate = 0.001 +weight_decay = 0.0001 +batch_size = 256 +num_epochs = 10 # For real training, use num_epochs=100. 10 is a test value +image_size = 72 # We'll resize input images to this size +patch_size = 6 # Size of the patches to be extract from the input images +num_patches = (image_size // patch_size) ** 2 +projection_dim = 64 +num_heads = 4 +transformer_units = [ + projection_dim * 2, + projection_dim, +] # Size of the transformer layers +transformer_layers = 8 +mlp_head_units = [ + 2048, + 1024, +] # Size of the dense layers of the final classifier + + +""" +## Use data augmentation +""" + +data_augmentation = keras.Sequential( + [ + layers.Normalization(), + layers.Resizing(image_size, image_size), + layers.RandomFlip("horizontal"), + layers.RandomRotation(factor=0.02), + layers.RandomZoom(height_factor=0.2, width_factor=0.2), + ], + name="data_augmentation", +) +# Compute the mean and the variance of the training data for normalization. 
+data_augmentation.layers[0].adapt(x_train) + + +""" +## Implement multilayer perceptron (MLP) +""" + + +def mlp(x, hidden_units, dropout_rate): + for units in hidden_units: + x = layers.Dense(units, activation=keras.activations.gelu)(x) + x = layers.Dropout(dropout_rate)(x) + return x + + +""" +## Implement patch creation as a layer +""" + + +class Patches(layers.Layer): + def __init__(self, patch_size): + super().__init__() + self.patch_size = patch_size + + def call(self, images): + input_shape = ops.shape(images) + batch_size = input_shape[0] + height = input_shape[1] + width = input_shape[2] + channels = input_shape[3] + num_patches_h = height // self.patch_size + num_patches_w = width // self.patch_size + patches = keras.ops.image.extract_patches(images, size=self.patch_size) + patches = ops.reshape( + patches, + ( + batch_size, + num_patches_h * num_patches_w, + self.patch_size * self.patch_size * channels, + ), + ) + return patches + + def get_config(self): + config = super().get_config() + config.update({"patch_size": self.patch_size}) + return config + + +""" +Let's display patches for a sample image +""" + +plt.figure(figsize=(4, 4)) +image = x_train[np.random.choice(range(x_train.shape[0]))] +plt.imshow(image.astype("uint8")) +plt.axis("off") + +resized_image = ops.image.resize( + ops.convert_to_tensor([image]), size=(image_size, image_size) +) +patches = Patches(patch_size)(resized_image) +print(f"Image size: {image_size} X {image_size}") +print(f"Patch size: {patch_size} X {patch_size}") +print(f"Patches per image: {patches.shape[1]}") +print(f"Elements per patch: {patches.shape[-1]}") + +n = int(np.sqrt(patches.shape[1])) +plt.figure(figsize=(4, 4)) +for i, patch in enumerate(patches[0]): + ax = plt.subplot(n, n, i + 1) + patch_img = ops.reshape(patch, (patch_size, patch_size, 3)) + plt.imshow(ops.convert_to_numpy(patch_img).astype("uint8")) + plt.axis("off") + +""" +## Implement the patch encoding layer + +The `PatchEncoder` layer will linearly transform a patch by projecting it into a +vector of size `projection_dim`. In addition, it adds a learnable position +embedding to the projected vector. +""" + + +class PatchEncoder(layers.Layer): + def __init__(self, num_patches, projection_dim): + super().__init__() + self.num_patches = num_patches + self.projection = layers.Dense(units=projection_dim) + self.position_embedding = layers.Embedding( + input_dim=num_patches, output_dim=projection_dim + ) + + def call(self, patch): + positions = ops.expand_dims( + ops.arange(start=0, stop=self.num_patches, step=1), axis=0 + ) + projected_patches = self.projection(patch) + encoded = projected_patches + self.position_embedding(positions) + return encoded + + def get_config(self): + config = super().get_config() + config.update({"num_patches": self.num_patches}) + return config + + +""" +## Build the ViT model + +The ViT model consists of multiple Transformer blocks, +which use the `layers.MultiHeadAttention` layer as a self-attention mechanism +applied to the sequence of patches. The Transformer blocks produce a +`[batch_size, num_patches, projection_dim]` tensor, which is processed via an +classifier head with softmax to produce the final class probabilities output. 
+ +Unlike the technique described in the [paper](https://arxiv.org/abs/2010.11929), +which prepends a learnable embedding to the sequence of encoded patches to serve +as the image representation, all the outputs of the final Transformer block are +reshaped with `layers.Flatten()` and used as the image +representation input to the classifier head. +Note that the `layers.GlobalAveragePooling1D` layer +could also be used instead to aggregate the outputs of the Transformer block, +especially when the number of patches and the projection dimensions are large. +""" + + +def create_vit_classifier(): + inputs = keras.Input(shape=input_shape) + # Augment data. + augmented = data_augmentation(inputs) + # Create patches. + patches = Patches(patch_size)(augmented) + # Encode patches. + encoded_patches = PatchEncoder(num_patches, projection_dim)(patches) + + # Create multiple layers of the Transformer block. + for _ in range(transformer_layers): + # Layer normalization 1. + x1 = layers.LayerNormalization(epsilon=1e-6)(encoded_patches) + # Create a multi-head attention layer. + attention_output = layers.MultiHeadAttention( + num_heads=num_heads, key_dim=projection_dim, dropout=0.1 + )(x1, x1) + # Skip connection 1. + x2 = layers.Add()([attention_output, encoded_patches]) + # Layer normalization 2. + x3 = layers.LayerNormalization(epsilon=1e-6)(x2) + # MLP. + x3 = mlp(x3, hidden_units=transformer_units, dropout_rate=0.1) + # Skip connection 2. + encoded_patches = layers.Add()([x3, x2]) + + # Create a [batch_size, projection_dim] tensor. + representation = layers.LayerNormalization(epsilon=1e-6)(encoded_patches) + representation = layers.Flatten()(representation) + representation = layers.Dropout(0.5)(representation) + # Add MLP. + features = mlp(representation, hidden_units=mlp_head_units, dropout_rate=0.5) + # Classify outputs. + logits = layers.Dense(num_classes)(features) + # Create the Keras model. 
+ model = keras.Model(inputs=inputs, outputs=logits) + return model + + +""" +## Compile, train, and evaluate the mode +""" + + +def run_experiment(model): + optimizer = keras.optimizers.AdamW( + learning_rate=learning_rate, weight_decay=weight_decay + ) + + model.compile( + optimizer=optimizer, + loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), + metrics=[ + keras.metrics.SparseCategoricalAccuracy(name="accuracy"), + keras.metrics.SparseTopKCategoricalAccuracy(5, name="top-5-accuracy"), + ], + ) + + checkpoint_filepath = "/tmp/checkpoint.weights.h5" + checkpoint_callback = keras.callbacks.ModelCheckpoint( + checkpoint_filepath, + monitor="val_accuracy", + save_best_only=True, + save_weights_only=True, + ) + + history = model.fit( + x=x_train, + y=y_train, + batch_size=batch_size, + epochs=num_epochs, + validation_split=0.1, + callbacks=[checkpoint_callback], + ) + + model.load_weights(checkpoint_filepath) + _, accuracy, top_5_accuracy = model.evaluate(x_test, y_test) + print(f"Test accuracy: {round(accuracy * 100, 2)}%") + print(f"Test top 5 accuracy: {round(top_5_accuracy * 100, 2)}%") + + return history + + +vit_classifier = create_vit_classifier() +history = run_experiment(vit_classifier) + + +def plot_history(item): + plt.plot(history.history[item], label=item) + plt.plot(history.history["val_" + item], label="val_" + item) + plt.xlabel("Epochs") + plt.ylabel(item) + plt.title("Train and Validation {} Over Epochs".format(item), fontsize=14) + plt.legend() + plt.grid() + plt.show() + + +plot_history("loss") +plot_history("top-5-accuracy") + + +""" +After 100 epochs, the ViT model achieves around 55% accuracy and +82% top-5 accuracy on the test data. These are not competitive results on the CIFAR-100 dataset, +as a ResNet50V2 trained from scratch on the same data can achieve 67% accuracy. + +Note that the state of the art results reported in the +[paper](https://arxiv.org/abs/2010.11929) are achieved by pre-training the ViT model using +the JFT-300M dataset, then fine-tuning it on the target dataset. To improve the model quality +without pre-training, you can try to train the model for more epochs, use a larger number of +Transformer layers, resize the input images, change the patch size, or increase the projection dimensions. +Besides, as mentioned in the paper, the quality of the model is affected not only by architecture choices, +but also by parameters such as the learning rate schedule, optimizer, weight decay, etc. +In practice, it's recommended to fine-tune a ViT model +that was pre-trained using a large, high-resolution dataset. +""" diff --git a/knowledge_base/vision/integrated_gradients.py b/knowledge_base/vision/integrated_gradients.py new file mode 100644 index 0000000000000000000000000000000000000000..e7d012a8b462a5a9d30afa74acd43e0cc41ebc87 --- /dev/null +++ b/knowledge_base/vision/integrated_gradients.py @@ -0,0 +1,498 @@ +""" +Title: Model interpretability with Integrated Gradients +Author: [A_K_Nain](https://twitter.com/A_K_Nain) +Date created: 2020/06/02 +Last modified: 2020/06/02 +Description: How to obtain integrated gradients for a classification model. +Accelerator: None +""" + +""" +## Integrated Gradients + +[Integrated Gradients](https://arxiv.org/abs/1703.01365) is a technique for +attributing a classification model's prediction to its input features. It is +a model interpretability technique: you can use it to visualize the relationship +between input features and model predictions. 
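+
+Formally, for an input `x`, a reference "baseline" input `x'` (discussed in the steps
+below) and a model `F`, the attribution the original paper assigns to input feature `i` is
+
+$$\mathrm{IG}_i(x) = (x_i - x'_i) \int_0^1 \frac{\partial F\big(x' + \alpha (x - x')\big)}{\partial x_i} \, d\alpha,$$
+
+which the steps below approximate with a finite sum over interpolated images.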
+ +Integrated Gradients is a variation on computing +the gradient of the prediction output with regard to features of the input. +To compute integrated gradients, we need to perform the following steps: + +1. Identify the input and the output. In our case, the input is an image and the +output is the last layer of our model (dense layer with softmax activation). + +2. Compute which features are important to a neural network +when making a prediction on a particular data point. To identify these features, we +need to choose a baseline input. A baseline input can be a black image (all pixel +values set to zero) or random noise. The shape of the baseline input needs to be +the same as our input image, e.g. (299, 299, 3). + +3. Interpolate the baseline for a given number of steps. The number of steps represents +the steps we need in the gradient approximation for a given input image. The number of +steps is a hyperparameter. The authors recommend using anywhere between +20 and 1000 steps. + +4. Preprocess these interpolated images and do a forward pass. +5. Get the gradients for these interpolated images. +6. Approximate the gradients integral using the trapezoidal rule. + +To read in-depth about integrated gradients and why this method works, +consider reading this excellent +[article](https://distill.pub/2020/attribution-baselines/). + +**References:** + +- Integrated Gradients original [paper](https://arxiv.org/abs/1703.01365) +- [Original implementation](https://github.com/ankurtaly/Integrated-Gradients) +""" + +""" +## Setup +""" + + +import numpy as np +import matplotlib.pyplot as plt +from scipy import ndimage +from IPython.display import Image, display + +import tensorflow as tf +import keras +from keras import layers +from keras.applications import xception + + +# Size of the input image +img_size = (299, 299, 3) + +# Load Xception model with imagenet weights +model = xception.Xception(weights="imagenet") + +# The local path to our target image +img_path = keras.utils.get_file("elephant.jpg", "https://i.imgur.com/Bvro0YD.png") +display(Image(img_path)) + +""" +## Integrated Gradients algorithm +""" + + +def get_img_array(img_path, size=(299, 299)): + # `img` is a PIL image of size 299x299 + img = keras.utils.load_img(img_path, target_size=size) + # `array` is a float32 Numpy array of shape (299, 299, 3) + array = keras.utils.img_to_array(img) + # We add a dimension to transform our array into a "batch" + # of size (1, 299, 299, 3) + array = np.expand_dims(array, axis=0) + return array + + +def get_gradients(img_input, top_pred_idx): + """Computes the gradients of outputs w.r.t input image. + + Args: + img_input: 4D image tensor + top_pred_idx: Predicted label for the input image + + Returns: + Gradients of the predictions w.r.t img_input + """ + images = tf.cast(img_input, tf.float32) + + with tf.GradientTape() as tape: + tape.watch(images) + preds = model(images) + top_class = preds[:, top_pred_idx] + + grads = tape.gradient(top_class, images) + return grads + + +def get_integrated_gradients(img_input, top_pred_idx, baseline=None, num_steps=50): + """Computes Integrated Gradients for a predicted label. + + Args: + img_input (ndarray): Original image + top_pred_idx: Predicted label for the input image + baseline (ndarray): The baseline image to start with for interpolation + num_steps: Number of interpolation steps between the baseline + and the input used in the computation of integrated gradients. These + steps along determine the integral approximation error. 
By default, + num_steps is set to 50. + + Returns: + Integrated gradients w.r.t input image + """ + # If baseline is not provided, start with a black image + # having same size as the input image. + if baseline is None: + baseline = np.zeros(img_size).astype(np.float32) + else: + baseline = baseline.astype(np.float32) + + # 1. Do interpolation. + img_input = img_input.astype(np.float32) + interpolated_image = [ + baseline + (step / num_steps) * (img_input - baseline) + for step in range(num_steps + 1) + ] + interpolated_image = np.array(interpolated_image).astype(np.float32) + + # 2. Preprocess the interpolated images + interpolated_image = xception.preprocess_input(interpolated_image) + + # 3. Get the gradients + grads = [] + for i, img in enumerate(interpolated_image): + img = tf.expand_dims(img, axis=0) + grad = get_gradients(img, top_pred_idx=top_pred_idx) + grads.append(grad[0]) + grads = tf.convert_to_tensor(grads, dtype=tf.float32) + + # 4. Approximate the integral using the trapezoidal rule + grads = (grads[:-1] + grads[1:]) / 2.0 + avg_grads = tf.reduce_mean(grads, axis=0) + + # 5. Calculate integrated gradients and return + integrated_grads = (img_input - baseline) * avg_grads + return integrated_grads + + +def random_baseline_integrated_gradients( + img_input, top_pred_idx, num_steps=50, num_runs=2 +): + """Generates a number of random baseline images. + + Args: + img_input (ndarray): 3D image + top_pred_idx: Predicted label for the input image + num_steps: Number of interpolation steps between the baseline + and the input used in the computation of integrated gradients. These + steps along determine the integral approximation error. By default, + num_steps is set to 50. + num_runs: number of baseline images to generate + + Returns: + Averaged integrated gradients for `num_runs` baseline images + """ + # 1. List to keep track of Integrated Gradients (IG) for all the images + integrated_grads = [] + + # 2. Get the integrated gradients for all the baselines + for run in range(num_runs): + baseline = np.random.random(img_size) * 255 + igrads = get_integrated_gradients( + img_input=img_input, + top_pred_idx=top_pred_idx, + baseline=baseline, + num_steps=num_steps, + ) + integrated_grads.append(igrads) + + # 3. Return the average integrated gradients for the image + integrated_grads = tf.convert_to_tensor(integrated_grads) + return tf.reduce_mean(integrated_grads, axis=0) + + +""" +## Helper class for visualizing gradients and integrated gradients +""" + + +class GradVisualizer: + """Plot gradients of the outputs w.r.t an input image.""" + + def __init__(self, positive_channel=None, negative_channel=None): + if positive_channel is None: + self.positive_channel = [0, 255, 0] + else: + self.positive_channel = positive_channel + + if negative_channel is None: + self.negative_channel = [255, 0, 0] + else: + self.negative_channel = negative_channel + + def apply_polarity(self, attributions, polarity): + if polarity == "positive": + return np.clip(attributions, 0, 1) + else: + return np.clip(attributions, -1, 0) + + def apply_linear_transformation( + self, + attributions, + clip_above_percentile=99.9, + clip_below_percentile=70.0, + lower_end=0.2, + ): + # 1. Get the thresholds + m = self.get_thresholded_attributions( + attributions, percentage=100 - clip_above_percentile + ) + e = self.get_thresholded_attributions( + attributions, percentage=100 - clip_below_percentile + ) + + # 2. 
Transform the attributions by a linear function f(x) = a*x + b such that + # f(m) = 1.0 and f(e) = lower_end + transformed_attributions = (1 - lower_end) * (np.abs(attributions) - e) / ( + m - e + ) + lower_end + + # 3. Make sure that the sign of transformed attributions is the same as original attributions + transformed_attributions *= np.sign(attributions) + + # 4. Only keep values that are bigger than the lower_end + transformed_attributions *= transformed_attributions >= lower_end + + # 5. Clip values and return + transformed_attributions = np.clip(transformed_attributions, 0.0, 1.0) + return transformed_attributions + + def get_thresholded_attributions(self, attributions, percentage): + if percentage == 100.0: + return np.min(attributions) + + # 1. Flatten the attributions + flatten_attr = attributions.flatten() + + # 2. Get the sum of the attributions + total = np.sum(flatten_attr) + + # 3. Sort the attributions from largest to smallest. + sorted_attributions = np.sort(np.abs(flatten_attr))[::-1] + + # 4. Calculate the percentage of the total sum that each attribution + # and the values about it contribute. + cum_sum = 100.0 * np.cumsum(sorted_attributions) / total + + # 5. Threshold the attributions by the percentage + indices_to_consider = np.where(cum_sum >= percentage)[0][0] + + # 6. Select the desired attributions and return + attributions = sorted_attributions[indices_to_consider] + return attributions + + def binarize(self, attributions, threshold=0.001): + return attributions > threshold + + def morphological_cleanup_fn(self, attributions, structure=np.ones((4, 4))): + closed = ndimage.grey_closing(attributions, structure=structure) + opened = ndimage.grey_opening(closed, structure=structure) + return opened + + def draw_outlines( + self, + attributions, + percentage=90, + connected_component_structure=np.ones((3, 3)), + ): + # 1. Binarize the attributions. + attributions = self.binarize(attributions) + + # 2. Fill the gaps + attributions = ndimage.binary_fill_holes(attributions) + + # 3. Compute connected components + connected_components, num_comp = ndimage.label( + attributions, structure=connected_component_structure + ) + + # 4. Sum up the attributions for each component + total = np.sum(attributions[connected_components > 0]) + component_sums = [] + for comp in range(1, num_comp + 1): + mask = connected_components == comp + component_sum = np.sum(attributions[mask]) + component_sums.append((component_sum, mask)) + + # 5. Compute the percentage of top components to keep + sorted_sums_and_masks = sorted(component_sums, key=lambda x: x[0], reverse=True) + sorted_sums = list(zip(*sorted_sums_and_masks))[0] + cumulative_sorted_sums = np.cumsum(sorted_sums) + cutoff_threshold = percentage * total / 100 + cutoff_idx = np.where(cumulative_sorted_sums >= cutoff_threshold)[0][0] + if cutoff_idx > 2: + cutoff_idx = 2 + + # 6. Set the values for the kept components + border_mask = np.zeros_like(attributions) + for i in range(cutoff_idx + 1): + border_mask[sorted_sums_and_masks[i][1]] = 1 + + # 7. Make the mask hollow and show only the border + eroded_mask = ndimage.binary_erosion(border_mask, iterations=1) + border_mask[eroded_mask] = 0 + + # 8. 
Return the outlined mask + return border_mask + + def process_grads( + self, + image, + attributions, + polarity="positive", + clip_above_percentile=99.9, + clip_below_percentile=0, + morphological_cleanup=False, + structure=np.ones((3, 3)), + outlines=False, + outlines_component_percentage=90, + overlay=True, + ): + if polarity not in ["positive", "negative"]: + raise ValueError( + f""" Allowed polarity values: 'positive' or 'negative' + but provided {polarity}""" + ) + if clip_above_percentile < 0 or clip_above_percentile > 100: + raise ValueError("clip_above_percentile must be in [0, 100]") + + if clip_below_percentile < 0 or clip_below_percentile > 100: + raise ValueError("clip_below_percentile must be in [0, 100]") + + # 1. Apply polarity + if polarity == "positive": + attributions = self.apply_polarity(attributions, polarity=polarity) + channel = self.positive_channel + else: + attributions = self.apply_polarity(attributions, polarity=polarity) + attributions = np.abs(attributions) + channel = self.negative_channel + + # 2. Take average over the channels + attributions = np.average(attributions, axis=2) + + # 3. Apply linear transformation to the attributions + attributions = self.apply_linear_transformation( + attributions, + clip_above_percentile=clip_above_percentile, + clip_below_percentile=clip_below_percentile, + lower_end=0.0, + ) + + # 4. Cleanup + if morphological_cleanup: + attributions = self.morphological_cleanup_fn( + attributions, structure=structure + ) + # 5. Draw the outlines + if outlines: + attributions = self.draw_outlines( + attributions, percentage=outlines_component_percentage + ) + + # 6. Expand the channel axis and convert to RGB + attributions = np.expand_dims(attributions, 2) * channel + + # 7.Superimpose on the original image + if overlay: + attributions = np.clip((attributions * 0.8 + image), 0, 255) + return attributions + + def visualize( + self, + image, + gradients, + integrated_gradients, + polarity="positive", + clip_above_percentile=99.9, + clip_below_percentile=0, + morphological_cleanup=False, + structure=np.ones((3, 3)), + outlines=False, + outlines_component_percentage=90, + overlay=True, + figsize=(15, 8), + ): + # 1. Make two copies of the original image + img1 = np.copy(image) + img2 = np.copy(image) + + # 2. Process the normal gradients + grads_attr = self.process_grads( + image=img1, + attributions=gradients, + polarity=polarity, + clip_above_percentile=clip_above_percentile, + clip_below_percentile=clip_below_percentile, + morphological_cleanup=morphological_cleanup, + structure=structure, + outlines=outlines, + outlines_component_percentage=outlines_component_percentage, + overlay=overlay, + ) + + # 3. Process the integrated gradients + igrads_attr = self.process_grads( + image=img2, + attributions=integrated_gradients, + polarity=polarity, + clip_above_percentile=clip_above_percentile, + clip_below_percentile=clip_below_percentile, + morphological_cleanup=morphological_cleanup, + structure=structure, + outlines=outlines, + outlines_component_percentage=outlines_component_percentage, + overlay=overlay, + ) + + _, ax = plt.subplots(1, 3, figsize=figsize) + ax[0].imshow(image) + ax[1].imshow(grads_attr.astype(np.uint8)) + ax[2].imshow(igrads_attr.astype(np.uint8)) + + ax[0].set_title("Input") + ax[1].set_title("Normal gradients") + ax[2].set_title("Integrated gradients") + plt.show() + + +""" +## Let's test-drive it +""" + +# 1. Convert the image to numpy array +img = get_img_array(img_path) + +# 2. 
Keep a copy of the original image +orig_img = np.copy(img[0]).astype(np.uint8) + +# 3. Preprocess the image +img_processed = tf.cast(xception.preprocess_input(img), dtype=tf.float32) + +# 4. Get model predictions +preds = model.predict(img_processed) +top_pred_idx = tf.argmax(preds[0]) +print("Predicted:", top_pred_idx, xception.decode_predictions(preds, top=1)[0]) + +# 5. Get the gradients of the last layer for the predicted label +grads = get_gradients(img_processed, top_pred_idx=top_pred_idx) + +# 6. Get the integrated gradients +igrads = random_baseline_integrated_gradients( + np.copy(orig_img), top_pred_idx=top_pred_idx, num_steps=50, num_runs=2 +) + +# 7. Process the gradients and plot +vis = GradVisualizer() +vis.visualize( + image=orig_img, + gradients=grads[0].numpy(), + integrated_gradients=igrads.numpy(), + clip_above_percentile=99, + clip_below_percentile=0, +) + +vis.visualize( + image=orig_img, + gradients=grads[0].numpy(), + integrated_gradients=igrads.numpy(), + clip_above_percentile=95, + clip_below_percentile=28, + morphological_cleanup=True, + outlines=True, +) diff --git a/knowledge_base/vision/involution.py b/knowledge_base/vision/involution.py new file mode 100644 index 0000000000000000000000000000000000000000..571bf00b7ead6ab5f32c253bc7927c47dc86a9f2 --- /dev/null +++ b/knowledge_base/vision/involution.py @@ -0,0 +1,483 @@ +""" +Title: Involutional neural networks +Author: [Aritra Roy Gosthipaty](https://twitter.com/ariG23498) +Date created: 2021/07/25 +Last modified: 2021/07/25 +Description: Deep dive into location-specific and channel-agnostic "involution" kernels. +Accelerator: GPU +""" + +""" +## Introduction + +Convolution has been the basis of most modern neural +networks for computer vision. A convolution kernel is +spatial-agnostic and channel-specific. Because of this, it isn't able +to adapt to different visual patterns with respect to +different spatial locations. Along with location-related problems, the +receptive field of convolution creates challenges with regard to capturing +long-range spatial interactions. + +To address the above issues, Li et. al. rethink the properties +of convolution in +[Involution: Inverting the Inherence of Convolution for VisualRecognition](https://arxiv.org/abs/2103.06255). +The authors propose the "involution kernel", that is location-specific and +channel-agnostic. Due to the location-specific nature of the operation, +the authors say that self-attention falls under the design paradigm of +involution. + +This example describes the involution kernel, compares two image +classification models, one with convolution and the other with +involution, and also tries drawing a parallel with the self-attention +layer. +""" + +""" +## Setup +""" + +import os + +os.environ["KERAS_BACKEND"] = "tensorflow" + +import tensorflow as tf +import keras +import matplotlib.pyplot as plt + +# Set seed for reproducibility. +tf.random.set_seed(42) + +""" +## Convolution + +Convolution remains the mainstay of deep neural networks for computer vision. +To understand Involution, it is necessary to talk about the +convolution operation. + +![Imgur](https://i.imgur.com/MSKLsm5.png) + +Consider an input tensor **X** with dimensions **H**, **W** and +**C_in**. We take a collection of **C_out** convolution kernels each of +shape **K**, **K**, **C_in**. With the multiply-add operation between +the input tensor and the kernels we obtain an output tensor **Y** with +dimensions **H**, **W**, **C_out**. + +In the diagram above `C_out=3`. 
This makes the output tensor of shape H, +W and 3. One can notice that the convoltuion kernel does not depend on +the spatial position of the input tensor which makes it +**location-agnostic**. On the other hand, each channel in the output +tensor is based on a specific convolution filter which makes is +**channel-specific**. +""" + +""" +## Involution + +The idea is to have an operation that is both **location-specific** +and **channel-agnostic**. Trying to implement these specific properties poses +a challenge. With a fixed number of involution kernels (for each +spatial position) we will **not** be able to process variable-resolution +input tensors. + +To solve this problem, the authors have considered *generating* each +kernel conditioned on specific spatial positions. With this method, we +should be able to process variable-resolution input tensors with ease. +The diagram below provides an intuition on this kernel generation +method. + +![Imgur](https://i.imgur.com/jtrGGQg.png) +""" + + +class Involution(keras.layers.Layer): + def __init__( + self, channel, group_number, kernel_size, stride, reduction_ratio, name + ): + super().__init__(name=name) + + # Initialize the parameters. + self.channel = channel + self.group_number = group_number + self.kernel_size = kernel_size + self.stride = stride + self.reduction_ratio = reduction_ratio + + def build(self, input_shape): + # Get the shape of the input. + (_, height, width, num_channels) = input_shape + + # Scale the height and width with respect to the strides. + height = height // self.stride + width = width // self.stride + + # Define a layer that average pools the input tensor + # if stride is more than 1. + self.stride_layer = ( + keras.layers.AveragePooling2D( + pool_size=self.stride, strides=self.stride, padding="same" + ) + if self.stride > 1 + else tf.identity + ) + # Define the kernel generation layer. + self.kernel_gen = keras.Sequential( + [ + keras.layers.Conv2D( + filters=self.channel // self.reduction_ratio, kernel_size=1 + ), + keras.layers.BatchNormalization(), + keras.layers.ReLU(), + keras.layers.Conv2D( + filters=self.kernel_size * self.kernel_size * self.group_number, + kernel_size=1, + ), + ] + ) + # Define reshape layers + self.kernel_reshape = keras.layers.Reshape( + target_shape=( + height, + width, + self.kernel_size * self.kernel_size, + 1, + self.group_number, + ) + ) + self.input_patches_reshape = keras.layers.Reshape( + target_shape=( + height, + width, + self.kernel_size * self.kernel_size, + num_channels // self.group_number, + self.group_number, + ) + ) + self.output_reshape = keras.layers.Reshape( + target_shape=(height, width, num_channels) + ) + + def call(self, x): + # Generate the kernel with respect to the input tensor. + # B, H, W, K*K*G + kernel_input = self.stride_layer(x) + kernel = self.kernel_gen(kernel_input) + + # reshape the kerenl + # B, H, W, K*K, 1, G + kernel = self.kernel_reshape(kernel) + + # Extract input patches. + # B, H, W, K*K*C + input_patches = tf.image.extract_patches( + images=x, + sizes=[1, self.kernel_size, self.kernel_size, 1], + strides=[1, self.stride, self.stride, 1], + rates=[1, 1, 1, 1], + padding="SAME", + ) + + # Reshape the input patches to align with later operations. + # B, H, W, K*K, C//G, G + input_patches = self.input_patches_reshape(input_patches) + + # Compute the multiply-add operation of kernels and patches. 
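+        # The kernel has shape (B, H, W, K*K, 1, G) and the patches have shape
+        # (B, H, W, K*K, C//G, G); the singleton axis lets each kernel value
+        # broadcast across the C//G channels of its group.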
+ # B, H, W, K*K, C//G, G + output = tf.multiply(kernel, input_patches) + # B, H, W, C//G, G + output = tf.reduce_sum(output, axis=3) + + # Reshape the output kernel. + # B, H, W, C + output = self.output_reshape(output) + + # Return the output tensor and the kernel. + return output, kernel + + +""" +## Testing the Involution layer +""" + +# Define the input tensor. +input_tensor = tf.random.normal((32, 256, 256, 3)) + +# Compute involution with stride 1. +output_tensor, _ = Involution( + channel=3, group_number=1, kernel_size=5, stride=1, reduction_ratio=1, name="inv_1" +)(input_tensor) +print(f"with stride 1 ouput shape: {output_tensor.shape}") + +# Compute involution with stride 2. +output_tensor, _ = Involution( + channel=3, group_number=1, kernel_size=5, stride=2, reduction_ratio=1, name="inv_2" +)(input_tensor) +print(f"with stride 2 ouput shape: {output_tensor.shape}") + +# Compute involution with stride 1, channel 16 and reduction ratio 2. +output_tensor, _ = Involution( + channel=16, group_number=1, kernel_size=5, stride=1, reduction_ratio=2, name="inv_3" +)(input_tensor) +print( + "with channel 16 and reduction ratio 2 ouput shape: {}".format(output_tensor.shape) +) + +""" +## Image Classification + +In this section, we will build an image-classifier model. There will +be two models one with convolutions and the other with involutions. + +The image-classification model is heavily inspired by this +[Convolutional Neural Network (CNN)](https://www.tensorflow.org/tutorials/images/cnn) +tutorial from Google. +""" + +""" +## Get the CIFAR10 Dataset +""" + +# Load the CIFAR10 dataset. +print("loading the CIFAR10 dataset...") +( + (train_images, train_labels), + ( + test_images, + test_labels, + ), +) = keras.datasets.cifar10.load_data() + +# Normalize pixel values to be between 0 and 1. +(train_images, test_images) = (train_images / 255.0, test_images / 255.0) + +# Shuffle and batch the dataset. +train_ds = ( + tf.data.Dataset.from_tensor_slices((train_images, train_labels)) + .shuffle(256) + .batch(256) +) +test_ds = tf.data.Dataset.from_tensor_slices((test_images, test_labels)).batch(256) + +""" +## Visualise the data +""" + +class_names = [ + "airplane", + "automobile", + "bird", + "cat", + "deer", + "dog", + "frog", + "horse", + "ship", + "truck", +] + +plt.figure(figsize=(10, 10)) +for i in range(25): + plt.subplot(5, 5, i + 1) + plt.xticks([]) + plt.yticks([]) + plt.grid(False) + plt.imshow(train_images[i]) + plt.xlabel(class_names[train_labels[i][0]]) +plt.show() + +""" +## Convolutional Neural Network +""" + +# Build the conv model. +print("building the convolution model...") +conv_model = keras.Sequential( + [ + keras.layers.Conv2D(32, (3, 3), input_shape=(32, 32, 3), padding="same"), + keras.layers.ReLU(name="relu1"), + keras.layers.MaxPooling2D((2, 2)), + keras.layers.Conv2D(64, (3, 3), padding="same"), + keras.layers.ReLU(name="relu2"), + keras.layers.MaxPooling2D((2, 2)), + keras.layers.Conv2D(64, (3, 3), padding="same"), + keras.layers.ReLU(name="relu3"), + keras.layers.Flatten(), + keras.layers.Dense(64, activation="relu"), + keras.layers.Dense(10), + ] +) + +# Compile the mode with the necessary loss function and optimizer. +print("compiling the convolution model...") +conv_model.compile( + optimizer="adam", + loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), + metrics=["accuracy"], +) + +# Train the model. 
+print("conv model training...") +conv_hist = conv_model.fit(train_ds, epochs=20, validation_data=test_ds) + +""" +## Involutional Neural Network +""" + +# Build the involution model. +print("building the involution model...") + +inputs = keras.Input(shape=(32, 32, 3)) +x, _ = Involution( + channel=3, group_number=1, kernel_size=3, stride=1, reduction_ratio=2, name="inv_1" +)(inputs) +x = keras.layers.ReLU()(x) +x = keras.layers.MaxPooling2D((2, 2))(x) +x, _ = Involution( + channel=3, group_number=1, kernel_size=3, stride=1, reduction_ratio=2, name="inv_2" +)(x) +x = keras.layers.ReLU()(x) +x = keras.layers.MaxPooling2D((2, 2))(x) +x, _ = Involution( + channel=3, group_number=1, kernel_size=3, stride=1, reduction_ratio=2, name="inv_3" +)(x) +x = keras.layers.ReLU()(x) +x = keras.layers.Flatten()(x) +x = keras.layers.Dense(64, activation="relu")(x) +outputs = keras.layers.Dense(10)(x) + +inv_model = keras.Model(inputs=[inputs], outputs=[outputs], name="inv_model") + +# Compile the mode with the necessary loss function and optimizer. +print("compiling the involution model...") +inv_model.compile( + optimizer="adam", + loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), + metrics=["accuracy"], +) + +# train the model +print("inv model training...") +inv_hist = inv_model.fit(train_ds, epochs=20, validation_data=test_ds) + +""" +## Comparisons + +In this section, we will be looking at both the models and compare a +few pointers. +""" + +""" +### Parameters + +One can see that with a similar architecture the parameters in a CNN +is much larger than that of an INN (Involutional Neural Network). +""" + +conv_model.summary() + +inv_model.summary() + +""" +### Loss and Accuracy Plots + +Here, the loss and the accuracy plots demonstrate that INNs are slow +learners (with lower parameters). +""" + +plt.figure(figsize=(20, 5)) + +plt.subplot(1, 2, 1) +plt.title("Convolution Loss") +plt.plot(conv_hist.history["loss"], label="loss") +plt.plot(conv_hist.history["val_loss"], label="val_loss") +plt.legend() + +plt.subplot(1, 2, 2) +plt.title("Involution Loss") +plt.plot(inv_hist.history["loss"], label="loss") +plt.plot(inv_hist.history["val_loss"], label="val_loss") +plt.legend() + +plt.show() + +plt.figure(figsize=(20, 5)) + +plt.subplot(1, 2, 1) +plt.title("Convolution Accuracy") +plt.plot(conv_hist.history["accuracy"], label="accuracy") +plt.plot(conv_hist.history["val_accuracy"], label="val_accuracy") +plt.legend() + +plt.subplot(1, 2, 2) +plt.title("Involution Accuracy") +plt.plot(inv_hist.history["accuracy"], label="accuracy") +plt.plot(inv_hist.history["val_accuracy"], label="val_accuracy") +plt.legend() + +plt.show() + +""" +## Visualizing Involution Kernels + +To visualize the kernels, we take the sum of **Kร—K** values from each +involution kernel. **All the representatives at different spatial +locations frame the corresponding heat map.** + +The authors mention: + +"Our proposed involution is reminiscent of self-attention and +essentially could become a generalized version of it." + +With the visualization of the kernel we can indeed obtain an attention +map of the image. The learned involution kernels provides attention to +individual spatial positions of the input tensor. The +**location-specific** property makes involution a generic space of models +in which self-attention belongs. 
+""" + +layer_names = ["inv_1", "inv_2", "inv_3"] +outputs = [inv_model.get_layer(name).output[1] for name in layer_names] +vis_model = keras.Model(inv_model.input, outputs) + +fig, axes = plt.subplots(nrows=10, ncols=4, figsize=(10, 30)) + +for ax, test_image in zip(axes, test_images[:10]): + (inv1_kernel, inv2_kernel, inv3_kernel) = vis_model.predict(test_image[None, ...]) + inv1_kernel = tf.reduce_sum(inv1_kernel, axis=[-1, -2, -3]) + inv2_kernel = tf.reduce_sum(inv2_kernel, axis=[-1, -2, -3]) + inv3_kernel = tf.reduce_sum(inv3_kernel, axis=[-1, -2, -3]) + + ax[0].imshow(keras.utils.array_to_img(test_image)) + ax[0].set_title("Input Image") + + ax[1].imshow(keras.utils.array_to_img(inv1_kernel[0, ..., None])) + ax[1].set_title("Involution Kernel 1") + + ax[2].imshow(keras.utils.array_to_img(inv2_kernel[0, ..., None])) + ax[2].set_title("Involution Kernel 2") + + ax[3].imshow(keras.utils.array_to_img(inv3_kernel[0, ..., None])) + ax[3].set_title("Involution Kernel 3") + +""" +## Conclusions + +In this example, the main focus was to build an `Involution` layer which +can be easily reused. While our comparisons were based on a specific +task, feel free to use the layer for different tasks and report your +results. + +According to me, the key take-away of involution is its +relationship with self-attention. The intuition behind location-specific +and channel-spefic processing makes sense in a lot of tasks. + +Moving forward one can: + +- Look at [Yannick's video](https://youtu.be/pH2jZun8MoY) on + involution for a better understanding. +- Experiment with the various hyperparameters of the involution layer. +- Build different models with the involution layer. +- Try building a different kernel generation method altogether. + +You can use the trained model hosted on [Hugging Face Hub](https://huggingface.co/keras-io/involution) +and try the demo on [Hugging Face Spaces](https://huggingface.co/spaces/keras-io/involution). +""" diff --git a/knowledge_base/vision/keypoint_detection.py b/knowledge_base/vision/keypoint_detection.py new file mode 100644 index 0000000000000000000000000000000000000000..13a7a69ac4801b3f1d4d75cf1f08c9f6ac6365f2 --- /dev/null +++ b/knowledge_base/vision/keypoint_detection.py @@ -0,0 +1,459 @@ +""" +Title: Keypoint Detection with Transfer Learning +Author: [Sayak Paul](https://twitter.com/RisingSayak), converted to Keras 3 by [Muhammad Anas Raza](https://anasrz.com) +Date created: 2021/05/02 +Last modified: 2023/07/19 +Description: Training a keypoint detector with data augmentation and transfer learning. +Accelerator: GPU +""" + +""" +Keypoint detection consists of locating key object parts. For example, the key parts +of our faces include nose tips, eyebrows, eye corners, and so on. These parts help to +represent the underlying object in a feature-rich manner. Keypoint detection has +applications that include pose estimation, face detection, etc. + +In this example, we will build a keypoint detector using the +[StanfordExtra dataset](https://github.com/benjiebob/StanfordExtra), +using transfer learning. This example requires TensorFlow 2.4 or higher, +as well as [`imgaug`](https://imgaug.readthedocs.io/) library, +which can be installed using the following command: +""" + +"""shell +pip install -q -U imgaug +""" + +""" +## Data collection +""" + +""" +The StanfordExtra dataset contains 12,000 images of dogs together with keypoints and +segmentation maps. It is developed from the [Stanford dogs dataset](http://vision.stanford.edu/aditya86/ImageNetDogs/). 
+It can be downloaded with the command below: +""" + +"""shell +wget -q http://vision.stanford.edu/aditya86/ImageNetDogs/images.tar +""" + +""" +Annotations are provided as a single JSON file in the StanfordExtra dataset and one needs +to fill [this form](https://forms.gle/sRtbicgxsWvRtRmUA) to get access to it. The +authors explicitly instruct users not to share the JSON file, and this example respects this wish: +you should obtain the JSON file yourself. + +The JSON file is expected to be locally available as `stanfordextra_v12.zip`. + +After the files are downloaded, we can extract the archives. +""" + +"""shell +tar xf images.tar +unzip -qq ~/stanfordextra_v12.zip +""" + +""" +## Imports +""" +from keras import layers +import keras + +from imgaug.augmentables.kps import KeypointsOnImage +from imgaug.augmentables.kps import Keypoint +import imgaug.augmenters as iaa + +from PIL import Image +from sklearn.model_selection import train_test_split +from matplotlib import pyplot as plt +import pandas as pd +import numpy as np +import json +import os + +""" +## Define hyperparameters +""" + +IMG_SIZE = 224 +BATCH_SIZE = 64 +EPOCHS = 5 +NUM_KEYPOINTS = 24 * 2 # 24 pairs each having x and y coordinates + +""" +## Load data + +The authors also provide a metadata file that specifies additional information about the +keypoints, like color information, animal pose name, etc. We will load this file in a `pandas` +dataframe to extract information for visualization purposes. +""" + +IMG_DIR = "Images" +JSON = "StanfordExtra_V12/StanfordExtra_v12.json" +KEYPOINT_DEF = ( + "https://github.com/benjiebob/StanfordExtra/raw/master/keypoint_definitions.csv" +) + +# Load the ground-truth annotations. +with open(JSON) as infile: + json_data = json.load(infile) + +# Set up a dictionary, mapping all the ground-truth information +# with respect to the path of the image. +json_dict = {i["img_path"]: i for i in json_data} + +""" +A single entry of `json_dict` looks like the following: + +``` +'n02085782-Japanese_spaniel/n02085782_2886.jpg': +{'img_bbox': [205, 20, 116, 201], + 'img_height': 272, + 'img_path': 'n02085782-Japanese_spaniel/n02085782_2886.jpg', + 'img_width': 350, + 'is_multiple_dogs': False, + 'joints': [[108.66666666666667, 252.0, 1], + [147.66666666666666, 229.0, 1], + [163.5, 208.5, 1], + [0, 0, 0], + [0, 0, 0], + [0, 0, 0], + [54.0, 244.0, 1], + [77.33333333333333, 225.33333333333334, 1], + [79.0, 196.5, 1], + [0, 0, 0], + [0, 0, 0], + [0, 0, 0], + [0, 0, 0], + [0, 0, 0], + [150.66666666666666, 86.66666666666667, 1], + [88.66666666666667, 73.0, 1], + [116.0, 106.33333333333333, 1], + [109.0, 123.33333333333333, 1], + [0, 0, 0], + [0, 0, 0], + [0, 0, 0], + [0, 0, 0], + [0, 0, 0], + [0, 0, 0]], + 'seg': ...} +``` +""" + +""" +In this example, the keys we are interested in are: + +* `img_path` +* `joints` + +There are a total of 24 entries present inside `joints`. Each entry has 3 values: + +* x-coordinate +* y-coordinate +* visibility flag of the keypoints (1 indicates visibility and 0 indicates non-visibility) + +As we can see `joints` contain multiple `[0, 0, 0]` entries which denote that those +keypoints were not labeled. In this example, we will consider both non-visible as well as +unlabeled keypoints in order to allow mini-batch learning. +""" + +# Load the metdata definition file and preview it. +keypoint_def = pd.read_csv(KEYPOINT_DEF) +keypoint_def.head() + +# Extract the colours and labels. 
+colours = keypoint_def["Hex colour"].values.tolist() +colours = ["#" + colour for colour in colours] +labels = keypoint_def["Name"].values.tolist() + + +# Utility for reading an image and for getting its annotations. +def get_dog(name): + data = json_dict[name] + img_data = plt.imread(os.path.join(IMG_DIR, data["img_path"])) + # If the image is RGBA convert it to RGB. + if img_data.shape[-1] == 4: + img_data = img_data.astype(np.uint8) + img_data = Image.fromarray(img_data) + img_data = np.array(img_data.convert("RGB")) + data["img_data"] = img_data + + return data + + +""" +## Visualize data + +Now, we write a utility function to visualize the images and their keypoints. +""" + + +# Parts of this code come from here: +# https://github.com/benjiebob/StanfordExtra/blob/master/demo.ipynb +def visualize_keypoints(images, keypoints): + fig, axes = plt.subplots(nrows=len(images), ncols=2, figsize=(16, 12)) + [ax.axis("off") for ax in np.ravel(axes)] + + for (ax_orig, ax_all), image, current_keypoint in zip(axes, images, keypoints): + ax_orig.imshow(image) + ax_all.imshow(image) + + # If the keypoints were formed by `imgaug` then the coordinates need + # to be iterated differently. + if isinstance(current_keypoint, KeypointsOnImage): + for idx, kp in enumerate(current_keypoint.keypoints): + ax_all.scatter( + [kp.x], + [kp.y], + c=colours[idx], + marker="x", + s=50, + linewidths=5, + ) + else: + current_keypoint = np.array(current_keypoint) + # Since the last entry is the visibility flag, we discard it. + current_keypoint = current_keypoint[:, :2] + for idx, (x, y) in enumerate(current_keypoint): + ax_all.scatter([x], [y], c=colours[idx], marker="x", s=50, linewidths=5) + + plt.tight_layout(pad=2.0) + plt.show() + + +# Select four samples randomly for visualization. +samples = list(json_dict.keys()) +num_samples = 4 +selected_samples = np.random.choice(samples, num_samples, replace=False) + +images, keypoints = [], [] + +for sample in selected_samples: + data = get_dog(sample) + image = data["img_data"] + keypoint = data["joints"] + + images.append(image) + keypoints.append(keypoint) + +visualize_keypoints(images, keypoints) + +""" +The plots show that we have images of non-uniform sizes, which is expected in most +real-world scenarios. However, if we resize these images to have a uniform shape (for +instance (224 x 224)) their ground-truth annotations will also be affected. The same +applies if we apply any geometric transformation (horizontal flip, for e.g.) to an image. +Fortunately, `imgaug` provides utilities that can handle this issue. +In the next section, we will write a data generator inheriting the +[`keras.utils.Sequence`](https://keras.io/api/utils/python_utils/#sequence-class) class +that applies data augmentation on batches of data using `imgaug`. 
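+
+Before moving on, here is a minimal sketch (reusing the `get_dog` utility defined
+above; the resize-only pipeline below exists just for this illustration) of how
+`imgaug` transforms an image and its keypoints together:
+
+```
+from imgaug.augmentables.kps import Keypoint, KeypointsOnImage
+import imgaug.augmenters as iaa
+
+data = get_dog(selected_samples[0])  # any sample key works here
+kps = KeypointsOnImage(
+    [Keypoint(x=x, y=y) for x, y, _ in data["joints"]],
+    shape=data["img_data"].shape,
+)
+resize_only = iaa.Resize(IMG_SIZE, interpolation="linear")
+new_image, new_kps = resize_only(image=data["img_data"], keypoints=kps)
+# The keypoint coordinates are rescaled consistently with the resized image.
+```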
+""" + +""" +## Prepare data generator +""" + + +class KeyPointsDataset(keras.utils.PyDataset): + def __init__(self, image_keys, aug, batch_size=BATCH_SIZE, train=True, **kwargs): + super().__init__(**kwargs) + self.image_keys = image_keys + self.aug = aug + self.batch_size = batch_size + self.train = train + self.on_epoch_end() + + def __len__(self): + return len(self.image_keys) // self.batch_size + + def on_epoch_end(self): + self.indexes = np.arange(len(self.image_keys)) + if self.train: + np.random.shuffle(self.indexes) + + def __getitem__(self, index): + indexes = self.indexes[index * self.batch_size : (index + 1) * self.batch_size] + image_keys_temp = [self.image_keys[k] for k in indexes] + (images, keypoints) = self.__data_generation(image_keys_temp) + + return (images, keypoints) + + def __data_generation(self, image_keys_temp): + batch_images = np.empty((self.batch_size, IMG_SIZE, IMG_SIZE, 3), dtype="int") + batch_keypoints = np.empty( + (self.batch_size, 1, 1, NUM_KEYPOINTS), dtype="float32" + ) + + for i, key in enumerate(image_keys_temp): + data = get_dog(key) + current_keypoint = np.array(data["joints"])[:, :2] + kps = [] + + # To apply our data augmentation pipeline, we first need to + # form Keypoint objects with the original coordinates. + for j in range(0, len(current_keypoint)): + kps.append(Keypoint(x=current_keypoint[j][0], y=current_keypoint[j][1])) + + # We then project the original image and its keypoint coordinates. + current_image = data["img_data"] + kps_obj = KeypointsOnImage(kps, shape=current_image.shape) + + # Apply the augmentation pipeline. + (new_image, new_kps_obj) = self.aug(image=current_image, keypoints=kps_obj) + batch_images[i,] = new_image + + # Parse the coordinates from the new keypoint object. + kp_temp = [] + for keypoint in new_kps_obj: + kp_temp.append(np.nan_to_num(keypoint.x)) + kp_temp.append(np.nan_to_num(keypoint.y)) + + # More on why this reshaping later. + batch_keypoints[i,] = np.array(kp_temp).reshape(1, 1, 24 * 2) + + # Scale the coordinates to [0, 1] range. + batch_keypoints = batch_keypoints / IMG_SIZE + + return (batch_images, batch_keypoints) + + +""" +To know more about how to operate with keypoints in `imgaug` check out +[this document](https://imgaug.readthedocs.io/en/latest/source/examples_keypoints.html). +""" + +""" +## Define augmentation transforms +""" + +train_aug = iaa.Sequential( + [ + iaa.Resize(IMG_SIZE, interpolation="linear"), + iaa.Fliplr(0.3), + # `Sometimes()` applies a function randomly to the inputs with + # a given probability (0.3, in this case). 
+ iaa.Sometimes(0.3, iaa.Affine(rotate=10, scale=(0.5, 0.7))), + ] +) + +test_aug = iaa.Sequential([iaa.Resize(IMG_SIZE, interpolation="linear")]) + +""" +## Create training and validation splits +""" + +np.random.shuffle(samples) +train_keys, validation_keys = ( + samples[int(len(samples) * 0.15) :], + samples[: int(len(samples) * 0.15)], +) + + +""" +## Data generator investigation +""" + +train_dataset = KeyPointsDataset( + train_keys, train_aug, workers=2, use_multiprocessing=True +) +validation_dataset = KeyPointsDataset( + validation_keys, test_aug, train=False, workers=2, use_multiprocessing=True +) + +print(f"Total batches in training set: {len(train_dataset)}") +print(f"Total batches in validation set: {len(validation_dataset)}") + +sample_images, sample_keypoints = next(iter(train_dataset)) +assert sample_keypoints.max() == 1.0 +assert sample_keypoints.min() == 0.0 + +sample_keypoints = sample_keypoints[:4].reshape(-1, 24, 2) * IMG_SIZE +visualize_keypoints(sample_images[:4], sample_keypoints) + +""" +## Model building + +The [Stanford dogs dataset](http://vision.stanford.edu/aditya86/ImageNetDogs/) (on which +the StanfordExtra dataset is based) was built using the [ImageNet-1k dataset](http://image-net.org/). +So, it is likely that the models pretrained on the ImageNet-1k dataset would be useful +for this task. We will use a MobileNetV2 pre-trained on this dataset as a backbone to +extract meaningful features from the images and then pass those to a custom regression +head for predicting coordinates. +""" + + +def get_model(): + # Load the pre-trained weights of MobileNetV2 and freeze the weights + backbone = keras.applications.MobileNetV2( + weights="imagenet", + include_top=False, + input_shape=(IMG_SIZE, IMG_SIZE, 3), + ) + backbone.trainable = False + + inputs = layers.Input((IMG_SIZE, IMG_SIZE, 3)) + x = keras.applications.mobilenet_v2.preprocess_input(inputs) + x = backbone(x) + x = layers.Dropout(0.3)(x) + x = layers.SeparableConv2D( + NUM_KEYPOINTS, kernel_size=5, strides=1, activation="relu" + )(x) + outputs = layers.SeparableConv2D( + NUM_KEYPOINTS, kernel_size=3, strides=1, activation="sigmoid" + )(x) + + return keras.Model(inputs, outputs, name="keypoint_detector") + + +""" +Our custom network is fully-convolutional which makes it more parameter-friendly than the +same version of the network having fully-connected dense layers. +""" + +get_model().summary() + +""" +Notice the output shape of the network: `(None, 1, 1, 48)`. This is why we have reshaped +the coordinates as: `batch_keypoints[i, :] = np.array(kp_temp).reshape(1, 1, 24 * 2)`. +""" + +""" +## Model compilation and training + +For this example, we will train the network only for five epochs. +""" + +model = get_model() +model.compile(loss="mse", optimizer=keras.optimizers.Adam(1e-4)) +model.fit(train_dataset, validation_data=validation_dataset, epochs=EPOCHS) + +""" +## Make predictions and visualize them +""" + +sample_val_images, sample_val_keypoints = next(iter(validation_dataset)) +sample_val_images = sample_val_images[:4] +sample_val_keypoints = sample_val_keypoints[:4].reshape(-1, 24, 2) * IMG_SIZE +predictions = model.predict(sample_val_images).reshape(-1, 24, 2) * IMG_SIZE + +# Ground-truth +visualize_keypoints(sample_val_images, sample_val_keypoints) + +# Predictions +visualize_keypoints(sample_val_images, predictions) + +""" +Predictions will likely improve with more training. 
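+
+If you want to try the detector on a single image of your own, a minimal sketch could
+look like the following (the file name `my_dog.jpg` is only a placeholder):
+
+```
+from PIL import Image
+import numpy as np
+from matplotlib import pyplot as plt
+
+img = np.array(Image.open("my_dog.jpg").convert("RGB"))
+resized = test_aug(image=img)  # resize to (IMG_SIZE, IMG_SIZE)
+# The model preprocesses pixel values internally, so raw pixels are passed here.
+pred = model.predict(resized[None, ...]).reshape(-1, 24, 2) * IMG_SIZE
+
+plt.imshow(resized)
+plt.scatter(pred[0][:, 0], pred[0][:, 1], marker="x", s=50)
+plt.show()
+```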
+""" + +""" +## Going further + +* Try using other augmentation transforms from `imgaug` to investigate how that changes +the results. +* Here, we transferred the features from the pre-trained network linearly that is we did +not [fine-tune](https://keras.io/guides/transfer_learning/) it. You are encouraged to fine-tune it on this task and see if that +improves the performance. You can also try different architectures and see how they +affect the final performance. +""" diff --git a/knowledge_base/vision/knowledge_distillation.py b/knowledge_base/vision/knowledge_distillation.py new file mode 100644 index 0000000000000000000000000000000000000000..4af348658131d42db6a79d81868784662dbb28d1 --- /dev/null +++ b/knowledge_base/vision/knowledge_distillation.py @@ -0,0 +1,242 @@ +""" +Title: Knowledge Distillation +Author: [Kenneth Borup](https://twitter.com/Kennethborup) +Date created: 2020/09/01 +Last modified: 2020/09/01 +Description: Implementation of classical Knowledge Distillation. +Accelerator: GPU +Converted to Keras 3 by: [Md Awsafur Rahman](https://awsaf49.github.io) +""" + +""" +## Introduction to Knowledge Distillation + +Knowledge Distillation is a procedure for model +compression, in which a small (student) model is trained to match a large pre-trained +(teacher) model. Knowledge is transferred from the teacher model to the student +by minimizing a loss function, aimed at matching softened teacher logits as well as +ground-truth labels. + +The logits are softened by applying a "temperature" scaling function in the softmax, +effectively smoothing out the probability distribution and revealing +inter-class relationships learned by the teacher. + +**Reference:** + +- [Hinton et al. (2015)](https://arxiv.org/abs/1503.02531) +""" + +""" +## Setup +""" + +import os + +import keras +from keras import layers +from keras import ops +import numpy as np + +""" +## Construct `Distiller()` class + +The custom `Distiller()` class, overrides the `Model` methods `compile`, `compute_loss`, +and `call`. In order to use the distiller, we need: + +- A trained teacher model +- A student model to train +- A student loss function on the difference between student predictions and ground-truth +- A distillation loss function, along with a `temperature`, on the difference between the +soft student predictions and the soft teacher labels +- An `alpha` factor to weight the student and distillation loss +- An optimizer for the student and (optional) metrics to evaluate performance + +In the `compute_loss` method, we perform a forward pass of both the teacher and student, +calculate the loss with weighting of the `student_loss` and `distillation_loss` by `alpha` +and `1 - alpha`, respectively. Note: only the student weights are updated. +""" + + +class Distiller(keras.Model): + def __init__(self, student, teacher): + super().__init__() + self.teacher = teacher + self.student = student + + def compile( + self, + optimizer, + metrics, + student_loss_fn, + distillation_loss_fn, + alpha=0.1, + temperature=3, + ): + """Configure the distiller. + + Args: + optimizer: Keras optimizer for the student weights + metrics: Keras metrics for evaluation + student_loss_fn: Loss function of difference between student + predictions and ground-truth + distillation_loss_fn: Loss function of difference between soft + student predictions and soft teacher predictions + alpha: weight to student_loss_fn and 1-alpha to distillation_loss_fn + temperature: Temperature for softening probability distributions. 
+ Larger temperature gives softer distributions. + """ + super().compile(optimizer=optimizer, metrics=metrics) + self.student_loss_fn = student_loss_fn + self.distillation_loss_fn = distillation_loss_fn + self.alpha = alpha + self.temperature = temperature + + def compute_loss( + self, x=None, y=None, y_pred=None, sample_weight=None, allow_empty=False + ): + teacher_pred = self.teacher(x, training=False) + student_loss = self.student_loss_fn(y, y_pred) + + distillation_loss = self.distillation_loss_fn( + ops.softmax(teacher_pred / self.temperature, axis=1), + ops.softmax(y_pred / self.temperature, axis=1), + ) * (self.temperature**2) + + loss = self.alpha * student_loss + (1 - self.alpha) * distillation_loss + return loss + + def call(self, x): + return self.student(x) + + +""" +## Create student and teacher models + +Initialy, we create a teacher model and a smaller student model. Both models are +convolutional neural networks and created using `Sequential()`, +but could be any Keras model. +""" + +# Create the teacher +teacher = keras.Sequential( + [ + keras.Input(shape=(28, 28, 1)), + layers.Conv2D(256, (3, 3), strides=(2, 2), padding="same"), + layers.LeakyReLU(negative_slope=0.2), + layers.MaxPooling2D(pool_size=(2, 2), strides=(1, 1), padding="same"), + layers.Conv2D(512, (3, 3), strides=(2, 2), padding="same"), + layers.Flatten(), + layers.Dense(10), + ], + name="teacher", +) + +# Create the student +student = keras.Sequential( + [ + keras.Input(shape=(28, 28, 1)), + layers.Conv2D(16, (3, 3), strides=(2, 2), padding="same"), + layers.LeakyReLU(negative_slope=0.2), + layers.MaxPooling2D(pool_size=(2, 2), strides=(1, 1), padding="same"), + layers.Conv2D(32, (3, 3), strides=(2, 2), padding="same"), + layers.Flatten(), + layers.Dense(10), + ], + name="student", +) + +# Clone student for later comparison +student_scratch = keras.models.clone_model(student) + +""" +## Prepare the dataset + +The dataset used for training the teacher and distilling the teacher is +[MNIST](https://keras.io/api/datasets/mnist/), and the procedure would be equivalent for +any other +dataset, e.g. [CIFAR-10](https://keras.io/api/datasets/cifar10/), with a suitable choice +of models. Both the student and teacher are trained on the training set and evaluated on +the test set. +""" + +# Prepare the train and test dataset. +batch_size = 64 +(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data() + +# Normalize data +x_train = x_train.astype("float32") / 255.0 +x_train = np.reshape(x_train, (-1, 28, 28, 1)) + +x_test = x_test.astype("float32") / 255.0 +x_test = np.reshape(x_test, (-1, 28, 28, 1)) + + +""" +## Train the teacher + +In knowledge distillation we assume that the teacher is trained and fixed. Thus, we start +by training the teacher model on the training set in the usual way. +""" + +# Train teacher as usual +teacher.compile( + optimizer=keras.optimizers.Adam(), + loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), + metrics=[keras.metrics.SparseCategoricalAccuracy()], +) + +# Train and evaluate teacher on data. +teacher.fit(x_train, y_train, epochs=5) +teacher.evaluate(x_test, y_test) + +""" +## Distill teacher to student + +We have already trained the teacher model, and we only need to initialize a +`Distiller(student, teacher)` instance, `compile()` it with the desired losses, +hyperparameters and optimizer, and distill the teacher to the student. 
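+
+For reference, the objective minimized for the student (mirroring `compute_loss`
+above, with `softmax` taken along the class axis) is:
+
+```
+loss = (
+    alpha * student_loss_fn(y, student_logits)
+    + (1 - alpha)
+    * temperature**2
+    * distillation_loss_fn(
+        softmax(teacher_logits / temperature),
+        softmax(student_logits / temperature),
+    )
+)
+```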
+""" + +# Initialize and compile distiller +distiller = Distiller(student=student, teacher=teacher) +distiller.compile( + optimizer=keras.optimizers.Adam(), + metrics=[keras.metrics.SparseCategoricalAccuracy()], + student_loss_fn=keras.losses.SparseCategoricalCrossentropy(from_logits=True), + distillation_loss_fn=keras.losses.KLDivergence(), + alpha=0.1, + temperature=10, +) + +# Distill teacher to student +distiller.fit(x_train, y_train, epochs=3) + +# Evaluate student on test dataset +distiller.evaluate(x_test, y_test) + +""" +## Train student from scratch for comparison + +We can also train an equivalent student model from scratch without the teacher, in order +to evaluate the performance gain obtained by knowledge distillation. +""" + +# Train student as doen usually +student_scratch.compile( + optimizer=keras.optimizers.Adam(), + loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), + metrics=[keras.metrics.SparseCategoricalAccuracy()], +) + +# Train and evaluate student trained from scratch. +student_scratch.fit(x_train, y_train, epochs=3) +student_scratch.evaluate(x_test, y_test) + +""" +If the teacher is trained for 5 full epochs and the student is distilled on this teacher +for 3 full epochs, you should in this example experience a performance boost compared to +training the same student model from scratch, and even compared to the teacher itself. +You should expect the teacher to have accuracy around 97.6%, the student trained from +scratch should be around 97.6%, and the distilled student should be around 98.1%. Remove +or try out different seeds to use different weight initializations. +""" diff --git a/knowledge_base/vision/learnable_resizer.py b/knowledge_base/vision/learnable_resizer.py new file mode 100644 index 0000000000000000000000000000000000000000..f13ff73eb5743e20e16243b53190e9a4c1030206 --- /dev/null +++ b/knowledge_base/vision/learnable_resizer.py @@ -0,0 +1,321 @@ +""" +Title: Learning to Resize in Computer Vision +Author: [Sayak Paul](https://twitter.com/RisingSayak) +Date created: 2021/04/30 +Last modified: 2023/12/18 +Description: How to optimally learn representations of images for a given resolution. +Accelerator: GPU +""" + +""" +It is a common belief that if we constrain vision models to perceive things as humans do, +their performance can be improved. For example, in [this work](https://arxiv.org/abs/1811.12231), +Geirhos et al. showed that the vision models pre-trained on the ImageNet-1k dataset are +biased towards texture, whereas human beings mostly use the shape descriptor to develop a +common perception. But does this belief always apply, especially when it comes to improving +the performance of vision models? + +It turns out it may not always be the case. When training vision models, it is common to +resize images to a lower dimension ((224 x 224), (299 x 299), etc.) to allow mini-batch +learning and also to keep up the compute limitations. We generally make use of image +resizing methods like **bilinear interpolation** for this step and the resized images do +not lose much of their perceptual character to the human eyes. In +[Learning to Resize Images for Computer Vision Tasks](https://arxiv.org/abs/2103.09950v1), Talebi et al. show +that if we try to optimize the perceptual quality of the images for the vision models +rather than the human eyes, their performance can further be improved. 
They investigate +the following question: + +**For a given image resolution and a model, how to best resize the given images?** + +As shown in the paper, this idea helps to consistently improve the performance of the +common vision models (pre-trained on ImageNet-1k) like DenseNet-121, ResNet-50, +MobileNetV2, and EfficientNets. In this example, we will implement the learnable image +resizing module as proposed in the paper and demonstrate that on the +[Cats and Dogs dataset](https://www.microsoft.com/en-us/download/details.aspx?id=54765) +using the [DenseNet-121](https://arxiv.org/abs/1608.06993) architecture. +""" + +""" +## Setup +""" + +import os + +os.environ["KERAS_BACKEND"] = "tensorflow" +import keras +from keras import ops +from keras import layers +import tensorflow as tf + +import tensorflow_datasets as tfds + +tfds.disable_progress_bar() + +import matplotlib.pyplot as plt +import numpy as np + +""" +## Define hyperparameters +""" + +""" +In order to facilitate mini-batch learning, we need to have a fixed shape for the images +inside a given batch. This is why an initial resizing is required. We first resize all +the images to (300 x 300) shape and then learn their optimal representation for the +(150 x 150) resolution. +""" + +INP_SIZE = (300, 300) +TARGET_SIZE = (150, 150) +INTERPOLATION = "bilinear" + +AUTO = tf.data.AUTOTUNE +BATCH_SIZE = 64 +EPOCHS = 5 + +""" +In this example, we will use the bilinear interpolation but the learnable image resizer +module is not dependent on any specific interpolation method. We can also use others, +such as bicubic. +""" + +""" +## Load and prepare the dataset + +For this example, we will only use 40% of the total training dataset. +""" + +train_ds, validation_ds = tfds.load( + "cats_vs_dogs", + # Reserve 10% for validation + split=["train[:40%]", "train[40%:50%]"], + as_supervised=True, +) + + +def preprocess_dataset(image, label): + image = ops.image.resize(image, (INP_SIZE[0], INP_SIZE[1])) + label = ops.one_hot(label, num_classes=2) + return (image, label) + + +train_ds = ( + train_ds.shuffle(BATCH_SIZE * 100) + .map(preprocess_dataset, num_parallel_calls=AUTO) + .batch(BATCH_SIZE) + .prefetch(AUTO) +) +validation_ds = ( + validation_ds.map(preprocess_dataset, num_parallel_calls=AUTO) + .batch(BATCH_SIZE) + .prefetch(AUTO) +) + +""" +## Define the learnable resizer utilities + +The figure below (courtesy: [Learning to Resize Images for Computer Vision Tasks](https://arxiv.org/abs/2103.09950v1)) +presents the structure of the learnable resizing module: + +![](https://i.ibb.co/gJYtSs0/image.png) +""" + + +def conv_block(x, filters, kernel_size, strides, activation=layers.LeakyReLU(0.2)): + x = layers.Conv2D(filters, kernel_size, strides, padding="same", use_bias=False)(x) + x = layers.BatchNormalization()(x) + if activation: + x = activation(x) + return x + + +def res_block(x): + inputs = x + x = conv_block(x, 16, 3, 1) + x = conv_block(x, 16, 3, 1, activation=None) + return layers.Add()([inputs, x]) + + # Note: user can change num_res_blocks to >1 also if needed + + +def get_learnable_resizer(filters=16, num_res_blocks=1, interpolation=INTERPOLATION): + inputs = layers.Input(shape=[None, None, 3]) + + # First, perform naive resizing. + naive_resize = layers.Resizing(*TARGET_SIZE, interpolation=interpolation)(inputs) + + # First convolution block without batch normalization. 
+ x = layers.Conv2D(filters=filters, kernel_size=7, strides=1, padding="same")(inputs) + x = layers.LeakyReLU(0.2)(x) + + # Second convolution block with batch normalization. + x = layers.Conv2D(filters=filters, kernel_size=1, strides=1, padding="same")(x) + x = layers.LeakyReLU(0.2)(x) + x = layers.BatchNormalization()(x) + + # Intermediate resizing as a bottleneck. + bottleneck = layers.Resizing(*TARGET_SIZE, interpolation=interpolation)(x) + + # Residual passes. + # First res_block will get bottleneck output as input + x = res_block(bottleneck) + # Remaining res_blocks will get previous res_block output as input + for _ in range(num_res_blocks - 1): + x = res_block(x) + + # Projection. + x = layers.Conv2D( + filters=filters, kernel_size=3, strides=1, padding="same", use_bias=False + )(x) + x = layers.BatchNormalization()(x) + + # Skip connection. + x = layers.Add()([bottleneck, x]) + + # Final resized image. + x = layers.Conv2D(filters=3, kernel_size=7, strides=1, padding="same")(x) + final_resize = layers.Add()([naive_resize, x]) + + return keras.Model(inputs, final_resize, name="learnable_resizer") + + +learnable_resizer = get_learnable_resizer() + +""" +## Visualize the outputs of the learnable resizing module + +Here, we visualize how the resized images would look like after being passed through the +random weights of the resizer. +""" + +sample_images, _ = next(iter(train_ds)) + + +plt.figure(figsize=(16, 10)) +for i, image in enumerate(sample_images[:6]): + image = image / 255 + + ax = plt.subplot(3, 4, 2 * i + 1) + plt.title("Input Image") + plt.imshow(image.numpy().squeeze()) + plt.axis("off") + + ax = plt.subplot(3, 4, 2 * i + 2) + resized_image = learnable_resizer(image[None, ...]) + plt.title("Resized Image") + plt.imshow(resized_image.numpy().squeeze()) + plt.axis("off") + +""" +## Model building utility +""" + + +def get_model(): + backbone = keras.applications.DenseNet121( + weights=None, + include_top=True, + classes=2, + input_shape=((TARGET_SIZE[0], TARGET_SIZE[1], 3)), + ) + backbone.trainable = True + + inputs = layers.Input((INP_SIZE[0], INP_SIZE[1], 3)) + x = layers.Rescaling(scale=1.0 / 255)(inputs) + x = learnable_resizer(x) + outputs = backbone(x) + + return keras.Model(inputs, outputs) + + +""" +The structure of the learnable image resizer module allows for flexible integrations with +different vision models. +""" + +""" +## Compile and train our model with learnable resizer +""" + +model = get_model() +model.compile( + loss=keras.losses.CategoricalCrossentropy(label_smoothing=0.1), + optimizer="sgd", + metrics=["accuracy"], +) +model.fit(train_ds, validation_data=validation_ds, epochs=EPOCHS) + +""" +## Visualize the outputs of the trained visualizer +""" + +plt.figure(figsize=(16, 10)) +for i, image in enumerate(sample_images[:6]): + image = image / 255 + + ax = plt.subplot(3, 4, 2 * i + 1) + plt.title("Input Image") + plt.imshow(image.numpy().squeeze()) + plt.axis("off") + + ax = plt.subplot(3, 4, 2 * i + 2) + resized_image = learnable_resizer(image[None, ...]) + plt.title("Resized Image") + plt.imshow(resized_image.numpy().squeeze() / 10) + plt.axis("off") + +""" +The plot shows that the visuals of the images have improved with training. 
The following +table shows the benefits of using the resizing module in comparison to using the bilinear +interpolation: + +| Model | Number of parameters (Million) | Top-1 accuracy | +|:-------------------------: |:-------------------------------: |:--------------: | +| With the learnable resizer | 7.051717 | 67.67% | +| Without the learnable resizer | 7.039554 | 60.19% | + +For more details, you can check out [this repository](https://github.com/sayakpaul/Learnable-Image-Resizing). +Note the above-reported models were trained for 10 epochs on 90% of the training set of +Cats and Dogs unlike this example. Also, note that the increase in the number of +parameters due to the resizing module is very negligible. To ensure that the improvement +in the performance is not due to stochasticity, the models were trained using the same +initial random weights. + +Now, a question worth asking here is - _isn't the improved accuracy simply a consequence +of adding more layers (the resizer is a mini network after all) to the model, compared to +the baseline?_ + +To show that it is not the case, the authors conduct the following experiment: + +* Take a pre-trained model trained some size, say (224 x 224). + +* Now, first, use it to infer predictions on images resized to a lower resolution. Record +the performance. + +* For the second experiment, plug in the resizer module at the top of the pre-trained +model and warm-start the training. Record the performance. + +Now, the authors argue that using the second option is better because it helps the model +learn how to adjust the representations better with respect to the given resolution. +Since the results purely are empirical, a few more experiments such as analyzing the +cross-channel interaction would have been even better. It is worth noting that elements +like [Squeeze and Excitation (SE) blocks](https://arxiv.org/abs/1709.01507), [Global Context (GC) blocks](https://arxiv.org/abs/1904.11492) also add a few +parameters to an existing network but they are known to help a network process +information in systematic ways to improve the overall performance. +""" + +""" +## Notes + +* To impose shape bias inside the vision models, Geirhos et al. trained them with a +combination of natural and stylized images. It might be interesting to investigate if +this learnable resizing module could achieve something similar as the outputs seem to +discard the texture information. + +* The resizer module can handle arbitrary resolutions and aspect ratios which is very +important for tasks like object detection and segmentation. + +* There is another closely related topic on ***adaptive image resizing*** that attempts +to resize images/feature maps adaptively during training. [EfficientV2](https://arxiv.org/abs/2104.00298) +uses this idea. +""" diff --git a/knowledge_base/vision/masked_image_modeling.py b/knowledge_base/vision/masked_image_modeling.py new file mode 100644 index 0000000000000000000000000000000000000000..fd471a3a0369c5631a54412b508cd279562b5d02 --- /dev/null +++ b/knowledge_base/vision/masked_image_modeling.py @@ -0,0 +1,942 @@ +""" +Title: Masked image modeling with Autoencoders +Author: [Aritra Roy Gosthipaty](https://twitter.com/arig23498), [Sayak Paul](https://twitter.com/RisingSayak) +Date created: 2021/12/20 +Last modified: 2021/12/21 +Description: Implementing Masked Autoencoders for self-supervised pretraining. 
+Accelerator: GPU +""" + +""" +## Introduction + +In deep learning, models with growing **capacity** and **capability** can easily overfit +on large datasets (ImageNet-1K). In the field of natural language processing, the +appetite for data has been **successfully addressed** by self-supervised pretraining. + +In the academic paper +[Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) +by He et. al. the authors propose a simple yet effective method to pretrain large +vision models (here [ViT Huge](https://arxiv.org/abs/2010.11929)). Inspired from +the pretraining algorithm of BERT ([Devlin et al.](https://arxiv.org/abs/1810.04805)), +they mask patches of an image and, through an autoencoder predict the masked patches. +In the spirit of "masked language modeling", this pretraining task could be referred +to as "masked image modeling". + +In this example, we implement +[Masked Autoencoders Are Scalable Vision Learners](https://arxiv.org/abs/2111.06377) +with the [CIFAR-10](https://www.cs.toronto.edu/~kriz/cifar.html) dataset. After +pretraining a scaled down version of ViT, we also implement the linear evaluation +pipeline on CIFAR-10. + + +This implementation covers (MAE refers to Masked Autoencoder): + +- The masking algorithm +- MAE encoder +- MAE decoder +- Evaluation with linear probing + +As a reference, we reuse some of the code presented in +[this example](https://keras.io/examples/vision/image_classification_with_vision_transformer/). + +""" + +""" +## Imports +""" +import os + +os.environ["KERAS_BACKEND"] = "tensorflow" + +import tensorflow as tf +import keras +from keras import layers + +import matplotlib.pyplot as plt +import numpy as np +import random + +# Setting seeds for reproducibility. +SEED = 42 +keras.utils.set_random_seed(SEED) + +""" +## Hyperparameters for pretraining + +Please feel free to change the hyperparameters and check your results. The best way to +get an intuition about the architecture is to experiment with it. Our hyperparameters are +heavily inspired by the design guidelines laid out by the authors in +[the original paper](https://arxiv.org/abs/2111.06377). +""" + +# DATA +BUFFER_SIZE = 1024 +BATCH_SIZE = 256 +AUTO = tf.data.AUTOTUNE +INPUT_SHAPE = (32, 32, 3) +NUM_CLASSES = 10 + +# OPTIMIZER +LEARNING_RATE = 5e-3 +WEIGHT_DECAY = 1e-4 + +# PRETRAINING +EPOCHS = 100 + +# AUGMENTATION +IMAGE_SIZE = 48 # We will resize input images to this size. +PATCH_SIZE = 6 # Size of the patches to be extracted from the input images. +NUM_PATCHES = (IMAGE_SIZE // PATCH_SIZE) ** 2 +MASK_PROPORTION = 0.75 # We have found 75% masking to give us the best results. + +# ENCODER and DECODER +LAYER_NORM_EPS = 1e-6 +ENC_PROJECTION_DIM = 128 +DEC_PROJECTION_DIM = 64 +ENC_NUM_HEADS = 4 +ENC_LAYERS = 6 +DEC_NUM_HEADS = 4 +DEC_LAYERS = ( + 2 # The decoder is lightweight but should be reasonably deep for reconstruction. +) +ENC_TRANSFORMER_UNITS = [ + ENC_PROJECTION_DIM * 2, + ENC_PROJECTION_DIM, +] # Size of the transformer layers. 
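+# The decoder MLP uses the same 2x-expansion pattern at its smaller
+# projection size.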
+DEC_TRANSFORMER_UNITS = [ + DEC_PROJECTION_DIM * 2, + DEC_PROJECTION_DIM, +] + +""" +## Load and prepare the CIFAR-10 dataset +""" + +(x_train, y_train), (x_test, y_test) = keras.datasets.cifar10.load_data() +(x_train, y_train), (x_val, y_val) = ( + (x_train[:40000], y_train[:40000]), + (x_train[40000:], y_train[40000:]), +) +print(f"Training samples: {len(x_train)}") +print(f"Validation samples: {len(x_val)}") +print(f"Testing samples: {len(x_test)}") + +train_ds = tf.data.Dataset.from_tensor_slices(x_train) +train_ds = train_ds.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(AUTO) + +val_ds = tf.data.Dataset.from_tensor_slices(x_val) +val_ds = val_ds.batch(BATCH_SIZE).prefetch(AUTO) + +test_ds = tf.data.Dataset.from_tensor_slices(x_test) +test_ds = test_ds.batch(BATCH_SIZE).prefetch(AUTO) + +""" +## Data augmentation + +In previous self-supervised pretraining methodologies +([SimCLR](https://arxiv.org/abs/2002.05709) alike), we have noticed that the data +augmentation pipeline plays an important role. On the other hand the authors of this +paper point out that Masked Autoencoders **do not** rely on augmentations. They propose a +simple augmentation pipeline of: + + +- Resizing +- Random cropping (fixed-sized or random sized) +- Random horizontal flipping +""" + + +def get_train_augmentation_model(): + model = keras.Sequential( + [ + layers.Rescaling(1 / 255.0), + layers.Resizing(INPUT_SHAPE[0] + 20, INPUT_SHAPE[0] + 20), + layers.RandomCrop(IMAGE_SIZE, IMAGE_SIZE), + layers.RandomFlip("horizontal"), + ], + name="train_data_augmentation", + ) + return model + + +def get_test_augmentation_model(): + model = keras.Sequential( + [ + layers.Rescaling(1 / 255.0), + layers.Resizing(IMAGE_SIZE, IMAGE_SIZE), + ], + name="test_data_augmentation", + ) + return model + + +""" +## A layer for extracting patches from images + +This layer takes images as input and divides them into patches. The layer also includes +two utility method: + +- `show_patched_image` -- Takes a batch of images and its corresponding patches to plot a +random pair of image and patches. +- `reconstruct_from_patch` -- Takes a single instance of patches and stitches them +together into the original image. +""" + + +class Patches(layers.Layer): + def __init__(self, patch_size=PATCH_SIZE, **kwargs): + super().__init__(**kwargs) + self.patch_size = patch_size + + # Assuming the image has three channels each patch would be + # of size (patch_size, patch_size, 3). + self.resize = layers.Reshape((-1, patch_size * patch_size * 3)) + + def call(self, images): + # Create patches from the input images + patches = tf.image.extract_patches( + images=images, + sizes=[1, self.patch_size, self.patch_size, 1], + strides=[1, self.patch_size, self.patch_size, 1], + rates=[1, 1, 1, 1], + padding="VALID", + ) + + # Reshape the patches to (batch, num_patches, patch_area) and return it. + patches = self.resize(patches) + return patches + + def show_patched_image(self, images, patches): + # This is a utility function which accepts a batch of images and its + # corresponding patches and help visualize one image and its patches + # side by side. 
+ idx = np.random.choice(patches.shape[0]) + print(f"Index selected: {idx}.") + + plt.figure(figsize=(4, 4)) + plt.imshow(keras.utils.array_to_img(images[idx])) + plt.axis("off") + plt.show() + + n = int(np.sqrt(patches.shape[1])) + plt.figure(figsize=(4, 4)) + for i, patch in enumerate(patches[idx]): + ax = plt.subplot(n, n, i + 1) + patch_img = tf.reshape(patch, (self.patch_size, self.patch_size, 3)) + plt.imshow(keras.utils.img_to_array(patch_img)) + plt.axis("off") + plt.show() + + # Return the index chosen to validate it outside the method. + return idx + + # taken from https://stackoverflow.com/a/58082878/10319735 + def reconstruct_from_patch(self, patch): + # This utility function takes patches from a *single* image and + # reconstructs it back into the image. This is useful for the train + # monitor callback. + num_patches = patch.shape[0] + n = int(np.sqrt(num_patches)) + patch = tf.reshape(patch, (num_patches, self.patch_size, self.patch_size, 3)) + rows = tf.split(patch, n, axis=0) + rows = [tf.concat(tf.unstack(x), axis=1) for x in rows] + reconstructed = tf.concat(rows, axis=0) + return reconstructed + + +""" +Let's visualize the image patches. +""" + +# Get a batch of images. +image_batch = next(iter(train_ds)) + +# Augment the images. +augmentation_model = get_train_augmentation_model() +augmented_images = augmentation_model(image_batch) + +# Define the patch layer. +patch_layer = Patches() + +# Get the patches from the batched images. +patches = patch_layer(images=augmented_images) + +# Now pass the images and the corresponding patches +# to the `show_patched_image` method. +random_index = patch_layer.show_patched_image(images=augmented_images, patches=patches) + +# Chose the same chose image and try reconstructing the patches +# into the original image. +image = patch_layer.reconstruct_from_patch(patches[random_index]) +plt.imshow(image) +plt.axis("off") +plt.show() + +""" +## Patch encoding with masking + +Quoting the paper + +> Following ViT, we divide an image into regular non-overlapping patches. Then we sample +a subset of patches and mask (i.e., remove) the remaining ones. Our sampling strategy is +straightforward: we sample random patches without replacement, following a uniform +distribution. We simply refer to this as โ€œrandom samplingโ€. + +This layer includes masking and encoding the patches. + +The utility methods of the layer are: + +- `get_random_indices` -- Provides the mask and unmask indices. +- `generate_masked_image` -- Takes patches and unmask indices, results in a random masked +image. This is an essential utility method for our training monitor callback (defined +later). +""" + + +class PatchEncoder(layers.Layer): + def __init__( + self, + patch_size=PATCH_SIZE, + projection_dim=ENC_PROJECTION_DIM, + mask_proportion=MASK_PROPORTION, + downstream=False, + **kwargs, + ): + super().__init__(**kwargs) + self.patch_size = patch_size + self.projection_dim = projection_dim + self.mask_proportion = mask_proportion + self.downstream = downstream + + # This is a trainable mask token initialized randomly from a normal + # distribution. + self.mask_token = tf.Variable( + tf.random.normal([1, patch_size * patch_size * 3]), trainable=True + ) + + def build(self, input_shape): + (_, self.num_patches, self.patch_area) = input_shape + + # Create the projection layer for the patches. + self.projection = layers.Dense(units=self.projection_dim) + + # Create the positional embedding layer. 
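+        # One learnable vector per patch position; in `call()` these are
+        # added to the projected patches and gathered separately for the
+        # masked and unmasked indices, so the decoder knows where each
+        # reconstructed patch belongs.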
+ self.position_embedding = layers.Embedding( + input_dim=self.num_patches, output_dim=self.projection_dim + ) + + # Number of patches that will be masked. + self.num_mask = int(self.mask_proportion * self.num_patches) + + def call(self, patches): + # Get the positional embeddings. + batch_size = tf.shape(patches)[0] + positions = tf.range(start=0, limit=self.num_patches, delta=1) + pos_embeddings = self.position_embedding(positions[tf.newaxis, ...]) + pos_embeddings = tf.tile( + pos_embeddings, [batch_size, 1, 1] + ) # (B, num_patches, projection_dim) + + # Embed the patches. + patch_embeddings = ( + self.projection(patches) + pos_embeddings + ) # (B, num_patches, projection_dim) + + if self.downstream: + return patch_embeddings + else: + mask_indices, unmask_indices = self.get_random_indices(batch_size) + # The encoder input is the unmasked patch embeddings. Here we gather + # all the patches that should be unmasked. + unmasked_embeddings = tf.gather( + patch_embeddings, unmask_indices, axis=1, batch_dims=1 + ) # (B, unmask_numbers, projection_dim) + + # Get the unmasked and masked position embeddings. We will need them + # for the decoder. + unmasked_positions = tf.gather( + pos_embeddings, unmask_indices, axis=1, batch_dims=1 + ) # (B, unmask_numbers, projection_dim) + masked_positions = tf.gather( + pos_embeddings, mask_indices, axis=1, batch_dims=1 + ) # (B, mask_numbers, projection_dim) + + # Repeat the mask token number of mask times. + # Mask tokens replace the masks of the image. + mask_tokens = tf.repeat(self.mask_token, repeats=self.num_mask, axis=0) + mask_tokens = tf.repeat( + mask_tokens[tf.newaxis, ...], repeats=batch_size, axis=0 + ) + + # Get the masked embeddings for the tokens. + masked_embeddings = self.projection(mask_tokens) + masked_positions + return ( + unmasked_embeddings, # Input to the encoder. + masked_embeddings, # First part of input to the decoder. + unmasked_positions, # Added to the encoder outputs. + mask_indices, # The indices that were masked. + unmask_indices, # The indices that were unmaksed. + ) + + def get_random_indices(self, batch_size): + # Create random indices from a uniform distribution and then split + # it into mask and unmask indices. + rand_indices = tf.argsort( + tf.random.uniform(shape=(batch_size, self.num_patches)), axis=-1 + ) + mask_indices = rand_indices[:, : self.num_mask] + unmask_indices = rand_indices[:, self.num_mask :] + return mask_indices, unmask_indices + + def generate_masked_image(self, patches, unmask_indices): + # Choose a random patch and it corresponding unmask index. + idx = np.random.choice(patches.shape[0]) + patch = patches[idx] + unmask_index = unmask_indices[idx] + + # Build a numpy array of same shape as patch. + new_patch = np.zeros_like(patch) + + # Iterate of the new_patch and plug the unmasked patches. + count = 0 + for i in range(unmask_index.shape[0]): + new_patch[unmask_index[i]] = patch[unmask_index[i]] + return new_patch, idx + + +""" +Let's see the masking process in action on a sample image. +""" + +# Create the patch encoder layer. +patch_encoder = PatchEncoder() + +# Get the embeddings and positions. +( + unmasked_embeddings, + masked_embeddings, + unmasked_positions, + mask_indices, + unmask_indices, +) = patch_encoder(patches=patches) + + +# Show a maksed patch image. 
+new_patch, random_index = patch_encoder.generate_masked_image(patches, unmask_indices) + +plt.figure(figsize=(10, 10)) +plt.subplot(1, 2, 1) +img = patch_layer.reconstruct_from_patch(new_patch) +plt.imshow(keras.utils.array_to_img(img)) +plt.axis("off") +plt.title("Masked") +plt.subplot(1, 2, 2) +img = augmented_images[random_index] +plt.imshow(keras.utils.array_to_img(img)) +plt.axis("off") +plt.title("Original") +plt.show() + +""" +## MLP + +This serves as the fully connected feed forward network of the transformer architecture. +""" + + +def mlp(x, dropout_rate, hidden_units): + for units in hidden_units: + x = layers.Dense(units, activation=tf.nn.gelu)(x) + x = layers.Dropout(dropout_rate)(x) + return x + + +""" +## MAE encoder + +The MAE encoder is ViT. The only point to note here is that the encoder outputs a layer +normalized output. +""" + + +def create_encoder(num_heads=ENC_NUM_HEADS, num_layers=ENC_LAYERS): + inputs = layers.Input((None, ENC_PROJECTION_DIM)) + x = inputs + + for _ in range(num_layers): + # Layer normalization 1. + x1 = layers.LayerNormalization(epsilon=LAYER_NORM_EPS)(x) + + # Create a multi-head attention layer. + attention_output = layers.MultiHeadAttention( + num_heads=num_heads, key_dim=ENC_PROJECTION_DIM, dropout=0.1 + )(x1, x1) + + # Skip connection 1. + x2 = layers.Add()([attention_output, x]) + + # Layer normalization 2. + x3 = layers.LayerNormalization(epsilon=LAYER_NORM_EPS)(x2) + + # MLP. + x3 = mlp(x3, hidden_units=ENC_TRANSFORMER_UNITS, dropout_rate=0.1) + + # Skip connection 2. + x = layers.Add()([x3, x2]) + + outputs = layers.LayerNormalization(epsilon=LAYER_NORM_EPS)(x) + return keras.Model(inputs, outputs, name="mae_encoder") + + +""" +## MAE decoder + +The authors point out that they use an **asymmetric** autoencoder model. They use a +lightweight decoder that takes "<10% computation per token vs. the encoder". We are not +specific with the "<10% computation" in our implementation but have used a smaller +decoder (both in terms of depth and projection dimensions). +""" + + +def create_decoder( + num_layers=DEC_LAYERS, num_heads=DEC_NUM_HEADS, image_size=IMAGE_SIZE +): + inputs = layers.Input((NUM_PATCHES, ENC_PROJECTION_DIM)) + x = layers.Dense(DEC_PROJECTION_DIM)(inputs) + + for _ in range(num_layers): + # Layer normalization 1. + x1 = layers.LayerNormalization(epsilon=LAYER_NORM_EPS)(x) + + # Create a multi-head attention layer. + attention_output = layers.MultiHeadAttention( + num_heads=num_heads, key_dim=DEC_PROJECTION_DIM, dropout=0.1 + )(x1, x1) + + # Skip connection 1. + x2 = layers.Add()([attention_output, x]) + + # Layer normalization 2. + x3 = layers.LayerNormalization(epsilon=LAYER_NORM_EPS)(x2) + + # MLP. + x3 = mlp(x3, hidden_units=DEC_TRANSFORMER_UNITS, dropout_rate=0.1) + + # Skip connection 2. + x = layers.Add()([x3, x2]) + + x = layers.LayerNormalization(epsilon=LAYER_NORM_EPS)(x) + x = layers.Flatten()(x) + pre_final = layers.Dense(units=image_size * image_size * 3, activation="sigmoid")(x) + outputs = layers.Reshape((image_size, image_size, 3))(pre_final) + + return keras.Model(inputs, outputs, name="mae_decoder") + + +""" +## MAE trainer + +This is the trainer module. We wrap the encoder and decoder inside of a `tf.keras.Model` +subclass. This allows us to customize what happens in the `model.fit()` loop. 
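+
+The key detail, implemented in `calculate_loss()` below, is that the
+reconstruction loss is computed on the masked patches only: both the original
+patches and the decoder's reconstructed patches are gathered at `mask_indices`
+with `tf.gather(..., batch_dims=1)` before being compared. As a tiny,
+self-contained illustration of that gathering step (the toy values below are
+made up purely for demonstration and are not part of the pipeline):
+
+```
+import tensorflow as tf
+
+# 1 image, 4 patches, 2 values per patch (toy numbers).
+toy_patches = tf.constant([[[0.0, 0.1], [1.0, 1.1], [2.0, 2.1], [3.0, 3.1]]])
+toy_mask_indices = tf.constant([[1, 3]])  # patches 1 and 3 were masked
+
+# Keep only the masked patches; the loss is computed on these.
+masked_targets = tf.gather(toy_patches, toy_mask_indices, axis=1, batch_dims=1)
+print(masked_targets.shape)  # (1, 2, 2)
+```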
+""" + + +class MaskedAutoencoder(keras.Model): + def __init__( + self, + train_augmentation_model, + test_augmentation_model, + patch_layer, + patch_encoder, + encoder, + decoder, + **kwargs, + ): + super().__init__(**kwargs) + self.train_augmentation_model = train_augmentation_model + self.test_augmentation_model = test_augmentation_model + self.patch_layer = patch_layer + self.patch_encoder = patch_encoder + self.encoder = encoder + self.decoder = decoder + + def calculate_loss(self, images, test=False): + # Augment the input images. + if test: + augmented_images = self.test_augmentation_model(images) + else: + augmented_images = self.train_augmentation_model(images) + + # Patch the augmented images. + patches = self.patch_layer(augmented_images) + + # Encode the patches. + ( + unmasked_embeddings, + masked_embeddings, + unmasked_positions, + mask_indices, + unmask_indices, + ) = self.patch_encoder(patches) + + # Pass the unmaksed patche to the encoder. + encoder_outputs = self.encoder(unmasked_embeddings) + + # Create the decoder inputs. + encoder_outputs = encoder_outputs + unmasked_positions + decoder_inputs = tf.concat([encoder_outputs, masked_embeddings], axis=1) + + # Decode the inputs. + decoder_outputs = self.decoder(decoder_inputs) + decoder_patches = self.patch_layer(decoder_outputs) + + loss_patch = tf.gather(patches, mask_indices, axis=1, batch_dims=1) + loss_output = tf.gather(decoder_patches, mask_indices, axis=1, batch_dims=1) + + # Compute the total loss. + total_loss = self.compute_loss(y=loss_patch, y_pred=loss_output) + + return total_loss, loss_patch, loss_output + + def train_step(self, images): + with tf.GradientTape() as tape: + total_loss, loss_patch, loss_output = self.calculate_loss(images) + + # Apply gradients. + train_vars = [ + self.train_augmentation_model.trainable_variables, + self.patch_layer.trainable_variables, + self.patch_encoder.trainable_variables, + self.encoder.trainable_variables, + self.decoder.trainable_variables, + ] + grads = tape.gradient(total_loss, train_vars) + tv_list = [] + for grad, var in zip(grads, train_vars): + for g, v in zip(grad, var): + tv_list.append((g, v)) + self.optimizer.apply_gradients(tv_list) + + # Report progress. + results = {} + for metric in self.metrics: + metric.update_state(loss_patch, loss_output) + results[metric.name] = metric.result() + return results + + def test_step(self, images): + total_loss, loss_patch, loss_output = self.calculate_loss(images, test=True) + + # Update the trackers. + results = {} + for metric in self.metrics: + metric.update_state(loss_patch, loss_output) + results[metric.name] = metric.result() + return results + + +""" +## Model initialization +""" + +train_augmentation_model = get_train_augmentation_model() +test_augmentation_model = get_test_augmentation_model() +patch_layer = Patches() +patch_encoder = PatchEncoder() +encoder = create_encoder() +decoder = create_decoder() + +mae_model = MaskedAutoencoder( + train_augmentation_model=train_augmentation_model, + test_augmentation_model=test_augmentation_model, + patch_layer=patch_layer, + patch_encoder=patch_encoder, + encoder=encoder, + decoder=decoder, +) + +""" +## Training callbacks +""" + +""" +### Visualization callback +""" + +# Taking a batch of test inputs to measure model's progress. 
+test_images = next(iter(test_ds)) + + +class TrainMonitor(keras.callbacks.Callback): + def __init__(self, epoch_interval=None): + self.epoch_interval = epoch_interval + + def on_epoch_end(self, epoch, logs=None): + if self.epoch_interval and epoch % self.epoch_interval == 0: + test_augmented_images = self.model.test_augmentation_model(test_images) + test_patches = self.model.patch_layer(test_augmented_images) + ( + test_unmasked_embeddings, + test_masked_embeddings, + test_unmasked_positions, + test_mask_indices, + test_unmask_indices, + ) = self.model.patch_encoder(test_patches) + test_encoder_outputs = self.model.encoder(test_unmasked_embeddings) + test_encoder_outputs = test_encoder_outputs + test_unmasked_positions + test_decoder_inputs = tf.concat( + [test_encoder_outputs, test_masked_embeddings], axis=1 + ) + test_decoder_outputs = self.model.decoder(test_decoder_inputs) + + # Show a maksed patch image. + test_masked_patch, idx = self.model.patch_encoder.generate_masked_image( + test_patches, test_unmask_indices + ) + print(f"\nIdx chosen: {idx}") + original_image = test_augmented_images[idx] + masked_image = self.model.patch_layer.reconstruct_from_patch( + test_masked_patch + ) + reconstructed_image = test_decoder_outputs[idx] + + fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(15, 5)) + ax[0].imshow(original_image) + ax[0].set_title(f"Original: {epoch:03d}") + + ax[1].imshow(masked_image) + ax[1].set_title(f"Masked: {epoch:03d}") + + ax[2].imshow(reconstructed_image) + ax[2].set_title(f"Resonstructed: {epoch:03d}") + + plt.show() + plt.close() + + +""" +### Learning rate scheduler +""" + +# Some code is taken from: +# https://www.kaggle.com/ashusma/training-rfcx-tensorflow-tpu-effnet-b2. + + +class WarmUpCosine(keras.optimizers.schedules.LearningRateSchedule): + def __init__( + self, learning_rate_base, total_steps, warmup_learning_rate, warmup_steps + ): + super().__init__() + + self.learning_rate_base = learning_rate_base + self.total_steps = total_steps + self.warmup_learning_rate = warmup_learning_rate + self.warmup_steps = warmup_steps + self.pi = tf.constant(np.pi) + + def __call__(self, step): + if self.total_steps < self.warmup_steps: + raise ValueError("Total_steps must be larger or equal to warmup_steps.") + + cos_annealed_lr = tf.cos( + self.pi + * (tf.cast(step, tf.float32) - self.warmup_steps) + / float(self.total_steps - self.warmup_steps) + ) + learning_rate = 0.5 * self.learning_rate_base * (1 + cos_annealed_lr) + + if self.warmup_steps > 0: + if self.learning_rate_base < self.warmup_learning_rate: + raise ValueError( + "Learning_rate_base must be larger or equal to " + "warmup_learning_rate." + ) + slope = ( + self.learning_rate_base - self.warmup_learning_rate + ) / self.warmup_steps + warmup_rate = slope * tf.cast(step, tf.float32) + self.warmup_learning_rate + learning_rate = tf.where( + step < self.warmup_steps, warmup_rate, learning_rate + ) + return tf.where( + step > self.total_steps, 0.0, learning_rate, name="learning_rate" + ) + + +total_steps = int((len(x_train) / BATCH_SIZE) * EPOCHS) +warmup_epoch_percentage = 0.15 +warmup_steps = int(total_steps * warmup_epoch_percentage) +scheduled_lrs = WarmUpCosine( + learning_rate_base=LEARNING_RATE, + total_steps=total_steps, + warmup_learning_rate=0.0, + warmup_steps=warmup_steps, +) + +lrs = [scheduled_lrs(step) for step in range(total_steps)] +plt.plot(lrs) +plt.xlabel("Step", fontsize=14) +plt.ylabel("LR", fontsize=14) +plt.show() + +# Assemble the callbacks. 
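+# `epoch_interval=5` makes `TrainMonitor` plot reconstructions every fifth
+# epoch (including epoch 0).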
+train_callbacks = [TrainMonitor(epoch_interval=5)] + +""" +## Model compilation and training +""" + +optimizer = keras.optimizers.AdamW( + learning_rate=scheduled_lrs, weight_decay=WEIGHT_DECAY +) + +# Compile and pretrain the model. +mae_model.compile( + optimizer=optimizer, loss=keras.losses.MeanSquaredError(), metrics=["mae"] +) +history = mae_model.fit( + train_ds, + epochs=EPOCHS, + validation_data=val_ds, + callbacks=train_callbacks, +) + +# Measure its performance. +loss, mae = mae_model.evaluate(test_ds) +print(f"Loss: {loss:.2f}") +print(f"MAE: {mae:.2f}") + +""" +## Evaluation with linear probing +""" + +""" +### Extract the encoder model along with other layers +""" + +# Extract the augmentation layers. +train_augmentation_model = mae_model.train_augmentation_model +test_augmentation_model = mae_model.test_augmentation_model + +# Extract the patchers. +patch_layer = mae_model.patch_layer +patch_encoder = mae_model.patch_encoder +patch_encoder.downstream = True # Swtich the downstream flag to True. + +# Extract the encoder. +encoder = mae_model.encoder + +# Pack as a model. +downstream_model = keras.Sequential( + [ + layers.Input((IMAGE_SIZE, IMAGE_SIZE, 3)), + patch_layer, + patch_encoder, + encoder, + layers.BatchNormalization(), # Refer to A.1 (Linear probing). + layers.GlobalAveragePooling1D(), + layers.Dense(NUM_CLASSES, activation="softmax"), + ], + name="linear_probe_model", +) + +# Only the final classification layer of the `downstream_model` should be trainable. +for layer in downstream_model.layers[:-1]: + layer.trainable = False + +downstream_model.summary() + +""" +We are using average pooling to extract learned representations from the MAE encoder. +Another approach would be to use a learnable dummy token inside the encoder during +pretraining (resembling the [CLS] token). Then we can extract representations from that +token during the downstream tasks. +""" + +""" +### Prepare datasets for linear probing +""" + + +def prepare_data(images, labels, is_train=True): + if is_train: + augmentation_model = train_augmentation_model + else: + augmentation_model = test_augmentation_model + + dataset = tf.data.Dataset.from_tensor_slices((images, labels)) + if is_train: + dataset = dataset.shuffle(BUFFER_SIZE) + + dataset = dataset.batch(BATCH_SIZE).map( + lambda x, y: (augmentation_model(x), y), num_parallel_calls=AUTO + ) + return dataset.prefetch(AUTO) + + +train_ds = prepare_data(x_train, y_train) +val_ds = prepare_data(x_train, y_train, is_train=False) +test_ds = prepare_data(x_test, y_test, is_train=False) + +""" +### Perform linear probing +""" + +linear_probe_epochs = 50 +linear_prob_lr = 0.1 +warm_epoch_percentage = 0.1 +steps = int((len(x_train) // BATCH_SIZE) * linear_probe_epochs) + +warmup_steps = int(steps * warm_epoch_percentage) +scheduled_lrs = WarmUpCosine( + learning_rate_base=linear_prob_lr, + total_steps=steps, + warmup_learning_rate=0.0, + warmup_steps=warmup_steps, +) + +optimizer = keras.optimizers.SGD(learning_rate=scheduled_lrs, momentum=0.9) +downstream_model.compile( + optimizer=optimizer, loss="sparse_categorical_crossentropy", metrics=["accuracy"] +) +downstream_model.fit(train_ds, validation_data=val_ds, epochs=linear_probe_epochs) + +loss, accuracy = downstream_model.evaluate(test_ds) +accuracy = round(accuracy * 100, 2) +print(f"Accuracy on the test set: {accuracy}%.") + +""" +We believe that with a more sophisticated hyperparameter tuning process and a longer +pretraining it is possible to improve this performance further. 
For comparison, we took +the encoder architecture and +[trained it from scratch](https://github.com/ariG23498/mae-scalable-vision-learners/blob/master/regular-classification.ipynb) +in a fully supervised manner. This gave us ~76% test top-1 accuracy. The authors of +MAE demonstrates strong performance on the ImageNet-1k dataset as well as +other downstream tasks like object detection and semantic segmentation. +""" + +""" +## Final notes + +We refer the interested readers to other examples on self-supervised learning present on +keras.io: + +* [SimCLR](https://keras.io/examples/vision/semisupervised_simclr/) +* [NNCLR](https://keras.io/examples/vision/nnclr) +* [SimSiam](https://keras.io/examples/vision/simsiam) + +This idea of using BERT flavored pretraining in computer vision was also explored in +[Selfie](https://arxiv.org/abs/1906.02940), but it could not demonstrate strong results. +Another concurrent work that explores the idea of masked image modeling is +[SimMIM](https://arxiv.org/abs/2111.09886). Finally, as a fun fact, we, the authors of +this example also explored the idea of ["reconstruction as a pretext task"](https://i.ibb.co/k5CpwDX/image.png) +in 2020 but we could not prevent the network from representation collapse, and +hence we did not get strong downstream performance. + +We would like to thank [Xinlei Chen](http://xinleic.xyz/) +(one of the authors of MAE) for helpful discussions. We are grateful to +[JarvisLabs](https://jarvislabs.ai/) and +[Google Developers Experts](https://developers.google.com/programs/experts/) +program for helping with GPU credits. +""" diff --git a/knowledge_base/vision/metric_learning.py b/knowledge_base/vision/metric_learning.py new file mode 100644 index 0000000000000000000000000000000000000000..3376afd63c396d657e8c98c81d1a335b18f39018 --- /dev/null +++ b/knowledge_base/vision/metric_learning.py @@ -0,0 +1,325 @@ +""" +Title: Metric learning for image similarity search +Author: [Mat Kelcey](https://twitter.com/mat_kelcey) +Date created: 2020/06/05 +Last modified: 2020/06/09 +Description: Example of using similarity metric learning on CIFAR-10 images. +Accelerator: GPU +""" + +""" +## Overview + +Metric learning aims to train models that can embed inputs into a high-dimensional space +such that "similar" inputs, as defined by the training scheme, are located close to each +other. These models once trained can produce embeddings for downstream systems where such +similarity is useful; examples include as a ranking signal for search or as a form of +pretrained embedding model for another supervised problem. + +For a more detailed overview of metric learning see: + +* [What is metric learning?](http://contrib.scikit-learn.org/metric-learn/introduction.html) +* ["Using crossentropy for metric learning" tutorial](https://www.youtube.com/watch?v=Jb4Ewl5RzkI) +""" + +""" +## Setup + +Set Keras backend to tensorflow. +""" +import os + +os.environ["KERAS_BACKEND"] = "tensorflow" + +import random +import matplotlib.pyplot as plt +import numpy as np +import tensorflow as tf +from collections import defaultdict +from PIL import Image +from sklearn.metrics import ConfusionMatrixDisplay +import keras +from keras import layers + +""" +## Dataset + +For this example we will be using the +[CIFAR-10](https://www.cs.toronto.edu/~kriz/cifar.html) dataset. 
+""" + +from keras.datasets import cifar10 + + +(x_train, y_train), (x_test, y_test) = cifar10.load_data() + +x_train = x_train.astype("float32") / 255.0 +y_train = np.squeeze(y_train) +x_test = x_test.astype("float32") / 255.0 +y_test = np.squeeze(y_test) + +""" +To get a sense of the dataset we can visualise a grid of 25 random examples. + + +""" + +height_width = 32 + + +def show_collage(examples): + box_size = height_width + 2 + num_rows, num_cols = examples.shape[:2] + + collage = Image.new( + mode="RGB", + size=(num_cols * box_size, num_rows * box_size), + color=(250, 250, 250), + ) + for row_idx in range(num_rows): + for col_idx in range(num_cols): + array = (np.array(examples[row_idx, col_idx]) * 255).astype(np.uint8) + collage.paste( + Image.fromarray(array), (col_idx * box_size, row_idx * box_size) + ) + + # Double size for visualisation. + collage = collage.resize((2 * num_cols * box_size, 2 * num_rows * box_size)) + return collage + + +# Show a collage of 5x5 random images. +sample_idxs = np.random.randint(0, 50000, size=(5, 5)) +examples = x_train[sample_idxs] +show_collage(examples) + +""" +Metric learning provides training data not as explicit `(X, y)` pairs but instead uses +multiple instances that are related in the way we want to express similarity. In our +example we will use instances of the same class to represent similarity; a single +training instance will not be one image, but a pair of images of the same class. When +referring to the images in this pair we'll use the common metric learning names of the +`anchor` (a randomly chosen image) and the `positive` (another randomly chosen image of +the same class). + +To facilitate this we need to build a form of lookup that maps from classes to the +instances of that class. When generating data for training we will sample from this +lookup. +""" + +class_idx_to_train_idxs = defaultdict(list) +for y_train_idx, y in enumerate(y_train): + class_idx_to_train_idxs[y].append(y_train_idx) + +class_idx_to_test_idxs = defaultdict(list) +for y_test_idx, y in enumerate(y_test): + class_idx_to_test_idxs[y].append(y_test_idx) + +""" +For this example we are using the simplest approach to training; a batch will consist of +`(anchor, positive)` pairs spread across the classes. The goal of learning will be to +move the anchor and positive pairs closer together and further away from other instances +in the batch. In this case the batch size will be dictated by the number of classes; for +CIFAR-10 this is 10. +""" + +num_classes = 10 + + +class AnchorPositivePairs(keras.utils.Sequence): + def __init__(self, num_batches): + super().__init__() + self.num_batches = num_batches + + def __len__(self): + return self.num_batches + + def __getitem__(self, _idx): + x = np.empty((2, num_classes, height_width, height_width, 3), dtype=np.float32) + for class_idx in range(num_classes): + examples_for_class = class_idx_to_train_idxs[class_idx] + anchor_idx = random.choice(examples_for_class) + positive_idx = random.choice(examples_for_class) + while positive_idx == anchor_idx: + positive_idx = random.choice(examples_for_class) + x[0, class_idx] = x_train[anchor_idx] + x[1, class_idx] = x_train[positive_idx] + return x + + +""" +We can visualise a batch in another collage. The top row shows randomly chosen anchors +from the 10 classes, the bottom row shows the corresponding 10 positives. 
+""" + +examples = next(iter(AnchorPositivePairs(num_batches=1))) + +show_collage(examples) + +""" +## Embedding model + +We define a custom model with a `train_step` that first embeds both anchors and positives +and then uses their pairwise dot products as logits for a softmax. +""" + + +class EmbeddingModel(keras.Model): + def train_step(self, data): + # Note: Workaround for open issue, to be removed. + if isinstance(data, tuple): + data = data[0] + anchors, positives = data[0], data[1] + + with tf.GradientTape() as tape: + # Run both anchors and positives through model. + anchor_embeddings = self(anchors, training=True) + positive_embeddings = self(positives, training=True) + + # Calculate cosine similarity between anchors and positives. As they have + # been normalised this is just the pair wise dot products. + similarities = keras.ops.einsum( + "ae,pe->ap", anchor_embeddings, positive_embeddings + ) + + # Since we intend to use these as logits we scale them by a temperature. + # This value would normally be chosen as a hyper parameter. + temperature = 0.2 + similarities /= temperature + + # We use these similarities as logits for a softmax. The labels for + # this call are just the sequence [0, 1, 2, ..., num_classes] since we + # want the main diagonal values, which correspond to the anchor/positive + # pairs, to be high. This loss will move embeddings for the + # anchor/positive pairs together and move all other pairs apart. + sparse_labels = keras.ops.arange(num_classes) + loss = self.compute_loss(y=sparse_labels, y_pred=similarities) + + # Calculate gradients and apply via optimizer. + gradients = tape.gradient(loss, self.trainable_variables) + self.optimizer.apply_gradients(zip(gradients, self.trainable_variables)) + + # Update and return metrics (specifically the one for the loss value). + for metric in self.metrics: + # Calling `self.compile` will by default add a `keras.metrics.Mean` loss + if metric.name == "loss": + metric.update_state(loss) + else: + metric.update_state(sparse_labels, similarities) + + return {m.name: m.result() for m in self.metrics} + + +""" +Next we describe the architecture that maps from an image to an embedding. This model +simply consists of a sequence of 2d convolutions followed by global pooling with a final +linear projection to an embedding space. As is common in metric learning we normalise the +embeddings so that we can use simple dot products to measure similarity. For simplicity +this model is intentionally small. +""" + +inputs = layers.Input(shape=(height_width, height_width, 3)) +x = layers.Conv2D(filters=32, kernel_size=3, strides=2, activation="relu")(inputs) +x = layers.Conv2D(filters=64, kernel_size=3, strides=2, activation="relu")(x) +x = layers.Conv2D(filters=128, kernel_size=3, strides=2, activation="relu")(x) +x = layers.GlobalAveragePooling2D()(x) +embeddings = layers.Dense(units=8, activation=None)(x) +embeddings = layers.UnitNormalization()(embeddings) + +model = EmbeddingModel(inputs, embeddings) + +""" +Finally we run the training. On a Google Colab GPU instance this takes about a minute. +""" +model.compile( + optimizer=keras.optimizers.Adam(learning_rate=1e-3), + loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), +) + +history = model.fit(AnchorPositivePairs(num_batches=1000), epochs=20) + +plt.plot(history.history["loss"]) +plt.show() + +""" +## Testing + +We can review the quality of this model by applying it to the test set and considering +near neighbours in the embedding space. 
+ +First we embed the test set and calculate all near neighbours. Recall that since the +embeddings are unit length we can calculate cosine similarity via dot products. +""" + +near_neighbours_per_example = 10 + +embeddings = model.predict(x_test) +gram_matrix = np.einsum("ae,be->ab", embeddings, embeddings) +near_neighbours = np.argsort(gram_matrix.T)[:, -(near_neighbours_per_example + 1) :] + +""" +As a visual check of these embeddings we can build a collage of the near neighbours for 5 +random examples. The first column of the image below is a randomly selected image, the +following 10 columns show the nearest neighbours in order of similarity. +""" + +num_collage_examples = 5 + +examples = np.empty( + ( + num_collage_examples, + near_neighbours_per_example + 1, + height_width, + height_width, + 3, + ), + dtype=np.float32, +) +for row_idx in range(num_collage_examples): + examples[row_idx, 0] = x_test[row_idx] + anchor_near_neighbours = reversed(near_neighbours[row_idx][:-1]) + for col_idx, nn_idx in enumerate(anchor_near_neighbours): + examples[row_idx, col_idx + 1] = x_test[nn_idx] + +show_collage(examples) + +""" +We can also get a quantified view of the performance by considering the correctness of +near neighbours in terms of a confusion matrix. + +Let us sample 10 examples from each of the 10 classes and consider their near neighbours +as a form of prediction; that is, does the example and its near neighbours share the same +class? + +We observe that each animal class does generally well, and is confused the most with the +other animal classes. The vehicle classes follow the same pattern. +""" + +confusion_matrix = np.zeros((num_classes, num_classes)) + +# For each class. +for class_idx in range(num_classes): + # Consider 10 examples. + example_idxs = class_idx_to_test_idxs[class_idx][:10] + for y_test_idx in example_idxs: + # And count the classes of its near neighbours. + for nn_idx in near_neighbours[y_test_idx][:-1]: + nn_class_idx = y_test[nn_idx] + confusion_matrix[class_idx, nn_class_idx] += 1 + +# Display a confusion matrix. +labels = [ + "Airplane", + "Automobile", + "Bird", + "Cat", + "Deer", + "Dog", + "Frog", + "Horse", + "Ship", + "Truck", +] +disp = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix, display_labels=labels) +disp.plot(include_values=True, cmap="viridis", ax=None, xticks_rotation="vertical") +plt.show() diff --git a/knowledge_base/vision/metric_learning_tf_similarity.py b/knowledge_base/vision/metric_learning_tf_similarity.py new file mode 100644 index 0000000000000000000000000000000000000000..ffcfc1c4dc0c154ffbfd58d4f3ad7bcb374d4c7b --- /dev/null +++ b/knowledge_base/vision/metric_learning_tf_similarity.py @@ -0,0 +1,428 @@ +""" +Title: Metric learning for image similarity search using TensorFlow Similarity +Author: [Owen Vallis](https://twitter.com/owenvallis) +Date created: 2021/09/30 +Last modified: 2022/02/29 +Description: Example of using similarity metric learning on CIFAR-10 images. +Accelerator: GPU +""" + +""" +## Overview + +This example is based on the +["Metric learning for image similarity search" example](https://keras.io/examples/vision/metric_learning/). +We aim to use the same data set but implement the model using +[TensorFlow Similarity](https://github.com/tensorflow/similarity). + +Metric learning aims to train models that can embed inputs into a +high-dimensional space such that "similar" inputs are pulled closer to each +other and "dissimilar" inputs are pushed farther apart. 
Once trained, these +models can produce embeddings for downstream systems where such similarity is +useful, for instance as a ranking signal for search or as a form of pretrained +embedding model for another supervised problem. + +For a more detailed overview of metric learning, see: + +* [What is metric learning?](http://contrib.scikit-learn.org/metric-learn/introduction.html) +* ["Using crossentropy for metric learning" tutorial](https://www.youtube.com/watch?v=Jb4Ewl5RzkI) +""" + +""" +## Setup + +This tutorial will use the [TensorFlow Similarity](https://github.com/tensorflow/similarity) library +to learn and evaluate the similarity embedding. +TensorFlow Similarity provides components that: + +* Make training contrastive models simple and fast. +* Make it easier to ensure that batches contain pairs of examples. +* Enable the evaluation of the quality of the embedding. + +TensorFlow Similarity can be installed easily via pip, as follows: + +``` +pip -q install tensorflow_similarity +``` + +""" + +import random + +from matplotlib import pyplot as plt +from mpl_toolkits import axes_grid1 +import numpy as np + +import tensorflow as tf +from tensorflow import keras + +import tensorflow_similarity as tfsim + + +tfsim.utils.tf_cap_memory() + +print("TensorFlow:", tf.__version__) +print("TensorFlow Similarity:", tfsim.__version__) + +""" +## Dataset samplers + +We will be using the +[CIFAR-10](https://www.tensorflow.org/datasets/catalog/cifar10) +dataset for this tutorial. + +For a similarity model to learn efficiently, each batch must contain at least 2 +examples of each class. + +To make this easy, tf_similarity offers `Sampler` objects that enable you to set both +the number of classes and the minimum number of examples of each class per +batch. + +The training and validation datasets will be created using the +`TFDatasetMultiShotMemorySampler` object. This creates a sampler that loads datasets +from [TensorFlow Datasets](https://www.tensorflow.org/datasets) and yields +batches containing a target number of classes and a target number of examples +per class. Additionally, we can restrict the sampler to only yield the subset of +classes defined in `class_list`, enabling us to train on a subset of the classes +and then test how the embedding generalizes to the unseen classes. This can be +useful when working on few-shot learning problems. + +The following cell creates a train_ds sample that: + +* Loads the CIFAR-10 dataset from TFDS and then takes the `examples_per_class_per_batch`. +* Ensures the sampler restricts the classes to those defined in `class_list`. +* Ensures each batch contains 10 different classes with 8 examples each. + +We also create a validation dataset in the same way, but we limit the total number of +examples per class to 100 and the examples per class per batch is set to the +default of 2. +""" +# This determines the number of classes used during training. +# Here we are using all the classes. +num_known_classes = 10 +class_list = random.sample(population=range(10), k=num_known_classes) + +classes_per_batch = 10 +# Passing multiple examples per class per batch ensures that each example has +# multiple positive pairs. This can be useful when performing triplet mining or +# when using losses like `MultiSimilarityLoss` or `CircleLoss` as these can +# take a weighted mix of all the positive pairs. 
In general, more examples per +# class will lead to more information for the positive pairs, while more classes +# per batch will provide more varied information in the negative pairs. However, +# the losses compute the pairwise distance between the examples in a batch so +# the upper limit of the batch size is restricted by the memory. +examples_per_class_per_batch = 8 + +print( + "Batch size is: " + f"{min(classes_per_batch, num_known_classes) * examples_per_class_per_batch}" +) + +print(" Create Training Data ".center(34, "#")) +train_ds = tfsim.samplers.TFDatasetMultiShotMemorySampler( + "cifar10", + classes_per_batch=min(classes_per_batch, num_known_classes), + splits="train", + steps_per_epoch=4000, + examples_per_class_per_batch=examples_per_class_per_batch, + class_list=class_list, +) + +print("\n" + " Create Validation Data ".center(34, "#")) +val_ds = tfsim.samplers.TFDatasetMultiShotMemorySampler( + "cifar10", + classes_per_batch=classes_per_batch, + splits="test", + total_examples_per_class=100, +) + +""" +## Visualize the dataset + +The samplers will shuffle the dataset, so we can get a sense of the dataset by +plotting the first 25 images. + +The samplers provide a `get_slice(begin, size)` method that allows us to easily +select a block of samples. + +Alternatively, we can use the `generate_batch()` method to yield a batch. This +can allow us to check that a batch contains the expected number of classes and +examples per class. +""" + +num_cols = num_rows = 5 +# Get the first 25 examples. +x_slice, y_slice = train_ds.get_slice(begin=0, size=num_cols * num_rows) + +fig = plt.figure(figsize=(6.0, 6.0)) +grid = axes_grid1.ImageGrid(fig, 111, nrows_ncols=(num_cols, num_rows), axes_pad=0.1) + +for ax, im, label in zip(grid, x_slice, y_slice): + ax.imshow(im) + ax.axis("off") + +""" +## Embedding model + +Next we define a `SimilarityModel` using the Keras Functional API. The model +is a standard convnet with the addition of a `MetricEmbedding` layer that +applies L2 normalization. The metric embedding layer is helpful when using +`Cosine` distance as we only care about the angle between the vectors. + +Additionally, the `SimilarityModel` provides a number of helper methods for: + +* Indexing embedded examples +* Performing example lookups +* Evaluating the classification +* Evaluating the quality of the embedding space + +See the [TensorFlow Similarity documentation](https://github.com/tensorflow/similarity) +for more details. +""" + +embedding_size = 256 + +inputs = keras.layers.Input((32, 32, 3)) +x = keras.layers.Rescaling(scale=1.0 / 255)(inputs) +x = keras.layers.Conv2D(64, 3, activation="relu")(x) +x = keras.layers.BatchNormalization()(x) +x = keras.layers.Conv2D(128, 3, activation="relu")(x) +x = keras.layers.BatchNormalization()(x) +x = keras.layers.MaxPool2D((4, 4))(x) +x = keras.layers.Conv2D(256, 3, activation="relu")(x) +x = keras.layers.BatchNormalization()(x) +x = keras.layers.Conv2D(256, 3, activation="relu")(x) +x = keras.layers.GlobalMaxPool2D()(x) +outputs = tfsim.layers.MetricEmbedding(embedding_size)(x) + +# building model +model = tfsim.models.SimilarityModel(inputs, outputs) +model.summary() + +""" +## Similarity loss + +The similarity loss expects batches containing at least 2 examples of each +class, from which it computes the loss over the pairwise positive and negative +distances. 
Here we are using `MultiSimilarityLoss()` +([paper](ihttps://arxiv.org/abs/1904.06627)), one of several losses in +[TensorFlow Similarity](https://github.com/tensorflow/similarity). This loss +attempts to use all informative pairs in the batch, taking into account the +self-similarity, positive-similarity, and the negative-similarity. +""" + +epochs = 3 +learning_rate = 0.002 +val_steps = 50 + +# init similarity loss +loss = tfsim.losses.MultiSimilarityLoss() + +# compiling and training +model.compile( + optimizer=keras.optimizers.Adam(learning_rate), + loss=loss, + steps_per_execution=10, +) +history = model.fit( + train_ds, epochs=epochs, validation_data=val_ds, validation_steps=val_steps +) + +""" +## Indexing + +Now that we have trained our model, we can create an index of examples. Here we +batch index the first 200 validation examples by passing the x and y to the index +along with storing the image in the data parameter. The `x_index` values are +embedded and then added to the index to make them searchable. The `y_index` and +data parameters are optional but allow the user to associate metadata with the +embedded example. +""" + +x_index, y_index = val_ds.get_slice(begin=0, size=200) +model.reset_index() +model.index(x_index, y_index, data=x_index) + +""" +## Calibration + +Once the index is built, we can calibrate a distance threshold using a matching +strategy and a calibration metric. + +Here we are searching for the optimal F1 score while using K=1 as our +classifier. All matches at or below the calibrated threshold distance will be +labeled as a Positive match between the query example and the label associated +with the match result, while all matches above the threshold distance will be +labeled as a Negative match. + +Additionally, we pass in extra metrics to compute as well. All values in the +output are computed at the calibrated threshold. + +Finally, `model.calibrate()` returns a `CalibrationResults` object containing: + +* `"cutpoints"`: A Python dict mapping the cutpoint name to a dict containing the +`ClassificationMetric` values associated with a particular distance threshold, +e.g., `"optimal" : {"acc": 0.90, "f1": 0.92}`. +* `"thresholds"`: A Python dict mapping `ClassificationMetric` names to a list +containing the metric's value computed at each of the distance thresholds, e.g., +`{"f1": [0.99, 0.80], "distance": [0.0, 1.0]}`. +""" + +x_train, y_train = train_ds.get_slice(begin=0, size=1000) +calibration = model.calibrate( + x_train, + y_train, + calibration_metric="f1", + matcher="match_nearest", + extra_metrics=["precision", "recall", "binary_accuracy"], + verbose=1, +) + +""" +## Visualization + +It may be difficult to get a sense of the model quality from the metrics alone. +A complementary approach is to manually inspect a set of query results to get a +feel for the match quality. + +Here we take 10 validation examples and plot them with their 5 nearest +neighbors and the distances to the query example. Looking at the results, we see +that while they are imperfect they still represent meaningfully similar images, +and that the model is able to find similar images irrespective of their pose or +image illumination. + +We can also see that the model is very confident with certain images, resulting +in very small distances between the query and the neighbors. Conversely, we see +more mistakes in the class labels as the distances become larger. This is one of +the reasons why calibration is critical for matching applications. 
+""" + +num_neighbors = 5 +labels = [ + "Airplane", + "Automobile", + "Bird", + "Cat", + "Deer", + "Dog", + "Frog", + "Horse", + "Ship", + "Truck", + "Unknown", +] +class_mapping = {c_id: c_lbl for c_id, c_lbl in zip(range(11), labels)} + +x_display, y_display = val_ds.get_slice(begin=200, size=10) +# lookup nearest neighbors in the index +nns = model.lookup(x_display, k=num_neighbors) + +# display +for idx in np.argsort(y_display): + tfsim.visualization.viz_neigbors_imgs( + x_display[idx], + y_display[idx], + nns[idx], + class_mapping=class_mapping, + fig_size=(16, 2), + ) + +""" +## Metrics + +We can also plot the extra metrics contained in the `CalibrationResults` to get +a sense of the matching performance as the distance threshold increases. + +The following plots show the Precision, Recall, and F1 Score. We can see that +the matching precision degrades as the distance increases, but that the +percentage of the queries that we accept as positive matches (recall) grows +faster up to the calibrated distance threshold. +""" + +fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5)) +x = calibration.thresholds["distance"] + +ax1.plot(x, calibration.thresholds["precision"], label="precision") +ax1.plot(x, calibration.thresholds["recall"], label="recall") +ax1.plot(x, calibration.thresholds["f1"], label="f1 score") +ax1.legend() +ax1.set_title("Metric evolution as distance increase") +ax1.set_xlabel("Distance") +ax1.set_ylim((-0.05, 1.05)) + +ax2.plot(calibration.thresholds["recall"], calibration.thresholds["precision"]) +ax2.set_title("Precision recall curve") +ax2.set_xlabel("Recall") +ax2.set_ylabel("Precision") +ax2.set_ylim((-0.05, 1.05)) +plt.show() + +""" +We can also take 100 examples for each class and plot the confusion matrix for +each example and their nearest match. We also add an "extra" 10th class to +represent the matches above the calibrated distance threshold. + +We can see that most of the errors are between the animal classes with an +interesting number of confusions between Airplane and Bird. Additionally, we see +that only a few of the 100 examples for each class returned matches outside of +the calibrated distance threshold. +""" + +cutpoint = "optimal" + +# This yields 100 examples for each class. +# We defined this when we created the val_ds sampler. +x_confusion, y_confusion = val_ds.get_slice(0, -1) + +matches = model.match(x_confusion, cutpoint=cutpoint, no_match_label=10) +cm = tfsim.visualization.confusion_matrix( + matches, + y_confusion, + labels=labels, + title="Confusion matrix for cutpoint:%s" % cutpoint, + normalize=False, +) + +""" +## No Match + +We can plot the examples outside of the calibrated threshold to see which images +are not matching any indexed examples. + +This may provide insight into what other examples may need to be indexed or +surface anomalous examples within the class. +""" + +idx_no_match = np.where(np.array(matches) == 10) +no_match_queries = x_confusion[idx_no_match] +if len(no_match_queries): + plt.imshow(no_match_queries[0]) +else: + print("All queries have a match below the distance threshold.") + +""" +## Visualize clusters + +One of the best ways to quickly get a sense of the quality of how the model is +doing and understand it's short comings is to project the embedding into a 2D +space. + +This allows us to inspect clusters of images and understand which classes are +entangled. +""" + +# Each class in val_ds was restricted to 100 examples. 
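+# With all 10 CIFAR-10 classes present at 100 examples each, a slice of 1000
+# therefore spans the entire validation sampler.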
+num_examples_to_clusters = 1000 +thumb_size = 96 +plot_size = 800 +vx, vy = val_ds.get_slice(0, num_examples_to_clusters) + +# Uncomment to run the interactive projector. +# tfsim.visualization.projector( +# model.predict(vx), +# labels=vy, +# images=vx, +# class_mapping=class_mapping, +# image_size=thumb_size, +# plot_size=plot_size, +# ) diff --git a/knowledge_base/vision/mirnet.py b/knowledge_base/vision/mirnet.py new file mode 100644 index 0000000000000000000000000000000000000000..af1099cd042cb8c87527fabc52e5b0d9794a1ef1 --- /dev/null +++ b/knowledge_base/vision/mirnet.py @@ -0,0 +1,507 @@ +""" +Title: Low-light image enhancement using MIRNet +Author: [Soumik Rakshit](http://github.com/soumik12345) +Date created: 2021/09/11 +Last modified: 2023/07/15 +Description: Implementing the MIRNet architecture for low-light image enhancement. +Accelerator: GPU +Converted to Keras 3 by: [Soumik Rakshit](http://github.com/soumik12345) +""" + +""" +## Introduction + +With the goal of recovering high-quality image content from its degraded version, image +restoration enjoys numerous applications, such as in +photography, security, medical imaging, and remote sensing. In this example, we implement the +**MIRNet** model for low-light image enhancement, a fully-convolutional architecture that +learns an enriched set of +features that combines contextual information from multiple scales, while +simultaneously preserving the high-resolution spatial details. + +### References: + +- [Learning Enriched Features for Real Image Restoration and Enhancement](https://arxiv.org/abs/2003.06792) +- [The Retinex Theory of Color Vision](http://www.cnbc.cmu.edu/~tai/cp_papers/E.Land_Retinex_Theory_ScientifcAmerican.pdf) +- [Two deterministic half-quadratic regularization algorithms for computed imaging](https://ieeexplore.ieee.org/document/413553) +""" + +""" +## Downloading LOLDataset + +The **LoL Dataset** has been created for low-light image enhancement. +It provides 485 images for training and 15 for testing. Each image pair in the dataset +consists of a low-light input image and its corresponding well-exposed reference image. +""" + +import os + +os.environ["KERAS_BACKEND"] = "tensorflow" + +import random +import numpy as np +from glob import glob +from PIL import Image, ImageOps +import matplotlib.pyplot as plt + +import keras +from keras import layers + +import tensorflow as tf + +"""shell +wget https://huggingface.co/datasets/geekyrakshit/LoL-Dataset/resolve/main/lol_dataset.zip +unzip -q lol_dataset.zip && rm lol_dataset.zip +""" + +""" +## Creating a TensorFlow Dataset + +We use 300 image pairs from the LoL Dataset's training set for training, +and we use the remaining 185 image pairs for validation. +We generate random crops of size `128 x 128` from the image pairs to be +used for both training and validation. 
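+
+Once `train_dataset` is created below, a quick visual sanity check of a single
+low-light / ground-truth crop pair can be useful. This is only an illustrative
+sketch and is not required for training:
+
+```python
+low_batch, high_batch = next(iter(train_dataset))
+plt.figure(figsize=(8, 4))
+plt.subplot(1, 2, 1)
+plt.imshow(low_batch[0])
+plt.title("Low-light crop")
+plt.subplot(1, 2, 2)
+plt.imshow(high_batch[0])
+plt.title("Well-exposed crop")
+plt.show()
+```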
+""" + +random.seed(10) + +IMAGE_SIZE = 128 +BATCH_SIZE = 4 +MAX_TRAIN_IMAGES = 300 + + +def read_image(image_path): + image = tf.io.read_file(image_path) + image = tf.image.decode_png(image, channels=3) + image.set_shape([None, None, 3]) + image = tf.cast(image, dtype=tf.float32) / 255.0 + return image + + +def random_crop(low_image, enhanced_image): + low_image_shape = tf.shape(low_image)[:2] + low_w = tf.random.uniform( + shape=(), maxval=low_image_shape[1] - IMAGE_SIZE + 1, dtype=tf.int32 + ) + low_h = tf.random.uniform( + shape=(), maxval=low_image_shape[0] - IMAGE_SIZE + 1, dtype=tf.int32 + ) + low_image_cropped = low_image[ + low_h : low_h + IMAGE_SIZE, low_w : low_w + IMAGE_SIZE + ] + enhanced_image_cropped = enhanced_image[ + low_h : low_h + IMAGE_SIZE, low_w : low_w + IMAGE_SIZE + ] + # in order to avoid `NONE` during shape inference + low_image_cropped.set_shape([IMAGE_SIZE, IMAGE_SIZE, 3]) + enhanced_image_cropped.set_shape([IMAGE_SIZE, IMAGE_SIZE, 3]) + return low_image_cropped, enhanced_image_cropped + + +def load_data(low_light_image_path, enhanced_image_path): + low_light_image = read_image(low_light_image_path) + enhanced_image = read_image(enhanced_image_path) + low_light_image, enhanced_image = random_crop(low_light_image, enhanced_image) + return low_light_image, enhanced_image + + +def get_dataset(low_light_images, enhanced_images): + dataset = tf.data.Dataset.from_tensor_slices((low_light_images, enhanced_images)) + dataset = dataset.map(load_data, num_parallel_calls=tf.data.AUTOTUNE) + dataset = dataset.batch(BATCH_SIZE, drop_remainder=True) + return dataset + + +train_low_light_images = sorted(glob("./lol_dataset/our485/low/*"))[:MAX_TRAIN_IMAGES] +train_enhanced_images = sorted(glob("./lol_dataset/our485/high/*"))[:MAX_TRAIN_IMAGES] + +val_low_light_images = sorted(glob("./lol_dataset/our485/low/*"))[MAX_TRAIN_IMAGES:] +val_enhanced_images = sorted(glob("./lol_dataset/our485/high/*"))[MAX_TRAIN_IMAGES:] + +test_low_light_images = sorted(glob("./lol_dataset/eval15/low/*")) +test_enhanced_images = sorted(glob("./lol_dataset/eval15/high/*")) + + +train_dataset = get_dataset(train_low_light_images, train_enhanced_images) +val_dataset = get_dataset(val_low_light_images, val_enhanced_images) + + +print("Train Dataset:", train_dataset.element_spec) +print("Val Dataset:", val_dataset.element_spec) + +""" +## MIRNet Model + +Here are the main features of the MIRNet model: + +- A feature extraction model that computes a complementary set of features across multiple +spatial scales, while maintaining the original high-resolution features to preserve +precise spatial details. +- A regularly repeated mechanism for information exchange, where the features across +multi-resolution branches are progressively fused together for improved representation +learning. +- A new approach to fuse multi-scale features using a selective kernel network +that dynamically combines variable receptive fields and faithfully preserves +the original feature information at each spatial resolution. +- A recursive residual design that progressively breaks down the input signal +in order to simplify the overall learning process, and allows the construction +of very deep networks. + + +![](https://raw.githubusercontent.com/soumik12345/MIRNet/master/assets/mirnet_architecture.png) +""" + +""" +### Selective Kernel Feature Fusion + +The Selective Kernel Feature Fusion or SKFF module performs dynamic adjustment of +receptive fields via two operations: **Fuse** and **Select**. 
The Fuse operator generates +global feature descriptors by combining the information from multi-resolution streams. +The Select operator uses these descriptors to recalibrate the feature maps (of different +streams) followed by their aggregation. + +**Fuse**: The SKFF receives inputs from three parallel convolution streams carrying +different scales of information. We first combine these multi-scale features using an +element-wise sum, on which we apply Global Average Pooling (GAP) across the spatial +dimension. Next, we apply a channel- downscaling convolution layer to generate a compact +feature representation which passes through three parallel channel-upscaling convolution +layers (one for each resolution stream) and provides us with three feature descriptors. + +**Select**: This operator applies the softmax function to the feature descriptors to +obtain the corresponding activations that are used to adaptively recalibrate multi-scale +feature maps. The aggregated features are defined as the sum of product of the corresponding +multi-scale feature and the feature descriptor. + +![](https://i.imgur.com/7U6ixF6.png) +""" + + +def selective_kernel_feature_fusion( + multi_scale_feature_1, multi_scale_feature_2, multi_scale_feature_3 +): + channels = list(multi_scale_feature_1.shape)[-1] + combined_feature = layers.Add()( + [multi_scale_feature_1, multi_scale_feature_2, multi_scale_feature_3] + ) + gap = layers.GlobalAveragePooling2D()(combined_feature) + channel_wise_statistics = layers.Reshape((1, 1, channels))(gap) + compact_feature_representation = layers.Conv2D( + filters=channels // 8, kernel_size=(1, 1), activation="relu" + )(channel_wise_statistics) + feature_descriptor_1 = layers.Conv2D( + channels, kernel_size=(1, 1), activation="softmax" + )(compact_feature_representation) + feature_descriptor_2 = layers.Conv2D( + channels, kernel_size=(1, 1), activation="softmax" + )(compact_feature_representation) + feature_descriptor_3 = layers.Conv2D( + channels, kernel_size=(1, 1), activation="softmax" + )(compact_feature_representation) + feature_1 = multi_scale_feature_1 * feature_descriptor_1 + feature_2 = multi_scale_feature_2 * feature_descriptor_2 + feature_3 = multi_scale_feature_3 * feature_descriptor_3 + aggregated_feature = layers.Add()([feature_1, feature_2, feature_3]) + return aggregated_feature + + +""" +### Dual Attention Unit + +The Dual Attention Unit or DAU is used to extract features in the convolutional streams. +While the SKFF block fuses information across multi-resolution branches, we also need a +mechanism to share information within a feature tensor, both along the spatial and the +channel dimensions which is done by the DAU block. The DAU suppresses less useful +features and only allows more informative ones to pass further. This feature +recalibration is achieved by using **Channel Attention** and **Spatial Attention** +mechanisms. + +The **Channel Attention** branch exploits the inter-channel relationships of the +convolutional feature maps by applying squeeze and excitation operations. Given a feature +map, the squeeze operation applies Global Average Pooling across spatial dimensions to +encode global context, thus yielding a feature descriptor. The excitation operator passes +this feature descriptor through two convolutional layers followed by the sigmoid gating +and generates activations. Finally, the output of Channel Attention branch is obtained by +rescaling the input feature map with the output activations. 
+ +The **Spatial Attention** branch is designed to exploit the inter-spatial dependencies of +convolutional features. The goal of Spatial Attention is to generate a spatial attention +map and use it to recalibrate the incoming features. To generate the spatial attention +map, the Spatial Attention branch first independently applies Global Average Pooling and +Max Pooling operations on input features along the channel dimensions and concatenates +the outputs to form a resultant feature map which is then passed through a convolution +and sigmoid activation to obtain the spatial attention map. This spatial attention map is +then used to rescale the input feature map. + +![](https://i.imgur.com/Dl0IwQs.png) +""" + + +class ChannelPooling(layers.Layer): + def __init__(self, axis=-1, *args, **kwargs): + super().__init__(*args, **kwargs) + self.axis = axis + self.concat = layers.Concatenate(axis=self.axis) + + def call(self, inputs): + average_pooling = tf.expand_dims(tf.reduce_mean(inputs, axis=-1), axis=-1) + max_pooling = tf.expand_dims(tf.reduce_max(inputs, axis=-1), axis=-1) + return self.concat([average_pooling, max_pooling]) + + def get_config(self): + config = super().get_config() + config.update({"axis": self.axis}) + + +def spatial_attention_block(input_tensor): + compressed_feature_map = ChannelPooling(axis=-1)(input_tensor) + feature_map = layers.Conv2D(1, kernel_size=(1, 1))(compressed_feature_map) + feature_map = keras.activations.sigmoid(feature_map) + return input_tensor * feature_map + + +def channel_attention_block(input_tensor): + channels = list(input_tensor.shape)[-1] + average_pooling = layers.GlobalAveragePooling2D()(input_tensor) + feature_descriptor = layers.Reshape((1, 1, channels))(average_pooling) + feature_activations = layers.Conv2D( + filters=channels // 8, kernel_size=(1, 1), activation="relu" + )(feature_descriptor) + feature_activations = layers.Conv2D( + filters=channels, kernel_size=(1, 1), activation="sigmoid" + )(feature_activations) + return input_tensor * feature_activations + + +def dual_attention_unit_block(input_tensor): + channels = list(input_tensor.shape)[-1] + feature_map = layers.Conv2D( + channels, kernel_size=(3, 3), padding="same", activation="relu" + )(input_tensor) + feature_map = layers.Conv2D(channels, kernel_size=(3, 3), padding="same")( + feature_map + ) + channel_attention = channel_attention_block(feature_map) + spatial_attention = spatial_attention_block(feature_map) + concatenation = layers.Concatenate(axis=-1)([channel_attention, spatial_attention]) + concatenation = layers.Conv2D(channels, kernel_size=(1, 1))(concatenation) + return layers.Add()([input_tensor, concatenation]) + + +""" +### Multi-Scale Residual Block + +The Multi-Scale Residual Block is capable of generating a spatially-precise output by +maintaining high-resolution representations, while receiving rich contextual information +from low-resolutions. The MRB consists of multiple (three in this paper) +fully-convolutional streams connected in parallel. It allows information exchange across +parallel streams in order to consolidate the high-resolution features with the help of +low-resolution features, and vice versa. The MIRNet employs a recursive residual design +(with skip connections) to ease the flow of information during the learning process. In +order to maintain the residual nature of our architecture, residual resizing modules are +used to perform downsampling and upsampling operations that are used in the Multi-scale +Residual Block. 
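+
+For intuition, the residual resizing modules implemented below halve the spatial
+resolution while doubling the channel count when downsampling, and do the reverse
+when upsampling. A quick, illustrative shape check (using the
+`down_sampling_module` and `up_sampling_module` functions defined after this
+section):
+
+```python
+x = keras.Input(shape=(128, 128, 64))
+print(down_sampling_module(x).shape)  # (None, 64, 64, 128)
+print(up_sampling_module(x).shape)    # (None, 256, 256, 32)
+```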
+ +![](https://i.imgur.com/wzZKV57.png) +""" + +# Recursive Residual Modules + + +def down_sampling_module(input_tensor): + channels = list(input_tensor.shape)[-1] + main_branch = layers.Conv2D(channels, kernel_size=(1, 1), activation="relu")( + input_tensor + ) + main_branch = layers.Conv2D( + channels, kernel_size=(3, 3), padding="same", activation="relu" + )(main_branch) + main_branch = layers.MaxPooling2D()(main_branch) + main_branch = layers.Conv2D(channels * 2, kernel_size=(1, 1))(main_branch) + skip_branch = layers.MaxPooling2D()(input_tensor) + skip_branch = layers.Conv2D(channels * 2, kernel_size=(1, 1))(skip_branch) + return layers.Add()([skip_branch, main_branch]) + + +def up_sampling_module(input_tensor): + channels = list(input_tensor.shape)[-1] + main_branch = layers.Conv2D(channels, kernel_size=(1, 1), activation="relu")( + input_tensor + ) + main_branch = layers.Conv2D( + channels, kernel_size=(3, 3), padding="same", activation="relu" + )(main_branch) + main_branch = layers.UpSampling2D()(main_branch) + main_branch = layers.Conv2D(channels // 2, kernel_size=(1, 1))(main_branch) + skip_branch = layers.UpSampling2D()(input_tensor) + skip_branch = layers.Conv2D(channels // 2, kernel_size=(1, 1))(skip_branch) + return layers.Add()([skip_branch, main_branch]) + + +# MRB Block +def multi_scale_residual_block(input_tensor, channels): + # features + level1 = input_tensor + level2 = down_sampling_module(input_tensor) + level3 = down_sampling_module(level2) + # DAU + level1_dau = dual_attention_unit_block(level1) + level2_dau = dual_attention_unit_block(level2) + level3_dau = dual_attention_unit_block(level3) + # SKFF + level1_skff = selective_kernel_feature_fusion( + level1_dau, + up_sampling_module(level2_dau), + up_sampling_module(up_sampling_module(level3_dau)), + ) + level2_skff = selective_kernel_feature_fusion( + down_sampling_module(level1_dau), + level2_dau, + up_sampling_module(level3_dau), + ) + level3_skff = selective_kernel_feature_fusion( + down_sampling_module(down_sampling_module(level1_dau)), + down_sampling_module(level2_dau), + level3_dau, + ) + # DAU 2 + level1_dau_2 = dual_attention_unit_block(level1_skff) + level2_dau_2 = up_sampling_module((dual_attention_unit_block(level2_skff))) + level3_dau_2 = up_sampling_module( + up_sampling_module(dual_attention_unit_block(level3_skff)) + ) + # SKFF 2 + skff_ = selective_kernel_feature_fusion(level1_dau_2, level2_dau_2, level3_dau_2) + conv = layers.Conv2D(channels, kernel_size=(3, 3), padding="same")(skff_) + return layers.Add()([input_tensor, conv]) + + +""" +### MIRNet Model +""" + + +def recursive_residual_group(input_tensor, num_mrb, channels): + conv1 = layers.Conv2D(channels, kernel_size=(3, 3), padding="same")(input_tensor) + for _ in range(num_mrb): + conv1 = multi_scale_residual_block(conv1, channels) + conv2 = layers.Conv2D(channels, kernel_size=(3, 3), padding="same")(conv1) + return layers.Add()([conv2, input_tensor]) + + +def mirnet_model(num_rrg, num_mrb, channels): + input_tensor = keras.Input(shape=[None, None, 3]) + x1 = layers.Conv2D(channels, kernel_size=(3, 3), padding="same")(input_tensor) + for _ in range(num_rrg): + x1 = recursive_residual_group(x1, num_mrb, channels) + conv = layers.Conv2D(3, kernel_size=(3, 3), padding="same")(x1) + output_tensor = layers.Add()([input_tensor, conv]) + return keras.Model(input_tensor, output_tensor) + + +model = mirnet_model(num_rrg=3, num_mrb=2, channels=64) + +""" +## Training + +- We train MIRNet using **Charbonnier Loss** as the loss function and **Adam 
+Optimizer** with a learning rate of `1e-4`. +- We use **Peak Signal Noise Ratio** or PSNR as a metric which is an expression for the +ratio between the maximum possible value (power) of a signal and the power of distorting +noise that affects the quality of its representation. +""" + + +def charbonnier_loss(y_true, y_pred): + return tf.reduce_mean(tf.sqrt(tf.square(y_true - y_pred) + tf.square(1e-3))) + + +def peak_signal_noise_ratio(y_true, y_pred): + return tf.image.psnr(y_pred, y_true, max_val=255.0) + + +optimizer = keras.optimizers.Adam(learning_rate=1e-4) +model.compile( + optimizer=optimizer, + loss=charbonnier_loss, + metrics=[peak_signal_noise_ratio], +) + +history = model.fit( + train_dataset, + validation_data=val_dataset, + epochs=50, + callbacks=[ + keras.callbacks.ReduceLROnPlateau( + monitor="val_peak_signal_noise_ratio", + factor=0.5, + patience=5, + verbose=1, + min_delta=1e-7, + mode="max", + ) + ], +) + + +def plot_history(value, name): + plt.plot(history.history[value], label=f"train_{name.lower()}") + plt.plot(history.history[f"val_{value}"], label=f"val_{name.lower()}") + plt.xlabel("Epochs") + plt.ylabel(name) + plt.title(f"Train and Validation {name} Over Epochs", fontsize=14) + plt.legend() + plt.grid() + plt.show() + + +plot_history("loss", "Loss") +plot_history("peak_signal_noise_ratio", "PSNR") + +""" +## Inference +""" + + +def plot_results(images, titles, figure_size=(12, 12)): + fig = plt.figure(figsize=figure_size) + for i in range(len(images)): + fig.add_subplot(1, len(images), i + 1).set_title(titles[i]) + _ = plt.imshow(images[i]) + plt.axis("off") + plt.show() + + +def infer(original_image): + image = keras.utils.img_to_array(original_image) + image = image.astype("float32") / 255.0 + image = np.expand_dims(image, axis=0) + output = model.predict(image, verbose=0) + output_image = output[0] * 255.0 + output_image = output_image.clip(0, 255) + output_image = output_image.reshape( + (np.shape(output_image)[0], np.shape(output_image)[1], 3) + ) + output_image = Image.fromarray(np.uint8(output_image)) + original_image = Image.fromarray(np.uint8(original_image)) + return output_image + + +""" +### Inference on Test Images + +We compare the test images from LOLDataset enhanced by MIRNet with images +enhanced via the `PIL.ImageOps.autocontrast()` function. + +You can use the trained model hosted on [Hugging Face Hub](https://huggingface.co/keras-io/lowlight-enhance-mirnet) +and try the demo on [Hugging Face Spaces](https://huggingface.co/spaces/keras-io/Enhance_Low_Light_Image). +""" + + +for low_light_image in random.sample(test_low_light_images, 6): + original_image = Image.open(low_light_image) + enhanced_image = infer(original_image) + plot_results( + [original_image, ImageOps.autocontrast(original_image), enhanced_image], + ["Original", "PIL Autocontrast", "MIRNet Enhanced"], + (20, 12), + ) diff --git a/knowledge_base/vision/mixup.py b/knowledge_base/vision/mixup.py new file mode 100644 index 0000000000000000000000000000000000000000..77c9c049ee1a613be109f4db60a6053b9a876117 --- /dev/null +++ b/knowledge_base/vision/mixup.py @@ -0,0 +1,242 @@ +""" +Title: MixUp augmentation for image classification +Author: [Sayak Paul](https://twitter.com/RisingSayak) +Date created: 2021/03/06 +Last modified: 2023/07/24 +Description: Data augmentation using the mixup technique for image classification. 
+Accelerator: GPU +""" + +""" +## Introduction +""" + +""" +_mixup_ is a *domain-agnostic* data augmentation technique proposed in [mixup: Beyond Empirical Risk Minimization](https://arxiv.org/abs/1710.09412) +by Zhang et al. It's implemented with the following formulas: + +![](https://i.ibb.co/DRyHYww/image.png) + +(Note that the lambda values are values with the [0, 1] range and are sampled from the +[Beta distribution](https://en.wikipedia.org/wiki/Beta_distribution).) + +The technique is quite systematically named. We are literally mixing up the features and +their corresponding labels. Implementation-wise it's simple. Neural networks are prone +to [memorizing corrupt labels](https://arxiv.org/abs/1611.03530). mixup relaxes this by +combining different features with one another (same happens for the labels too) so that +a network does not get overconfident about the relationship between the features and +their labels. + +mixup is specifically useful when we are not sure about selecting a set of augmentation +transforms for a given dataset, medical imaging datasets, for example. mixup can be +extended to a variety of data modalities such as computer vision, naturallanguage +processing, speech, and so on. +""" + +""" +## Setup +""" + +import os + +os.environ["KERAS_BACKEND"] = "tensorflow" + +import numpy as np +import keras +import matplotlib.pyplot as plt + +from keras import layers + +# TF imports related to tf.data preprocessing +from tensorflow import data as tf_data +from tensorflow import image as tf_image +from tensorflow.random import gamma as tf_random_gamma + + +""" +## Prepare the dataset + +In this example, we will be using the [FashionMNIST](https://github.com/zalandoresearch/fashion-mnist) dataset. But this same recipe can +be used for other classification datasets as well. +""" + +(x_train, y_train), (x_test, y_test) = keras.datasets.fashion_mnist.load_data() + +x_train = x_train.astype("float32") / 255.0 +x_train = np.reshape(x_train, (-1, 28, 28, 1)) +y_train = keras.ops.one_hot(y_train, 10) + +x_test = x_test.astype("float32") / 255.0 +x_test = np.reshape(x_test, (-1, 28, 28, 1)) +y_test = keras.ops.one_hot(y_test, 10) + +""" +## Define hyperparameters +""" + +AUTO = tf_data.AUTOTUNE +BATCH_SIZE = 64 +EPOCHS = 10 + +""" +## Convert the data into TensorFlow `Dataset` objects +""" + +# Put aside a few samples to create our validation set +val_samples = 2000 +x_val, y_val = x_train[:val_samples], y_train[:val_samples] +new_x_train, new_y_train = x_train[val_samples:], y_train[val_samples:] + +train_ds_one = ( + tf_data.Dataset.from_tensor_slices((new_x_train, new_y_train)) + .shuffle(BATCH_SIZE * 100) + .batch(BATCH_SIZE) +) +train_ds_two = ( + tf_data.Dataset.from_tensor_slices((new_x_train, new_y_train)) + .shuffle(BATCH_SIZE * 100) + .batch(BATCH_SIZE) +) +# Because we will be mixing up the images and their corresponding labels, we will be +# combining two shuffled datasets from the same training data. 
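+# Zipping two independently shuffled copies of the training set yields a fresh
+# random pairing of batches every epoch; `mix_up` will blend each such pair.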
+train_ds = tf_data.Dataset.zip((train_ds_one, train_ds_two)) + +val_ds = tf_data.Dataset.from_tensor_slices((x_val, y_val)).batch(BATCH_SIZE) + +test_ds = tf_data.Dataset.from_tensor_slices((x_test, y_test)).batch(BATCH_SIZE) + +""" +## Define the mixup technique function + +To perform the mixup routine, we create new virtual datasets using the training data from +the same dataset, and apply a lambda value within the [0, 1] range sampled from a [Beta distribution](https://en.wikipedia.org/wiki/Beta_distribution) +โ€” such that, for example, `new_x = lambda * x1 + (1 - lambda) * x2` (where +`x1` and `x2` are images) and the same equation is applied to the labels as well. +""" + + +def sample_beta_distribution(size, concentration_0=0.2, concentration_1=0.2): + gamma_1_sample = tf_random_gamma(shape=[size], alpha=concentration_1) + gamma_2_sample = tf_random_gamma(shape=[size], alpha=concentration_0) + return gamma_1_sample / (gamma_1_sample + gamma_2_sample) + + +def mix_up(ds_one, ds_two, alpha=0.2): + # Unpack two datasets + images_one, labels_one = ds_one + images_two, labels_two = ds_two + batch_size = keras.ops.shape(images_one)[0] + + # Sample lambda and reshape it to do the mixup + l = sample_beta_distribution(batch_size, alpha, alpha) + x_l = keras.ops.reshape(l, (batch_size, 1, 1, 1)) + y_l = keras.ops.reshape(l, (batch_size, 1)) + + # Perform mixup on both images and labels by combining a pair of images/labels + # (one from each dataset) into one image/label + images = images_one * x_l + images_two * (1 - x_l) + labels = labels_one * y_l + labels_two * (1 - y_l) + return (images, labels) + + +""" +**Note** that here , we are combining two images to create a single one. Theoretically, +we can combine as many we want but that comes at an increased computation cost. In +certain cases, it may not help improve the performance as well. +""" + +""" +## Visualize the new augmented dataset +""" + +# First create the new dataset using our `mix_up` utility +train_ds_mu = train_ds.map( + lambda ds_one, ds_two: mix_up(ds_one, ds_two, alpha=0.2), + num_parallel_calls=AUTO, +) + +# Let's preview 9 samples from the dataset +sample_images, sample_labels = next(iter(train_ds_mu)) +plt.figure(figsize=(10, 10)) +for i, (image, label) in enumerate(zip(sample_images[:9], sample_labels[:9])): + ax = plt.subplot(3, 3, i + 1) + plt.imshow(image.numpy().squeeze()) + print(label.numpy().tolist()) + plt.axis("off") + +""" +## Model building +""" + + +def get_training_model(): + model = keras.Sequential( + [ + layers.Input(shape=(28, 28, 1)), + layers.Conv2D(16, (5, 5), activation="relu"), + layers.MaxPooling2D(pool_size=(2, 2)), + layers.Conv2D(32, (5, 5), activation="relu"), + layers.MaxPooling2D(pool_size=(2, 2)), + layers.Dropout(0.2), + layers.GlobalAveragePooling2D(), + layers.Dense(128, activation="relu"), + layers.Dense(10, activation="softmax"), + ] + ) + return model + + +""" +For the sake of reproducibility, we serialize the initial random weights of our shallow +network. +""" + +initial_model = get_training_model() +initial_model.save_weights("initial_weights.weights.h5") + +""" +## 1. Train the model with the mixed up dataset +""" + +model = get_training_model() +model.load_weights("initial_weights.weights.h5") +model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"]) +model.fit(train_ds_mu, validation_data=val_ds, epochs=EPOCHS) +_, test_acc = model.evaluate(test_ds) +print("Test accuracy: {:.2f}%".format(test_acc * 100)) + +""" +## 2. 
Train the model *without* the mixed up dataset +""" + +model = get_training_model() +model.load_weights("initial_weights.weights.h5") +model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"]) +# Notice that we are NOT using the mixed up dataset here +model.fit(train_ds_one, validation_data=val_ds, epochs=EPOCHS) +_, test_acc = model.evaluate(test_ds) +print("Test accuracy: {:.2f}%".format(test_acc * 100)) + +""" +Readers are encouraged to try out mixup on different datasets from different domains and +experiment with the lambda parameter. You are strongly advised to check out the +[original paper](https://arxiv.org/abs/1710.09412) as well - the authors present several ablation studies on mixup +showing how it can improve generalization, as well as show their results of combining +more than two images to create a single one. +""" + +""" +## Notes + +* With mixup, you can create synthetic examples โ€” especially when you lack a large +dataset - without incurring high computational costs. +* [Label smoothing](https://www.pyimagesearch.com/2019/12/30/label-smoothing-with-keras-tensorflow-and-deep-learning/) and mixup usually do not work well together because label smoothing +already modifies the hard labels by some factor. +* mixup does not work well when you are using [Supervised Contrastive +Learning](https://arxiv.org/abs/2004.11362) (SCL) since SCL expects the true labels +during its pre-training phase. +* A few other benefits of mixup include (as described in the [paper](https://arxiv.org/abs/1710.09412)) robustness to +adversarial examples and stabilized GAN (Generative Adversarial Networks) training. +* There are a number of data augmentation techniques that extend mixup such as +[CutMix](https://arxiv.org/abs/1905.04899) and [AugMix](https://arxiv.org/abs/1912.02781). +""" diff --git a/knowledge_base/vision/mlp_image_classification.py b/knowledge_base/vision/mlp_image_classification.py new file mode 100644 index 0000000000000000000000000000000000000000..81d513cc7422dcb1fa451b658b4890e77a2e65cd --- /dev/null +++ b/knowledge_base/vision/mlp_image_classification.py @@ -0,0 +1,476 @@ +""" +Title: Image classification with modern MLP models +Author: [Khalid Salama](https://www.linkedin.com/in/khalid-salama-24403144/) +Date created: 2021/05/30 +Last modified: 2023/08/03 +Description: Implementing the MLP-Mixer, FNet, and gMLP models for CIFAR-100 image classification. +Accelerator: GPU +""" + +""" +## Introduction + +This example implements three modern attention-free, multi-layer perceptron (MLP) based models for image +classification, demonstrated on the CIFAR-100 dataset: + +1. The [MLP-Mixer](https://arxiv.org/abs/2105.01601) model, by Ilya Tolstikhin et al., based on two types of MLPs. +3. The [FNet](https://arxiv.org/abs/2105.03824) model, by James Lee-Thorp et al., based on unparameterized +Fourier Transform. +2. The [gMLP](https://arxiv.org/abs/2105.08050) model, by Hanxiao Liu et al., based on MLP with gating. + +The purpose of the example is not to compare between these models, as they might perform differently on +different datasets with well-tuned hyperparameters. Rather, it is to show simple implementations of their +main building blocks. 
+""" + +""" +## Setup +""" + +import numpy as np +import keras +from keras import layers + +""" +## Prepare the data +""" + +num_classes = 100 +input_shape = (32, 32, 3) + +(x_train, y_train), (x_test, y_test) = keras.datasets.cifar100.load_data() + +print(f"x_train shape: {x_train.shape} - y_train shape: {y_train.shape}") +print(f"x_test shape: {x_test.shape} - y_test shape: {y_test.shape}") + +""" +## Configure the hyperparameters +""" + +weight_decay = 0.0001 +batch_size = 128 +num_epochs = 1 # Recommended num_epochs = 50 +dropout_rate = 0.2 +image_size = 64 # We'll resize input images to this size. +patch_size = 8 # Size of the patches to be extracted from the input images. +num_patches = (image_size // patch_size) ** 2 # Size of the data array. +embedding_dim = 256 # Number of hidden units. +num_blocks = 4 # Number of blocks. + +print(f"Image size: {image_size} X {image_size} = {image_size ** 2}") +print(f"Patch size: {patch_size} X {patch_size} = {patch_size ** 2} ") +print(f"Patches per image: {num_patches}") +print(f"Elements per patch (3 channels): {(patch_size ** 2) * 3}") + +""" +## Build a classification model + +We implement a method that builds a classifier given the processing blocks. +""" + + +def build_classifier(blocks, positional_encoding=False): + inputs = layers.Input(shape=input_shape) + # Augment data. + augmented = data_augmentation(inputs) + # Create patches. + patches = Patches(patch_size)(augmented) + # Encode patches to generate a [batch_size, num_patches, embedding_dim] tensor. + x = layers.Dense(units=embedding_dim)(patches) + if positional_encoding: + x = x + PositionEmbedding(sequence_length=num_patches)(x) + # Process x using the module blocks. + x = blocks(x) + # Apply global average pooling to generate a [batch_size, embedding_dim] representation tensor. + representation = layers.GlobalAveragePooling1D()(x) + # Apply dropout. + representation = layers.Dropout(rate=dropout_rate)(representation) + # Compute logits outputs. + logits = layers.Dense(num_classes)(representation) + # Create the Keras model. + return keras.Model(inputs=inputs, outputs=logits) + + +""" +## Define an experiment + +We implement a utility function to compile, train, and evaluate a given model. +""" + + +def run_experiment(model): + # Create Adam optimizer with weight decay. + optimizer = keras.optimizers.AdamW( + learning_rate=learning_rate, + weight_decay=weight_decay, + ) + # Compile the model. + model.compile( + optimizer=optimizer, + loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), + metrics=[ + keras.metrics.SparseCategoricalAccuracy(name="acc"), + keras.metrics.SparseTopKCategoricalAccuracy(5, name="top5-acc"), + ], + ) + # Create a learning rate scheduler callback. + reduce_lr = keras.callbacks.ReduceLROnPlateau( + monitor="val_loss", factor=0.5, patience=5 + ) + # Create an early stopping callback. + early_stopping = keras.callbacks.EarlyStopping( + monitor="val_loss", patience=10, restore_best_weights=True + ) + # Fit the model. + history = model.fit( + x=x_train, + y=y_train, + batch_size=batch_size, + epochs=num_epochs, + validation_split=0.1, + callbacks=[early_stopping, reduce_lr], + ) + + _, accuracy, top_5_accuracy = model.evaluate(x_test, y_test) + print(f"Test accuracy: {round(accuracy * 100, 2)}%") + print(f"Test top 5 accuracy: {round(top_5_accuracy * 100, 2)}%") + + # Return history to plot learning curves. 
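+    # (`history.history` will hold the "loss", "acc" and "top5-acc" curves,
+    # plus their "val_"-prefixed counterparts.)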
+ return history + + +""" +## Use data augmentation +""" + +data_augmentation = keras.Sequential( + [ + layers.Normalization(), + layers.Resizing(image_size, image_size), + layers.RandomFlip("horizontal"), + layers.RandomZoom(height_factor=0.2, width_factor=0.2), + ], + name="data_augmentation", +) +# Compute the mean and the variance of the training data for normalization. +data_augmentation.layers[0].adapt(x_train) + + +""" +## Implement patch extraction as a layer +""" + + +class Patches(layers.Layer): + def __init__(self, patch_size, **kwargs): + super().__init__(**kwargs) + self.patch_size = patch_size + + def call(self, x): + patches = keras.ops.image.extract_patches(x, self.patch_size) + batch_size = keras.ops.shape(patches)[0] + num_patches = keras.ops.shape(patches)[1] * keras.ops.shape(patches)[2] + patch_dim = keras.ops.shape(patches)[3] + out = keras.ops.reshape(patches, (batch_size, num_patches, patch_dim)) + return out + + +""" +## Implement position embedding as a layer +""" + + +class PositionEmbedding(keras.layers.Layer): + def __init__( + self, + sequence_length, + initializer="glorot_uniform", + **kwargs, + ): + super().__init__(**kwargs) + if sequence_length is None: + raise ValueError("`sequence_length` must be an Integer, received `None`.") + self.sequence_length = int(sequence_length) + self.initializer = keras.initializers.get(initializer) + + def get_config(self): + config = super().get_config() + config.update( + { + "sequence_length": self.sequence_length, + "initializer": keras.initializers.serialize(self.initializer), + } + ) + return config + + def build(self, input_shape): + feature_size = input_shape[-1] + self.position_embeddings = self.add_weight( + name="embeddings", + shape=[self.sequence_length, feature_size], + initializer=self.initializer, + trainable=True, + ) + + super().build(input_shape) + + def call(self, inputs, start_index=0): + shape = keras.ops.shape(inputs) + feature_length = shape[-1] + sequence_length = shape[-2] + # trim to match the length of the input sequence, which might be less + # than the sequence_length of the layer. + position_embeddings = keras.ops.convert_to_tensor(self.position_embeddings) + position_embeddings = keras.ops.slice( + position_embeddings, + (start_index, 0), + (sequence_length, feature_length), + ) + return keras.ops.broadcast_to(position_embeddings, shape) + + def compute_output_shape(self, input_shape): + return input_shape + + +""" +## The MLP-Mixer model + +The MLP-Mixer is an architecture based exclusively on +multi-layer perceptrons (MLPs), that contains two types of MLP layers: + +1. One applied independently to image patches, which mixes the per-location features. +2. The other applied across patches (along channels), which mixes spatial information. + +This is similar to a [depthwise separable convolution based model](https://arxiv.org/abs/1610.02357) +such as the Xception model, but with two chained dense transforms, no max pooling, and layer normalization +instead of batch normalization. 
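+
+To make the two mixing directions concrete, here is a minimal shape sketch using
+this example's settings (`num_patches = 64`, `embedding_dim = 256`). It only
+illustrates which axis each MLP operates on and is not part of the model code:
+
+```python
+x = np.zeros((1, 64, 256))  # (batch, patches, channels)
+token_mixing_view = np.transpose(x, (0, 2, 1))  # (1, 256, 64): mlp1 mixes the 64 patches
+channel_mixing_view = x  # (1, 64, 256): mlp2 mixes the 256 channels of each patch
+print(token_mixing_view.shape, channel_mixing_view.shape)
+```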
+""" + +""" +### Implement the MLP-Mixer module +""" + + +class MLPMixerLayer(layers.Layer): + def __init__(self, num_patches, hidden_units, dropout_rate, *args, **kwargs): + super().__init__(*args, **kwargs) + + self.mlp1 = keras.Sequential( + [ + layers.Dense(units=num_patches, activation="gelu"), + layers.Dense(units=num_patches), + layers.Dropout(rate=dropout_rate), + ] + ) + self.mlp2 = keras.Sequential( + [ + layers.Dense(units=num_patches, activation="gelu"), + layers.Dense(units=hidden_units), + layers.Dropout(rate=dropout_rate), + ] + ) + self.normalize = layers.LayerNormalization(epsilon=1e-6) + + def build(self, input_shape): + return super().build(input_shape) + + def call(self, inputs): + # Apply layer normalization. + x = self.normalize(inputs) + # Transpose inputs from [num_batches, num_patches, hidden_units] to [num_batches, hidden_units, num_patches]. + x_channels = keras.ops.transpose(x, axes=(0, 2, 1)) + # Apply mlp1 on each channel independently. + mlp1_outputs = self.mlp1(x_channels) + # Transpose mlp1_outputs from [num_batches, hidden_units, num_patches] to [num_batches, num_patches, hidden_units]. + mlp1_outputs = keras.ops.transpose(mlp1_outputs, axes=(0, 2, 1)) + # Add skip connection. + x = mlp1_outputs + inputs + # Apply layer normalization. + x_patches = self.normalize(x) + # Apply mlp2 on each patch independtenly. + mlp2_outputs = self.mlp2(x_patches) + # Add skip connection. + x = x + mlp2_outputs + return x + + +""" +### Build, train, and evaluate the MLP-Mixer model + +Note that training the model with the current settings on a V100 GPUs +takes around 8 seconds per epoch. +""" + +mlpmixer_blocks = keras.Sequential( + [MLPMixerLayer(num_patches, embedding_dim, dropout_rate) for _ in range(num_blocks)] +) +learning_rate = 0.005 +mlpmixer_classifier = build_classifier(mlpmixer_blocks) +history = run_experiment(mlpmixer_classifier) + +""" +The MLP-Mixer model tends to have much less number of parameters compared +to convolutional and transformer-based models, which leads to less training and +serving computational cost. + +As mentioned in the [MLP-Mixer](https://arxiv.org/abs/2105.01601) paper, +when pre-trained on large datasets, or with modern regularization schemes, +the MLP-Mixer attains competitive scores to state-of-the-art models. +You can obtain better results by increasing the embedding dimensions, +increasing the number of mixer blocks, and training the model for longer. +You may also try to increase the size of the input images and use different patch sizes. +""" + +""" +## The FNet model + +The FNet uses a similar block to the Transformer block. However, FNet replaces the self-attention layer +in the Transformer block with a parameter-free 2D Fourier transformation layer: + +1. One 1D Fourier Transform is applied along the patches. +2. One 1D Fourier Transform is applied along the channels. +""" + +""" +### Implement the FNet module +""" + + +class FNetLayer(layers.Layer): + def __init__(self, embedding_dim, dropout_rate, *args, **kwargs): + super().__init__(*args, **kwargs) + + self.ffn = keras.Sequential( + [ + layers.Dense(units=embedding_dim, activation="gelu"), + layers.Dropout(rate=dropout_rate), + layers.Dense(units=embedding_dim), + ] + ) + + self.normalize1 = layers.LayerNormalization(epsilon=1e-6) + self.normalize2 = layers.LayerNormalization(epsilon=1e-6) + + def call(self, inputs): + # Apply fourier transformations. 
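+        # FNet replaces self-attention with an unparameterized 2D DFT over the
+        # patch and channel axes. `keras.ops.fft2` consumes and returns a
+        # (real, imaginary) pair of tensors, so we pass zeros for the imaginary
+        # part and keep only the real component (index 0) of the result.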
+ real_part = inputs + im_part = keras.ops.zeros_like(inputs) + x = keras.ops.fft2((real_part, im_part))[0] + # Add skip connection. + x = x + inputs + # Apply layer normalization. + x = self.normalize1(x) + # Apply Feedfowrad network. + x_ffn = self.ffn(x) + # Add skip connection. + x = x + x_ffn + # Apply layer normalization. + return self.normalize2(x) + + +""" +### Build, train, and evaluate the FNet model + +Note that training the model with the current settings on a V100 GPUs +takes around 8 seconds per epoch. +""" + +fnet_blocks = keras.Sequential( + [FNetLayer(embedding_dim, dropout_rate) for _ in range(num_blocks)] +) +learning_rate = 0.001 +fnet_classifier = build_classifier(fnet_blocks, positional_encoding=True) +history = run_experiment(fnet_classifier) + +""" +As shown in the [FNet](https://arxiv.org/abs/2105.03824) paper, +better results can be achieved by increasing the embedding dimensions, +increasing the number of FNet blocks, and training the model for longer. +You may also try to increase the size of the input images and use different patch sizes. +The FNet scales very efficiently to long inputs, runs much faster than attention-based +Transformer models, and produces competitive accuracy results. +""" + +""" +## The gMLP model + +The gMLP is a MLP architecture that features a Spatial Gating Unit (SGU). +The SGU enables cross-patch interactions across the spatial (channel) dimension, by: + +1. Transforming the input spatially by applying linear projection across patches (along channels). +2. Applying element-wise multiplication of the input and its spatial transformation. +""" + +""" +### Implement the gMLP module +""" + + +class gMLPLayer(layers.Layer): + def __init__(self, num_patches, embedding_dim, dropout_rate, *args, **kwargs): + super().__init__(*args, **kwargs) + + self.channel_projection1 = keras.Sequential( + [ + layers.Dense(units=embedding_dim * 2, activation="gelu"), + layers.Dropout(rate=dropout_rate), + ] + ) + + self.channel_projection2 = layers.Dense(units=embedding_dim) + + self.spatial_projection = layers.Dense( + units=num_patches, bias_initializer="Ones" + ) + + self.normalize1 = layers.LayerNormalization(epsilon=1e-6) + self.normalize2 = layers.LayerNormalization(epsilon=1e-6) + + def spatial_gating_unit(self, x): + # Split x along the channel dimensions. + # Tensors u and v will in the shape of [batch_size, num_patchs, embedding_dim]. + u, v = keras.ops.split(x, indices_or_sections=2, axis=2) + # Apply layer normalization. + v = self.normalize2(v) + # Apply spatial projection. + v_channels = keras.ops.transpose(v, axes=(0, 2, 1)) + v_projected = self.spatial_projection(v_channels) + v_projected = keras.ops.transpose(v_projected, axes=(0, 2, 1)) + # Apply element-wise multiplication. + return u * v_projected + + def call(self, inputs): + # Apply layer normalization. + x = self.normalize1(inputs) + # Apply the first channel projection. x_projected shape: [batch_size, num_patches, embedding_dim * 2]. + x_projected = self.channel_projection1(x) + # Apply the spatial gating unit. x_spatial shape: [batch_size, num_patches, embedding_dim]. + x_spatial = self.spatial_gating_unit(x_projected) + # Apply the second channel projection. x_projected shape: [batch_size, num_patches, embedding_dim]. + x_projected = self.channel_projection2(x_spatial) + # Add skip connection. + return x + x_projected + + +""" +### Build, train, and evaluate the gMLP model + +Note that training the model with the current settings on a V100 GPUs +takes around 9 seconds per epoch. 
+""" + +gmlp_blocks = keras.Sequential( + [gMLPLayer(num_patches, embedding_dim, dropout_rate) for _ in range(num_blocks)] +) +learning_rate = 0.003 +gmlp_classifier = build_classifier(gmlp_blocks) +history = run_experiment(gmlp_classifier) + +""" +As shown in the [gMLP](https://arxiv.org/abs/2105.08050) paper, +better results can be achieved by increasing the embedding dimensions, +increasing the number of gMLP blocks, and training the model for longer. +You may also try to increase the size of the input images and use different patch sizes. +Note that, the paper used advanced regularization strategies, such as MixUp and CutMix, +as well as AutoAugment. +""" diff --git a/knowledge_base/vision/mnist_convnet.py b/knowledge_base/vision/mnist_convnet.py new file mode 100644 index 0000000000000000000000000000000000000000..de0f355e652c4fb98f8c80c5e69f3d8aa5f928ea --- /dev/null +++ b/knowledge_base/vision/mnist_convnet.py @@ -0,0 +1,80 @@ +""" +Title: Simple MNIST convnet +Author: [fchollet](https://twitter.com/fchollet) +Date created: 2015/06/19 +Last modified: 2020/04/21 +Description: A simple convnet that achieves ~99% test accuracy on MNIST. +Accelerator: GPU +""" + +""" +## Setup +""" + +import numpy as np +import keras +from keras import layers + +""" +## Prepare the data +""" + +# Model / data parameters +num_classes = 10 +input_shape = (28, 28, 1) + +# Load the data and split it between train and test sets +(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data() + +# Scale images to the [0, 1] range +x_train = x_train.astype("float32") / 255 +x_test = x_test.astype("float32") / 255 +# Make sure images have shape (28, 28, 1) +x_train = np.expand_dims(x_train, -1) +x_test = np.expand_dims(x_test, -1) +print("x_train shape:", x_train.shape) +print(x_train.shape[0], "train samples") +print(x_test.shape[0], "test samples") + + +# convert class vectors to binary class matrices +y_train = keras.utils.to_categorical(y_train, num_classes) +y_test = keras.utils.to_categorical(y_test, num_classes) + +""" +## Build the model +""" + +model = keras.Sequential( + [ + keras.Input(shape=input_shape), + layers.Conv2D(32, kernel_size=(3, 3), activation="relu"), + layers.MaxPooling2D(pool_size=(2, 2)), + layers.Conv2D(64, kernel_size=(3, 3), activation="relu"), + layers.MaxPooling2D(pool_size=(2, 2)), + layers.Flatten(), + layers.Dropout(0.5), + layers.Dense(num_classes, activation="softmax"), + ] +) + +model.summary() + +""" +## Train the model +""" + +batch_size = 128 +epochs = 15 + +model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"]) + +model.fit(x_train, y_train, batch_size=batch_size, epochs=epochs, validation_split=0.1) + +""" +## Evaluate the trained model +""" + +score = model.evaluate(x_test, y_test, verbose=0) +print("Test loss:", score[0]) +print("Test accuracy:", score[1]) diff --git a/knowledge_base/vision/mobilevit.py b/knowledge_base/vision/mobilevit.py new file mode 100644 index 0000000000000000000000000000000000000000..630de253db19b2c5f1a5e30b7eb9e6397d99ecd4 --- /dev/null +++ b/knowledge_base/vision/mobilevit.py @@ -0,0 +1,404 @@ +""" +Title: MobileViT: A mobile-friendly Transformer-based model for image classification +Author: [Sayak Paul](https://twitter.com/RisingSayak) +Date created: 2021/10/20 +Last modified: 2025/09/30 +Description: MobileViT for image classification with combined benefits of convolutions and Transformers. 
+Accelerator: GPU +""" + +""" +## Introduction + +In this example, we implement the MobileViT architecture +([Mehta et al.](https://arxiv.org/abs/2110.02178)), +which combines the benefits of Transformers +([Vaswani et al.](https://arxiv.org/abs/1706.03762)) +and convolutions. With Transformers, we can capture long-range dependencies that result +in global representations. With convolutions, we can capture spatial relationships that +model locality. + +Besides combining the properties of Transformers and convolutions, the authors introduce +MobileViT as a general-purpose mobile-friendly backbone for different image recognition +tasks. Their findings suggest that, performance-wise, MobileViT is better than other +models with the same or higher complexity ([MobileNetV3](https://arxiv.org/abs/1905.02244), +for example), while being efficient on mobile devices. + +Note: This example should be run with Tensorflow 2.13 and higher. +""" + +""" +## Imports +""" + +import os +import tensorflow as tf + +os.environ["KERAS_BACKEND"] = "tensorflow" + +import keras +from keras import layers +from keras import backend + +import tensorflow_datasets as tfds + +tfds.disable_progress_bar() + +""" +## Hyperparameters +""" + +# Values are from table 4. +patch_size = 4 # 2x2, for the Transformer blocks. +image_size = 256 +expansion_factor = 2 # expansion factor for the MobileNetV2 blocks. + +""" +## MobileViT utilities + +The MobileViT architecture is comprised of the following blocks: + +* Strided 3x3 convolutions that process the input image. +* [MobileNetV2](https://arxiv.org/abs/1801.04381)-style inverted residual blocks for +downsampling the resolution of the intermediate feature maps. +* MobileViT blocks that combine the benefits of Transformers and convolutions. It is +presented in the figure below (taken from the +[original paper](https://arxiv.org/abs/2110.02178)): + + +![](https://i.imgur.com/mANnhI7.png) +""" + + +def conv_block(x, filters=16, kernel_size=3, strides=2): + conv_layer = layers.Conv2D( + filters, + kernel_size, + strides=strides, + activation=keras.activations.swish, + padding="same", + ) + return conv_layer(x) + + +# Reference: https://github.com/keras-team/keras/blob/e3858739d178fe16a0c77ce7fab88b0be6dbbdc7/keras/applications/imagenet_utils.py#L413C17-L435 + + +def correct_pad(inputs, kernel_size): + img_dim = 2 if backend.image_data_format() == "channels_first" else 1 + input_size = inputs.shape[img_dim : (img_dim + 2)] + if isinstance(kernel_size, int): + kernel_size = (kernel_size, kernel_size) + if input_size[0] is None: + adjust = (1, 1) + else: + adjust = (1 - input_size[0] % 2, 1 - input_size[1] % 2) + correct = (kernel_size[0] // 2, kernel_size[1] // 2) + return ( + (correct[0] - adjust[0], correct[0]), + (correct[1] - adjust[1], correct[1]), + ) + + +# Reference: https://git.io/JKgtC + + +def inverted_residual_block(x, expanded_channels, output_channels, strides=1): + m = layers.Conv2D(expanded_channels, 1, padding="same", use_bias=False)(x) + m = layers.BatchNormalization()(m) + m = keras.activations.swish(m) + + if strides == 2: + m = layers.ZeroPadding2D(padding=correct_pad(m, 3))(m) + m = layers.DepthwiseConv2D( + 3, strides=strides, padding="same" if strides == 1 else "valid", use_bias=False + )(m) + m = layers.BatchNormalization()(m) + m = keras.activations.swish(m) + + m = layers.Conv2D(output_channels, 1, padding="same", use_bias=False)(m) + m = layers.BatchNormalization()(m) + + if keras.ops.equal(x.shape[-1], output_channels) and strides == 1: + return 
layers.Add()([m, x]) + return m + + +# Reference: +# https://keras.io/examples/vision/image_classification_with_vision_transformer/ + + +def mlp(x, hidden_units, dropout_rate): + for units in hidden_units: + x = layers.Dense(units, activation=keras.activations.swish)(x) + x = layers.Dropout(dropout_rate)(x) + return x + + +def transformer_block(x, transformer_layers, projection_dim, num_heads=2): + for _ in range(transformer_layers): + # Layer normalization 1. + x1 = layers.LayerNormalization(epsilon=1e-6)(x) + # Create a multi-head attention layer. + attention_output = layers.MultiHeadAttention( + num_heads=num_heads, key_dim=projection_dim, dropout=0.1 + )(x1, x1) + # Skip connection 1. + x2 = layers.Add()([attention_output, x]) + # Layer normalization 2. + x3 = layers.LayerNormalization(epsilon=1e-6)(x2) + # MLP. + x3 = mlp( + x3, + hidden_units=[x.shape[-1] * 2, x.shape[-1]], + dropout_rate=0.1, + ) + # Skip connection 2. + x = layers.Add()([x3, x2]) + + return x + + +def mobilevit_block(x, num_blocks, projection_dim, strides=1): + # Local projection with convolutions. + local_features = conv_block(x, filters=projection_dim, strides=strides) + local_features = conv_block( + local_features, filters=projection_dim, kernel_size=1, strides=strides + ) + + # Unfold into patches and then pass through Transformers. + num_patches = int((local_features.shape[1] * local_features.shape[2]) / patch_size) + non_overlapping_patches = layers.Reshape((patch_size, num_patches, projection_dim))( + local_features + ) + global_features = transformer_block( + non_overlapping_patches, num_blocks, projection_dim + ) + + # Fold into conv-like feature-maps. + folded_feature_map = layers.Reshape((*local_features.shape[1:-1], projection_dim))( + global_features + ) + + # Apply point-wise conv -> concatenate with the input features. + folded_feature_map = conv_block( + folded_feature_map, filters=x.shape[-1], kernel_size=1, strides=strides + ) + local_global_features = layers.Concatenate(axis=-1)([x, folded_feature_map]) + + # Fuse the local and global features using a convoluion layer. + local_global_features = conv_block( + local_global_features, filters=projection_dim, strides=strides + ) + + return local_global_features + + +""" +**More on the MobileViT block**: + +* First, the feature representations (A) go through convolution blocks that capture local +relationships. The expected shape of a single entry here would be `(h, w, num_channels)`. +* Then they get unfolded into another vector with shape `(p, n, num_channels)`, +where `p` is the area of a small patch, and `n` is `(h * w) / p`. So, we end up with `n` +non-overlapping patches. +* This unfolded vector is then passed through a Tranformer block that captures global +relationships between the patches. +* The output vector (B) is again folded into a vector of shape `(h, w, num_channels)` +resembling a feature map coming out of convolutions. + +Vectors A and B are then passed through two more convolutional layers to fuse the local +and global representations. Notice how the spatial resolution of the final vector remains +unchanged at this point. The authors also present an explanation of how the MobileViT +block resembles a convolution block of a CNN. For more details, please refer to the +original paper. +""" + +""" +Next, we combine these blocks together and implement the MobileViT architecture (XXS +variant). 
The following figure (taken from the original paper) presents a schematic +representation of the architecture: + +![](https://i.ibb.co/sRbVRBN/image.png) +""" + + +def create_mobilevit(num_classes=5): + inputs = keras.Input((image_size, image_size, 3)) + x = layers.Rescaling(scale=1.0 / 255)(inputs) + + # Initial conv-stem -> MV2 block. + x = conv_block(x, filters=16) + x = inverted_residual_block( + x, expanded_channels=16 * expansion_factor, output_channels=16 + ) + + # Downsampling with MV2 block. + x = inverted_residual_block( + x, expanded_channels=16 * expansion_factor, output_channels=24, strides=2 + ) + x = inverted_residual_block( + x, expanded_channels=24 * expansion_factor, output_channels=24 + ) + x = inverted_residual_block( + x, expanded_channels=24 * expansion_factor, output_channels=24 + ) + + # First MV2 -> MobileViT block. + x = inverted_residual_block( + x, expanded_channels=24 * expansion_factor, output_channels=48, strides=2 + ) + x = mobilevit_block(x, num_blocks=2, projection_dim=64) + + # Second MV2 -> MobileViT block. + x = inverted_residual_block( + x, expanded_channels=64 * expansion_factor, output_channels=64, strides=2 + ) + x = mobilevit_block(x, num_blocks=4, projection_dim=80) + + # Third MV2 -> MobileViT block. + x = inverted_residual_block( + x, expanded_channels=80 * expansion_factor, output_channels=80, strides=2 + ) + x = mobilevit_block(x, num_blocks=3, projection_dim=96) + x = conv_block(x, filters=320, kernel_size=1, strides=1) + + # Classification head. + x = layers.GlobalAvgPool2D()(x) + outputs = layers.Dense(num_classes, activation="softmax")(x) + + return keras.Model(inputs, outputs) + + +mobilevit_xxs = create_mobilevit() +mobilevit_xxs.summary() + +""" +## Dataset preparation + +We will be using the +[`tf_flowers`](https://www.tensorflow.org/datasets/catalog/tf_flowers) +dataset to demonstrate the model. Unlike other Transformer-based architectures, +MobileViT uses a simple augmentation pipeline primarily because it has the properties +of a CNN. +""" + +batch_size = 64 +auto = tf.data.AUTOTUNE +resize_bigger = 280 +num_classes = 5 + + +def preprocess_dataset(is_training=True): + def _pp(image, label): + if is_training: + # Resize to a bigger spatial resolution and take the random + # crops. + image = tf.image.resize(image, (resize_bigger, resize_bigger)) + image = tf.image.random_crop(image, (image_size, image_size, 3)) + image = tf.image.random_flip_left_right(image) + else: + image = tf.image.resize(image, (image_size, image_size)) + label = tf.one_hot(label, depth=num_classes) + return image, label + + return _pp + + +def prepare_dataset(dataset, is_training=True): + if is_training: + dataset = dataset.shuffle(batch_size * 10) + dataset = dataset.map(preprocess_dataset(is_training), num_parallel_calls=auto) + return dataset.batch(batch_size).prefetch(auto) + + +""" +The authors use a multi-scale data sampler to help the model learn representations of +varied scales. In this example, we discard this part. 
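+
+Purely as an illustration of that idea (it is not used in this example), a
+minimal multi-scale sampler could resize every batch to a randomly chosen
+resolution. The candidate resolutions below are hypothetical, and this assumes a
+model built with flexible spatial dimensions:
+
+```python
+scales = tf.constant([160, 192, 256])  # hypothetical candidate resolutions
+
+
+def random_scale_resize(images, labels):
+    # Pick one resolution per batch and resize the whole batch to it.
+    idx = tf.random.uniform((), maxval=tf.shape(scales)[0], dtype=tf.int32)
+    size = tf.stack([scales[idx], scales[idx]])
+    return tf.image.resize(images, size), labels
+
+
+# e.g. applied to the batched `train_dataset` created below:
+# multi_scale_ds = train_dataset.map(random_scale_resize, num_parallel_calls=auto)
+```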
+""" + +""" +## Load and prepare the dataset +""" + +train_dataset, val_dataset = tfds.load( + "tf_flowers", split=["train[:90%]", "train[90%:]"], as_supervised=True +) + +num_train = train_dataset.cardinality() +num_val = val_dataset.cardinality() +print(f"Number of training examples: {num_train}") +print(f"Number of validation examples: {num_val}") + +train_dataset = prepare_dataset(train_dataset, is_training=True) +val_dataset = prepare_dataset(val_dataset, is_training=False) + +""" +## Train a MobileViT (XXS) model +""" + +learning_rate = 0.002 +label_smoothing_factor = 0.1 +epochs = 30 + +optimizer = keras.optimizers.Adam(learning_rate=learning_rate) +loss_fn = keras.losses.CategoricalCrossentropy(label_smoothing=label_smoothing_factor) + + +def run_experiment(epochs=epochs): + mobilevit_xxs = create_mobilevit(num_classes=num_classes) + mobilevit_xxs.compile(optimizer=optimizer, loss=loss_fn, metrics=["accuracy"]) + + # When using `save_weights_only=True` in `ModelCheckpoint`, the filepath provided must end in `.weights.h5` + checkpoint_filepath = "/tmp/checkpoint.weights.h5" + checkpoint_callback = keras.callbacks.ModelCheckpoint( + checkpoint_filepath, + monitor="val_accuracy", + save_best_only=True, + save_weights_only=True, + ) + + mobilevit_xxs.fit( + train_dataset, + validation_data=val_dataset, + epochs=epochs, + callbacks=[checkpoint_callback], + ) + mobilevit_xxs.load_weights(checkpoint_filepath) + _, accuracy = mobilevit_xxs.evaluate(val_dataset) + print(f"Validation accuracy: {round(accuracy * 100, 2)}%") + return mobilevit_xxs + + +mobilevit_xxs = run_experiment() + +""" +## Results and TFLite conversion + +With about one million parameters, getting to ~85% top-1 accuracy on 256x256 resolution is +a strong result. This MobileViT mobile is fully compatible with TensorFlow Lite (TFLite) +and can be converted with the following code: +""" + +# Serialize the model as a SavedModel. +tf.saved_model.save(mobilevit_xxs, "mobilevit_xxs") + +# Convert to TFLite. This form of quantization is called +# post-training dynamic-range quantization in TFLite. +converter = tf.lite.TFLiteConverter.from_saved_model("mobilevit_xxs") +converter.optimizations = [tf.lite.Optimize.DEFAULT] +converter.target_spec.supported_ops = [ + tf.lite.OpsSet.TFLITE_BUILTINS, # Enable TensorFlow Lite ops. + tf.lite.OpsSet.SELECT_TF_OPS, # Enable TensorFlow ops. +] +tflite_model = converter.convert() +open("mobilevit_xxs.tflite", "wb").write(tflite_model) + +""" +To learn more about different quantization recipes available in TFLite and running +inference with TFLite models, check out +[this official resource](https://www.tensorflow.org/lite/performance/post_training_quantization). + +You can use the trained model hosted on [Hugging Face Hub](https://huggingface.co/keras-io/mobile-vit-xxs) +and try the demo on [Hugging Face Spaces](https://huggingface.co/spaces/keras-io/Flowers-Classification-MobileViT). +""" diff --git a/knowledge_base/vision/near_dup_search.py b/knowledge_base/vision/near_dup_search.py new file mode 100644 index 0000000000000000000000000000000000000000..0030edacbb1e700996e035fa4a0c28e33bd2306e --- /dev/null +++ b/knowledge_base/vision/near_dup_search.py @@ -0,0 +1,562 @@ +""" +Title: Near-duplicate image search +Author: [Sayak Paul](https://twitter.com/RisingSayak) +Date created: 2021/09/10 +Last modified: 2023/08/30 +Description: Building a near-duplicate image search utility using deep learning and locality-sensitive hashing. 
+Accelerator: GPU +""" + +""" +## Introduction + +Fetching similar images in (near) real time is an important use case of information +retrieval systems. Some popular products utilizing it include Pinterest, Google Image +Search, etc. In this example, we will build a similar image search utility using +[Locality Sensitive Hashing](https://towardsdatascience.com/understanding-locality-sensitive-hashing-49f6d1f6134) +(LSH) and [random projection](https://en.wikipedia.org/wiki/Random_projection) on top +of the image representations computed by a pretrained image classifier. +This kind of search engine is also known +as a _near-duplicate (or near-dup) image detector_. +We will also look into optimizing the inference performance of +our search utility on GPU using [TensorRT](https://developer.nvidia.com/tensorrt). + +There are other examples under [keras.io/examples/vision](https://keras.io/examples/vision) +that are worth checking out in this regard: + +* [Metric learning for image similarity search](https://keras.io/examples/vision/metric_learning) +* [Image similarity estimation using a Siamese Network with a triplet loss](https://keras.io/examples/vision/siamese_network) + +Finally, this example uses the following resource as a reference and as such reuses some +of its code: +[Locality Sensitive Hashing for Similar Item Search](https://towardsdatascience.com/locality-sensitive-hashing-for-music-search-f2f1940ace23). + +_Note that in order to optimize the performance of our parser, +you should have a GPU runtime available._ +""" + +""" +## Setup +""" + +"""shell +pip install tensorrt +""" + +""" +## Imports +""" + +import matplotlib.pyplot as plt +import tensorflow as tf +import tensorrt +import numpy as np +import time + +import tensorflow_datasets as tfds + +tfds.disable_progress_bar() + +""" +## Load the dataset and create a training set of 1,000 images + +To keep the run time of the example short, we will be using a subset of 1,000 images from +the `tf_flowers` dataset (available through +[TensorFlow Datasets](https://www.tensorflow.org/datasets/catalog/tf_flowers)) +to build our vocabulary. +""" + +train_ds, validation_ds = tfds.load( + "tf_flowers", split=["train[:85%]", "train[85%:]"], as_supervised=True +) + +IMAGE_SIZE = 224 +NUM_IMAGES = 1000 + +images = [] +labels = [] + +for image, label in train_ds.take(NUM_IMAGES): + image = tf.image.resize(image, (IMAGE_SIZE, IMAGE_SIZE)) + images.append(image.numpy()) + labels.append(label.numpy()) + +images = np.array(images) +labels = np.array(labels) + +""" +## Load a pre-trained model +""" + +""" +In this section, we load an image classification model that was trained on the +`tf_flowers` dataset. 85% of the total images were used to build the training set. For +more details on the training, refer to +[this notebook](https://github.com/sayakpaul/near-dup-parser/blob/main/bit-supervised-training.ipynb). + +The underlying model is a BiT-ResNet (proposed in +[Big Transfer (BiT): General Visual Representation Learning](https://arxiv.org/abs/1912.11370)). +The BiT-ResNet family of models is known to provide excellent transfer performance across +a wide variety of different downstream tasks. 
+""" + +"""shell +wget -q https://github.com/sayakpaul/near-dup-parser/releases/download/v0.1.0/flower_model_bit_0.96875.zip +unzip -qq flower_model_bit_0.96875.zip +""" + +bit_model = tf.keras.models.load_model("flower_model_bit_0.96875") +bit_model.count_params() + +""" +## Create an embedding model + +To retrieve similar images given a query image, we need to first generate vector +representations of all the images involved. We do this via an +embedding model that extracts output features from our pretrained classifier and +normalizes the resulting feature vectors. +""" + +embedding_model = tf.keras.Sequential( + [ + tf.keras.layers.Input((IMAGE_SIZE, IMAGE_SIZE, 3)), + tf.keras.layers.Rescaling(scale=1.0 / 255), + bit_model.layers[1], + tf.keras.layers.Normalization(mean=0, variance=1), + ], + name="embedding_model", +) + +embedding_model.summary() + +""" +Take note of the normalization layer inside the model. It is used to project the +representation vectors to the space of unit-spheres. +""" + +""" +## Hashing utilities +""" + + +def hash_func(embedding, random_vectors): + embedding = np.array(embedding) + + # Random projection. + bools = np.dot(embedding, random_vectors) > 0 + return [bool2int(bool_vec) for bool_vec in bools] + + +def bool2int(x): + y = 0 + for i, j in enumerate(x): + if j: + y += 1 << i + return y + + +""" +The shape of the vectors coming out of `embedding_model` is `(2048,)`, and considering practical +aspects (storage, retrieval performance, etc.) it is quite large. So, there arises a need +to reduce the dimensionality of the embedding vectors without reducing their information +content. This is where *random projection* comes into the picture. +It is based on the principle that if the +distance between a group of points on a given plane is _approximately_ preserved, the +dimensionality of that plane can further be reduced. + +Inside `hash_func()`, we first reduce the dimensionality of the embedding vectors. Then +we compute the bitwise hash values of the images to determine their hash buckets. Images +having same hash values are likely to go into the same hash bucket. From a deployment +perspective, bitwise hash values are cheaper to store and operate on. +""" + +""" +## Query utilities + +The `Table` class is responsible for building a single hash table. Each entry in the hash +table is a mapping between the reduced embedding of an image from our dataset and a +unique identifier. Because our dimensionality reduction technique involves randomness, it +can so happen that similar images are not mapped to the same hash bucket everytime the +process run. To reduce this effect, we will take results from multiple tables into +consideration -- the number of tables and the reduction dimensionality are the key +hyperparameters here. + +Crucially, you wouldn't reimplement locality-sensitive hashing yourself when working with +real world applications. Instead, you'd likely use one of the following popular libraries: + +* [ScaNN](https://github.com/google-research/google-research/tree/master/scann) +* [Annoy](https://github.com/spotify/annoy) +* [Vald](https://github.com/vdaas/vald) +""" + + +class Table: + def __init__(self, hash_size, dim): + self.table = {} + self.hash_size = hash_size + self.random_vectors = np.random.randn(hash_size, dim).T + + def add(self, id, vectors, label): + # Create a unique indentifier. + entry = {"id_label": str(id) + "_" + str(label)} + + # Compute the hash values. 
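+        # `hash_func` returns one bucket id per row of `vectors`; the embeddings
+        # arrive here with shape (1, 2048), so this is a single-element list.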
+ hashes = hash_func(vectors, self.random_vectors) + + # Add the hash values to the current table. + for h in hashes: + if h in self.table: + self.table[h].append(entry) + else: + self.table[h] = [entry] + + def query(self, vectors): + # Compute hash value for the query vector. + hashes = hash_func(vectors, self.random_vectors) + results = [] + + # Loop over the query hashes and determine if they exist in + # the current table. + for h in hashes: + if h in self.table: + results.extend(self.table[h]) + return results + + +""" +In the following `LSH` class we will pack the utilities to have multiple hash tables. +""" + + +class LSH: + def __init__(self, hash_size, dim, num_tables): + self.num_tables = num_tables + self.tables = [] + for i in range(self.num_tables): + self.tables.append(Table(hash_size, dim)) + + def add(self, id, vectors, label): + for table in self.tables: + table.add(id, vectors, label) + + def query(self, vectors): + results = [] + for table in self.tables: + results.extend(table.query(vectors)) + return results + + +""" +Now we can encapsulate the logic for building and operating with the master LSH table (a +collection of many tables) inside a class. It has two methods: + +* `train()`: Responsible for building the final LSH table. +* `query()`: Computes the number of matches given a query image and also quantifies the +similarity score. +""" + + +class BuildLSHTable: + def __init__( + self, + prediction_model, + concrete_function=False, + hash_size=8, + dim=2048, + num_tables=10, + ): + self.hash_size = hash_size + self.dim = dim + self.num_tables = num_tables + self.lsh = LSH(self.hash_size, self.dim, self.num_tables) + + self.prediction_model = prediction_model + self.concrete_function = concrete_function + + def train(self, training_files): + for id, training_file in enumerate(training_files): + # Unpack the data. + image, label = training_file + if len(image.shape) < 4: + image = image[None, ...] + + # Compute embeddings and update the LSH tables. + # More on `self.concrete_function()` later. + if self.concrete_function: + features = self.prediction_model(tf.constant(image))[ + "normalization" + ].numpy() + else: + features = self.prediction_model.predict(image) + self.lsh.add(id, features, label) + + def query(self, image, verbose=True): + # Compute the embeddings of the query image and fetch the results. + if len(image.shape) < 4: + image = image[None, ...] + + if self.concrete_function: + features = self.prediction_model(tf.constant(image))[ + "normalization" + ].numpy() + else: + features = self.prediction_model.predict(image) + + results = self.lsh.query(features) + if verbose: + print("Matches:", len(results)) + + # Calculate Jaccard index to quantify the similarity. + counts = {} + for r in results: + if r["id_label"] in counts: + counts[r["id_label"]] += 1 + else: + counts[r["id_label"]] = 1 + for k in counts: + counts[k] = float(counts[k]) / self.dim + return counts + + +""" +## Create LSH tables + +With our helper utilities and classes implemented, we can now build our LSH table. Since +we will be benchmarking performance between optimized and unoptimized embedding models, we +will also warm up our GPU to avoid any unfair comparison. +""" + + +# Utility to warm up the GPU. +def warmup(): + dummy_sample = tf.ones((1, IMAGE_SIZE, IMAGE_SIZE, 3)) + for _ in range(100): + _ = embedding_model.predict(dummy_sample) + + +""" +Now we can first do the GPU wam-up and proceed to build the master LSH table with +`embedding_model`. 
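+
+Once `lsh_builder` has been trained below, querying it with a single image is a
+one-liner. The snippet is only illustrative and reuses the `query()` method
+defined above:
+
+```python
+# Run after `lsh_builder.train(...)` below has finished.
+matches = lsh_builder.query(images[0])  # {"<id>_<label>": score, ...}
+print(sorted(matches, key=matches.get, reverse=True)[:5])  # five strongest matches
+```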
+""" + +warmup() + +training_files = zip(images, labels) +lsh_builder = BuildLSHTable(embedding_model) +lsh_builder.train(training_files) + + +""" +At the time of writing, the wall time was 54.1 seconds on a Tesla T4 GPU. This timing may +vary based on the GPU you are using. +""" + +""" +## Optimize the model with TensorRT + +For NVIDIA-based GPUs, the +[TensorRT framework](https://docs.nvidia.com/deeplearning/frameworks/tf-trt-user-guide/index.html) +can be used to dramatically enhance the inference latency by using various model +optimization techniques like pruning, constant folding, layer fusion, and so on. Here we +will use the `tf.experimental.tensorrt` module to optimize our embedding model. +""" + +# First serialize the embedding model as a SavedModel. +embedding_model.save("embedding_model") + +# Initialize the conversion parameters. +params = tf.experimental.tensorrt.ConversionParams( + precision_mode="FP16", maximum_cached_engines=16 +) + +# Run the conversion. +converter = tf.experimental.tensorrt.Converter( + input_saved_model_dir="embedding_model", conversion_params=params +) +converter.convert() +converter.save("tensorrt_embedding_model") + +""" +**Notes on the parameters inside of `tf.experimental.tensorrt.ConversionParams()`**: + +* `precision_mode` defines the numerical precision of the operations in the +to-be-converted model. +* `maximum_cached_engines` specifies the maximum number of TRT engines that will be +cached to handle dynamic operations (operations with unknown shapes). + +To learn more about the other options, refer to the +[official documentation](https://www.tensorflow.org/api_docs/python/tf/experimental/tensorrt/ConversionParams). +You can also explore the different quantization options provided by the +`tf.experimental.tensorrt` module. +""" + +# Load the converted model. +root = tf.saved_model.load("tensorrt_embedding_model") +trt_model_function = root.signatures["serving_default"] + +""" +## Build LSH tables with optimized model +""" + +warmup() + +training_files = zip(images, labels) +lsh_builder_trt = BuildLSHTable(trt_model_function, concrete_function=True) +lsh_builder_trt.train(training_files) + +""" +Notice the difference in the wall time which is **13.1 seconds**. Earlier, with the +unoptimized model it was **54.1 seconds**. + +We can take a closer look into one of the hash tables and get an idea of how they are +represented. +""" + +idx = 0 +for hash, entry in lsh_builder_trt.lsh.tables[0].table.items(): + if idx == 5: + break + if len(entry) < 5: + print(hash, entry) + idx += 1 + +""" +## Visualize results on validation images + +In this section we will first writing a couple of utility functions to visualize the +similar image parsing process. Then we will benchmark the query performance of the models +with and without optimization. +""" + +""" +First, we take 100 images from the validation set for testing purposes. +""" + +validation_images = [] +validation_labels = [] + +for image, label in validation_ds.take(100): + image = tf.image.resize(image, (224, 224)) + validation_images.append(image.numpy()) + validation_labels.append(label.numpy()) + +validation_images = np.array(validation_images) +validation_labels = np.array(validation_labels) +validation_images.shape, validation_labels.shape + + +""" +Now we write our visualization utilities. 
+""" + + +def plot_images(images, labels): + plt.figure(figsize=(20, 10)) + columns = 5 + for i, image in enumerate(images): + ax = plt.subplot(len(images) // columns + 1, columns, i + 1) + if i == 0: + ax.set_title("Query Image\n" + "Label: {}".format(labels[i])) + else: + ax.set_title("Similar Image # " + str(i) + "\nLabel: {}".format(labels[i])) + plt.imshow(image.astype("int")) + plt.axis("off") + + +def visualize_lsh(lsh_class): + idx = np.random.choice(len(validation_images)) + image = validation_images[idx] + label = validation_labels[idx] + results = lsh_class.query(image) + + candidates = [] + labels = [] + overlaps = [] + + for idx, r in enumerate(sorted(results, key=results.get, reverse=True)): + if idx == 4: + break + image_id, label = r.split("_")[0], r.split("_")[1] + candidates.append(images[int(image_id)]) + labels.append(label) + overlaps.append(results[r]) + + candidates.insert(0, image) + labels.insert(0, label) + + plot_images(candidates, labels) + + +""" +### Non-TRT model +""" + +for _ in range(5): + visualize_lsh(lsh_builder) + +visualize_lsh(lsh_builder) + +""" +### TRT model +""" + +for _ in range(5): + visualize_lsh(lsh_builder_trt) + +""" +As you may have noticed, there are a couple of incorrect results. This can be mitigated in +a few ways: + +* Better models for generating the initial embeddings especially for noisy samples. We can +use techniques like [ArcFace](https://arxiv.org/abs/1801.07698), +[Supervised Contrastive Learning](https://arxiv.org/abs/2004.11362), etc. +that implicitly encourage better learning of representations for retrieval purposes. +* The trade-off between the number of tables and the reduction dimensionality is crucial +and helps set the right recall required for your application. +""" + +""" +## Benchmarking query performance +""" + + +def benchmark(lsh_class): + warmup() + + start_time = time.time() + for _ in range(1000): + image = np.ones((1, 224, 224, 3)).astype("float32") + _ = lsh_class.query(image, verbose=False) + end_time = time.time() - start_time + print(f"Time taken: {end_time:.3f}") + + +benchmark(lsh_builder) + +benchmark(lsh_builder_trt) + +""" +We can immediately notice a stark difference between the query performance of the two +models. +""" + +""" +## Final remarks + +In this example, we explored the TensorRT framework from NVIDIA for optimizing our model. +It's best suited for GPU-based inference servers. There are other choices for such +frameworks that cater to different hardware platforms: + +* [TensorFlow Lite](https://www.tensorflow.org/lite) for mobile and edge devices. +* [ONNX](hhttps://onnx.ai/) for commodity CPU-based servers. +* [Apache TVM](https://tvm.apache.org/), compiler for machine learning models covering +various platforms. 
+ +Here are a few resources you might want to check out to learn more +about applications based on vector similary search in general: + +* [ANN Benchmarks](http://ann-benchmarks.com/) +* [Accelerating Large-Scale Inference with Anisotropic Vector Quantization(ScaNN)](https://arxiv.org/abs/1908.10396) +* [Spreading vectors for similarity search](https://arxiv.org/abs/1806.03198) +* [Building a real-time embeddings similarity matching system](https://cloud.google.com/architecture/building-real-time-embeddings-similarity-matching-system) +""" diff --git a/knowledge_base/vision/nerf.py b/knowledge_base/vision/nerf.py new file mode 100644 index 0000000000000000000000000000000000000000..580eaffe9ad210ebb803ca9cdb31f1ad0352a0c8 --- /dev/null +++ b/knowledge_base/vision/nerf.py @@ -0,0 +1,777 @@ +""" +Title: 3D volumetric rendering with NeRF +Authors: [Aritra Roy Gosthipaty](https://twitter.com/arig23498), [Ritwik Raha](https://twitter.com/ritwik_raha) +Date created: 2021/08/09 +Last modified: 2023/11/13 +Description: Minimal implementation of volumetric rendering as shown in NeRF. +Accelerator: GPU +""" + +""" +## Introduction + +In this example, we present a minimal implementation of the research paper +[**NeRF: Representing Scenes as Neural Radiance Fields for View Synthesis**](https://arxiv.org/abs/2003.08934) +by Ben Mildenhall et. al. The authors have proposed an ingenious way +to *synthesize novel views of a scene* by modelling the *volumetric +scene function* through a neural network. + +To help you understand this intuitively, let's start with the following question: +*would it be possible to give to a neural +network the position of a pixel in an image, and ask the network +to predict the color at that position?* + +| ![2d-train](https://i.imgur.com/DQM92vN.png) | +| :---: | +| **Figure 1**: A neural network being given coordinates of an image +as input and asked to predict the color at the coordinates. | + +The neural network would hypothetically *memorize* (overfit on) the +image. This means that our neural network would have encoded the entire image +in its weights. We could query the neural network with each position, +and it would eventually reconstruct the entire image. + +| ![2d-test](https://i.imgur.com/6Qz5Hp1.png) | +| :---: | +| **Figure 2**: The trained neural network recreates the image from scratch. | + +A question now arises, how do we extend this idea to learn a 3D +volumetric scene? Implementing a similar process as above would +require the knowledge of every voxel (volume pixel). Turns out, this +is quite a challenging task to do. + +The authors of the paper propose a minimal and elegant way to learn a +3D scene using a few images of the scene. They discard the use of +voxels for training. The network learns to model the volumetric scene, +thus generating novel views (images) of the 3D scene that the model +was not shown at training time. + +There are a few prerequisites one needs to understand to fully +appreciate the process. We structure the example in such a way that +you will have all the required knowledge before starting the +implementation. +""" + +""" +## Setup +""" +import os + +os.environ["KERAS_BACKEND"] = "tensorflow" + +# Setting random seed to obtain reproducible results. +import tensorflow as tf + +tf.random.set_seed(42) + +import keras +from keras import layers + +import os +import glob +import imageio.v2 as imageio +import numpy as np +from tqdm import tqdm +import matplotlib.pyplot as plt + +# Initialize global variables. 
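+# BATCH_SIZE images are processed per training step, NUM_SAMPLES points are drawn
+# along each ray, and POS_ENCODE_DIMS sine/cosine frequency bands are used in the
+# positional encoding defined below.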
+AUTO = tf.data.AUTOTUNE +BATCH_SIZE = 5 +NUM_SAMPLES = 32 +POS_ENCODE_DIMS = 16 +EPOCHS = 20 + +""" +## Download and load the data + +The `npz` data file contains images, camera poses, and a focal length. +The images are taken from multiple camera angles as shown in +**Figure 3**. + +| ![camera-angles](https://i.imgur.com/FLsi2is.png) | +| :---: | +| **Figure 3**: Multiple camera angles
+[Source: NeRF](https://arxiv.org/abs/2003.08934) |
+
+
+To understand camera poses in this context, it helps to first think of a
+*camera as a mapping between the real world and the 2-D image*.
+
+| ![mapping](https://www.mathworks.com/help/vision/ug/calibration_coordinate_blocks.png) |
+| :---: |
+| **Figure 4**: 3-D world to 2-D image mapping through a camera
+[Source: Mathworks](https://www.mathworks.com/help/vision/ug/camera-calibration.html) | + +Consider the following equation: + + + +Where **x** is the 2-D image point, **X** is the 3-D world point and +**P** is the camera-matrix. **P** is a 3 x 4 matrix that plays the +crucial role of mapping the real world object onto an image plane. + + + +The camera-matrix is an *affine transform matrix* that is +concatenated with a 3 x 1 column `[image height, image width, focal length]` +to produce the *pose matrix*. This matrix is of +dimensions 3 x 5 where the first 3 x 3 block is in the cameraโ€™s point +of view. The axes are `[down, right, backwards]` or `[-y, x, z]` +where the camera is facing forwards `-z`. + +| ![camera-mapping](https://i.imgur.com/kvjqbiO.png) | +| :---: | +| **Figure 5**: The affine transformation. | + +The COLMAP frame is `[right, down, forwards]` or `[x, -y, -z]`. Read +more about COLMAP [here](https://colmap.github.io/). +""" + +# Download the data if it does not already exist. +url = ( + "http://cseweb.ucsd.edu/~viscomp/projects/LF/papers/ECCV20/nerf/tiny_nerf_data.npz" +) +data = keras.utils.get_file(origin=url) + +data = np.load(data) +images = data["images"] +im_shape = images.shape +(num_images, H, W, _) = images.shape +(poses, focal) = (data["poses"], data["focal"]) + +# Plot a random image from the dataset for visualization. +plt.imshow(images[np.random.randint(low=0, high=num_images)]) +plt.show() + +""" +## Data pipeline + +Now that you've understood the notion of camera matrix +and the mapping from a 3D scene to 2D images, +let's talk about the inverse mapping, i.e. from 2D image to the 3D scene. + +We'll need to talk about volumetric rendering with ray casting and tracing, +which are common computer graphics techniques. +This section will help you get to speed with these techniques. + +Consider an image with `N` pixels. We shoot a ray through each pixel +and sample some points on the ray. A ray is commonly parameterized by +the equation `r(t) = o + td` where `t` is the parameter, `o` is the +origin and `d` is the unit directional vector as shown in **Figure 6**. + +| ![img](https://i.imgur.com/ywrqlzt.gif) | +| :---: | +| **Figure 6**: `r(t) = o + td` where t is 3 | + +In **Figure 7**, we consider a ray, and we sample some random points on +the ray. These sample points each have a unique location `(x, y, z)` +and the ray has a viewing angle `(theta, phi)`. The viewing angle is +particularly interesting as we can shoot a ray through a single pixel +in a lot of different ways, each with a unique viewing angle. Another +interesting thing to notice here is the noise that is added to the +sampling process. We add a uniform noise to each sample so that the +samples correspond to a continuous distribution. In **Figure 7** the +blue points are the evenly distributed samples and the white points +`(t1, t2, t3)` are randomly placed between the samples. + +| ![img](https://i.imgur.com/r9TS2wv.gif) | +| :---: | +| **Figure 7**: Sampling the points from a ray. | + +**Figure 8** showcases the entire sampling process in 3D, where you +can see the rays coming out of the white image. This means that each +pixel will have its corresponding rays and each ray will be sampled at +distinct points. + +| ![3-d rays](https://i.imgur.com/hr4D2g2.gif) | +| :---: | +| **Figure 8**: Shooting rays from all the pixels of an image in 3-D | + +These sampled points act as the input to the NeRF model. The model is +then asked to predict the RGB color and the volume density at that +point. 
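+
+Here is a minimal numeric sketch of this sampling (the origin, direction, and
+bounds are illustrative; the real values come from the camera poses below):
+
+```python
+import numpy as np
+
+o = np.array([0.0, 0.0, 0.0])  # ray origin
+d = np.array([0.0, 0.0, -1.0])  # unit direction vector
+near, far, num_samples = 2.0, 6.0, 8
+
+t = np.linspace(near, far, num_samples)
+# Jitter each sample inside its bin so the samples cover a continuous range.
+t = t + np.random.uniform(size=num_samples) * (far - near) / num_samples
+points = o + t[:, None] * d  # (num_samples, 3) query points fed to the MLP
+```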
+ +| ![3-Drender](https://i.imgur.com/HHb6tlQ.png) | +| :---: | +| **Figure 9**: Data pipeline
+[Source: NeRF](https://arxiv.org/abs/2003.08934) | + +""" + + +def encode_position(x): + """Encodes the position into its corresponding Fourier feature. + + Args: + x: The input coordinate. + + Returns: + Fourier features tensors of the position. + """ + positions = [x] + for i in range(POS_ENCODE_DIMS): + for fn in [tf.sin, tf.cos]: + positions.append(fn(2.0**i * x)) + return tf.concat(positions, axis=-1) + + +def get_rays(height, width, focal, pose): + """Computes origin point and direction vector of rays. + + Args: + height: Height of the image. + width: Width of the image. + focal: The focal length between the images and the camera. + pose: The pose matrix of the camera. + + Returns: + Tuple of origin point and direction vector for rays. + """ + # Build a meshgrid for the rays. + i, j = tf.meshgrid( + tf.range(width, dtype=tf.float32), + tf.range(height, dtype=tf.float32), + indexing="xy", + ) + + # Normalize the x axis coordinates. + transformed_i = (i - width * 0.5) / focal + + # Normalize the y axis coordinates. + transformed_j = (j - height * 0.5) / focal + + # Create the direction unit vectors. + directions = tf.stack([transformed_i, -transformed_j, -tf.ones_like(i)], axis=-1) + + # Get the camera matrix. + camera_matrix = pose[:3, :3] + height_width_focal = pose[:3, -1] + + # Get origins and directions for the rays. + transformed_dirs = directions[..., None, :] + camera_dirs = transformed_dirs * camera_matrix + ray_directions = tf.reduce_sum(camera_dirs, axis=-1) + ray_origins = tf.broadcast_to(height_width_focal, tf.shape(ray_directions)) + + # Return the origins and directions. + return (ray_origins, ray_directions) + + +def render_flat_rays(ray_origins, ray_directions, near, far, num_samples, rand=False): + """Renders the rays and flattens it. + + Args: + ray_origins: The origin points for rays. + ray_directions: The direction unit vectors for the rays. + near: The near bound of the volumetric scene. + far: The far bound of the volumetric scene. + num_samples: Number of sample points in a ray. + rand: Choice for randomising the sampling strategy. + + Returns: + Tuple of flattened rays and sample points on each rays. + """ + # Compute 3D query points. + # Equation: r(t) = o+td -> Building the "t" here. + t_vals = tf.linspace(near, far, num_samples) + if rand: + # Inject uniform noise into sample space to make the sampling + # continuous. + shape = list(ray_origins.shape[:-1]) + [num_samples] + noise = tf.random.uniform(shape=shape) * (far - near) / num_samples + t_vals = t_vals + noise + + # Equation: r(t) = o + td -> Building the "r" here. + rays = ray_origins[..., None, :] + ( + ray_directions[..., None, :] * t_vals[..., None] + ) + rays_flat = tf.reshape(rays, [-1, 3]) + rays_flat = encode_position(rays_flat) + return (rays_flat, t_vals) + + +def map_fn(pose): + """Maps individual pose to flattened rays and sample points. + + Args: + pose: The pose matrix of the camera. + + Returns: + Tuple of flattened rays and sample points corresponding to the + camera pose. + """ + (ray_origins, ray_directions) = get_rays(height=H, width=W, focal=focal, pose=pose) + (rays_flat, t_vals) = render_flat_rays( + ray_origins=ray_origins, + ray_directions=ray_directions, + near=2.0, + far=6.0, + num_samples=NUM_SAMPLES, + rand=True, + ) + return (rays_flat, t_vals) + + +# Create the training split. +split_index = int(num_images * 0.8) + +# Split the images into training and validation. 
+train_images = images[:split_index] +val_images = images[split_index:] + +# Split the poses into training and validation. +train_poses = poses[:split_index] +val_poses = poses[split_index:] + +# Make the training pipeline. +train_img_ds = tf.data.Dataset.from_tensor_slices(train_images) +train_pose_ds = tf.data.Dataset.from_tensor_slices(train_poses) +train_ray_ds = train_pose_ds.map(map_fn, num_parallel_calls=AUTO) +training_ds = tf.data.Dataset.zip((train_img_ds, train_ray_ds)) +train_ds = ( + training_ds.shuffle(BATCH_SIZE) + .batch(BATCH_SIZE, drop_remainder=True, num_parallel_calls=AUTO) + .prefetch(AUTO) +) + +# Make the validation pipeline. +val_img_ds = tf.data.Dataset.from_tensor_slices(val_images) +val_pose_ds = tf.data.Dataset.from_tensor_slices(val_poses) +val_ray_ds = val_pose_ds.map(map_fn, num_parallel_calls=AUTO) +validation_ds = tf.data.Dataset.zip((val_img_ds, val_ray_ds)) +val_ds = ( + validation_ds.shuffle(BATCH_SIZE) + .batch(BATCH_SIZE, drop_remainder=True, num_parallel_calls=AUTO) + .prefetch(AUTO) +) + +""" +## NeRF model + +The model is a multi-layer perceptron (MLP), with ReLU as its non-linearity. + +An excerpt from the paper: + +*"We encourage the representation to be multiview-consistent by +restricting the network to predict the volume density sigma as a +function of only the location `x`, while allowing the RGB color `c` to be +predicted as a function of both location and viewing direction. To +accomplish this, the MLP first processes the input 3D coordinate `x` +with 8 fully-connected layers (using ReLU activations and 256 channels +per layer), and outputs sigma and a 256-dimensional feature vector. +This feature vector is then concatenated with the camera ray's viewing +direction and passed to one additional fully-connected layer (using a +ReLU activation and 128 channels) that output the view-dependent RGB +color."* + +Here we have gone for a minimal implementation and have used 64 +Dense units instead of 256 as mentioned in the paper. +""" + + +def get_nerf_model(num_layers, num_pos): + """Generates the NeRF neural network. + + Args: + num_layers: The number of MLP layers. + num_pos: The number of dimensions of positional encoding. + + Returns: + The `keras` model. + """ + inputs = keras.Input(shape=(num_pos, 2 * 3 * POS_ENCODE_DIMS + 3)) + x = inputs + for i in range(num_layers): + x = layers.Dense(units=64, activation="relu")(x) + if i % 4 == 0 and i > 0: + # Inject residual connection. + x = layers.concatenate([x, inputs], axis=-1) + outputs = layers.Dense(units=4)(x) + return keras.Model(inputs=inputs, outputs=outputs) + + +def render_rgb_depth(model, rays_flat, t_vals, rand=True, train=True): + """Generates the RGB image and depth map from model prediction. + + Args: + model: The MLP model that is trained to predict the rgb and + volume density of the volumetric scene. + rays_flat: The flattened rays that serve as the input to + the NeRF model. + t_vals: The sample points for the rays. + rand: Choice to randomise the sampling strategy. + train: Whether the model is in the training or testing phase. + + Returns: + Tuple of rgb image and depth map. + """ + # Get the predictions from the nerf model and reshape it. + if train: + predictions = model(rays_flat) + else: + predictions = model.predict(rays_flat) + predictions = tf.reshape(predictions, shape=(BATCH_SIZE, H, W, NUM_SAMPLES, 4)) + + # Slice the predictions into rgb and sigma. 
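+    # The first three output channels parameterize color (squashed into [0, 1]
+    # with a sigmoid); the last channel is the volume density, kept non-negative
+    # with a ReLU.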
+ rgb = tf.sigmoid(predictions[..., :-1]) + sigma_a = tf.nn.relu(predictions[..., -1]) + + # Get the distance of adjacent intervals. + delta = t_vals[..., 1:] - t_vals[..., :-1] + # delta shape = (num_samples) + if rand: + delta = tf.concat( + [delta, tf.broadcast_to([1e10], shape=(BATCH_SIZE, H, W, 1))], axis=-1 + ) + alpha = 1.0 - tf.exp(-sigma_a * delta) + else: + delta = tf.concat( + [delta, tf.broadcast_to([1e10], shape=(BATCH_SIZE, 1))], axis=-1 + ) + alpha = 1.0 - tf.exp(-sigma_a * delta[:, None, None, :]) + + # Get transmittance. + exp_term = 1.0 - alpha + epsilon = 1e-10 + transmittance = tf.math.cumprod(exp_term + epsilon, axis=-1, exclusive=True) + weights = alpha * transmittance + rgb = tf.reduce_sum(weights[..., None] * rgb, axis=-2) + + if rand: + depth_map = tf.reduce_sum(weights * t_vals, axis=-1) + else: + depth_map = tf.reduce_sum(weights * t_vals[:, None, None], axis=-1) + return (rgb, depth_map) + + +""" +## Training + +The training step is implemented as part of a custom `keras.Model` subclass +so that we can make use of the `model.fit` functionality. +""" + + +class NeRF(keras.Model): + def __init__(self, nerf_model): + super().__init__() + self.nerf_model = nerf_model + + def compile(self, optimizer, loss_fn): + super().compile() + self.optimizer = optimizer + self.loss_fn = loss_fn + self.loss_tracker = keras.metrics.Mean(name="loss") + self.psnr_metric = keras.metrics.Mean(name="psnr") + + def train_step(self, inputs): + # Get the images and the rays. + (images, rays) = inputs + (rays_flat, t_vals) = rays + + with tf.GradientTape() as tape: + # Get the predictions from the model. + rgb, _ = render_rgb_depth( + model=self.nerf_model, rays_flat=rays_flat, t_vals=t_vals, rand=True + ) + loss = self.loss_fn(images, rgb) + + # Get the trainable variables. + trainable_variables = self.nerf_model.trainable_variables + + # Get the gradeints of the trainiable variables with respect to the loss. + gradients = tape.gradient(loss, trainable_variables) + + # Apply the grads and optimize the model. + self.optimizer.apply_gradients(zip(gradients, trainable_variables)) + + # Get the PSNR of the reconstructed images and the source images. + psnr = tf.image.psnr(images, rgb, max_val=1.0) + + # Compute our own metrics + self.loss_tracker.update_state(loss) + self.psnr_metric.update_state(psnr) + return {"loss": self.loss_tracker.result(), "psnr": self.psnr_metric.result()} + + def test_step(self, inputs): + # Get the images and the rays. + (images, rays) = inputs + (rays_flat, t_vals) = rays + + # Get the predictions from the model. + rgb, _ = render_rgb_depth( + model=self.nerf_model, rays_flat=rays_flat, t_vals=t_vals, rand=True + ) + loss = self.loss_fn(images, rgb) + + # Get the PSNR of the reconstructed images and the source images. 
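+        # PSNR is reported in dB (higher is better); `max_val=1.0` because the
+        # pixel values lie in [0, 1].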
+ psnr = tf.image.psnr(images, rgb, max_val=1.0) + + # Compute our own metrics + self.loss_tracker.update_state(loss) + self.psnr_metric.update_state(psnr) + return {"loss": self.loss_tracker.result(), "psnr": self.psnr_metric.result()} + + @property + def metrics(self): + return [self.loss_tracker, self.psnr_metric] + + +test_imgs, test_rays = next(iter(train_ds)) +test_rays_flat, test_t_vals = test_rays + +loss_list = [] + + +class TrainMonitor(keras.callbacks.Callback): + def on_epoch_end(self, epoch, logs=None): + loss = logs["loss"] + loss_list.append(loss) + test_recons_images, depth_maps = render_rgb_depth( + model=self.model.nerf_model, + rays_flat=test_rays_flat, + t_vals=test_t_vals, + rand=True, + train=False, + ) + + # Plot the rgb, depth and the loss plot. + fig, ax = plt.subplots(nrows=1, ncols=3, figsize=(20, 5)) + ax[0].imshow(keras.utils.array_to_img(test_recons_images[0])) + ax[0].set_title(f"Predicted Image: {epoch:03d}") + + ax[1].imshow(keras.utils.array_to_img(depth_maps[0, ..., None])) + ax[1].set_title(f"Depth Map: {epoch:03d}") + + ax[2].plot(loss_list) + ax[2].set_xticks(np.arange(0, EPOCHS + 1, 5.0)) + ax[2].set_title(f"Loss Plot: {epoch:03d}") + + fig.savefig(f"images/{epoch:03d}.png") + plt.show() + plt.close() + + +num_pos = H * W * NUM_SAMPLES +nerf_model = get_nerf_model(num_layers=8, num_pos=num_pos) + +model = NeRF(nerf_model) +model.compile( + optimizer=keras.optimizers.Adam(), loss_fn=keras.losses.MeanSquaredError() +) + +# Create a directory to save the images during training. +if not os.path.exists("images"): + os.makedirs("images") + +model.fit( + train_ds, + validation_data=val_ds, + batch_size=BATCH_SIZE, + epochs=EPOCHS, + callbacks=[TrainMonitor()], +) + + +def create_gif(path_to_images, name_gif): + filenames = glob.glob(path_to_images) + filenames = sorted(filenames) + images = [] + for filename in tqdm(filenames): + images.append(imageio.imread(filename)) + kargs = {"duration": 0.25} + imageio.mimsave(name_gif, images, "GIF", **kargs) + + +create_gif("images/*.png", "training.gif") + +""" +## Visualize the training step + +Here we see the training step. With the decreasing loss, the rendered +image and the depth maps are getting better. In your local system, you +will see the `training.gif` file generated. + +![training-20](https://i.imgur.com/ql5OcYA.gif) +""" + +""" +## Inference + +In this section, we ask the model to build novel views of the scene. +The model was given `106` views of the scene in the training step. The +collections of training images cannot contain each and every angle of +the scene. A trained model can represent the entire 3-D scene with a +sparse set of training images. + +Here we provide different poses to the model and ask for it to give us +the 2-D image corresponding to that camera view. If we infer the model +for all the 360-degree views, it should provide an overview of the +entire scenery from all around. +""" + +# Get the trained NeRF model and infer. +nerf_model = model.nerf_model +test_recons_images, depth_maps = render_rgb_depth( + model=nerf_model, + rays_flat=test_rays_flat, + t_vals=test_t_vals, + rand=True, + train=False, +) + +# Create subplots. 
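+# One row per test image; the columns show the original view, the reconstruction,
+# and the predicted depth map.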
+fig, axes = plt.subplots(nrows=5, ncols=3, figsize=(10, 20)) + +for ax, ori_img, recons_img, depth_map in zip( + axes, test_imgs, test_recons_images, depth_maps +): + ax[0].imshow(keras.utils.array_to_img(ori_img)) + ax[0].set_title("Original") + + ax[1].imshow(keras.utils.array_to_img(recons_img)) + ax[1].set_title("Reconstructed") + + ax[2].imshow(keras.utils.array_to_img(depth_map[..., None]), cmap="inferno") + ax[2].set_title("Depth Map") + +""" +## Render 3D Scene + +Here we will synthesize novel 3D views and stitch all of them together +to render a video encompassing the 360-degree view. +""" + + +def get_translation_t(t): + """Get the translation matrix for movement in t.""" + matrix = [ + [1, 0, 0, 0], + [0, 1, 0, 0], + [0, 0, 1, t], + [0, 0, 0, 1], + ] + return tf.convert_to_tensor(matrix, dtype=tf.float32) + + +def get_rotation_phi(phi): + """Get the rotation matrix for movement in phi.""" + matrix = [ + [1, 0, 0, 0], + [0, tf.cos(phi), -tf.sin(phi), 0], + [0, tf.sin(phi), tf.cos(phi), 0], + [0, 0, 0, 1], + ] + return tf.convert_to_tensor(matrix, dtype=tf.float32) + + +def get_rotation_theta(theta): + """Get the rotation matrix for movement in theta.""" + matrix = [ + [tf.cos(theta), 0, -tf.sin(theta), 0], + [0, 1, 0, 0], + [tf.sin(theta), 0, tf.cos(theta), 0], + [0, 0, 0, 1], + ] + return tf.convert_to_tensor(matrix, dtype=tf.float32) + + +def pose_spherical(theta, phi, t): + """ + Get the camera to world matrix for the corresponding theta, phi + and t. + """ + c2w = get_translation_t(t) + c2w = get_rotation_phi(phi / 180.0 * np.pi) @ c2w + c2w = get_rotation_theta(theta / 180.0 * np.pi) @ c2w + c2w = np.array([[-1, 0, 0, 0], [0, 0, 1, 0], [0, 1, 0, 0], [0, 0, 0, 1]]) @ c2w + return c2w + + +rgb_frames = [] +batch_flat = [] +batch_t = [] + +# Iterate over different theta value and generate scenes. +for index, theta in tqdm(enumerate(np.linspace(0.0, 360.0, 120, endpoint=False))): + # Get the camera to world matrix. + c2w = pose_spherical(theta, -30.0, 4.0) + + # + ray_oris, ray_dirs = get_rays(H, W, focal, c2w) + rays_flat, t_vals = render_flat_rays( + ray_oris, ray_dirs, near=2.0, far=6.0, num_samples=NUM_SAMPLES, rand=False + ) + + if index % BATCH_SIZE == 0 and index > 0: + batched_flat = tf.stack(batch_flat, axis=0) + batch_flat = [rays_flat] + + batched_t = tf.stack(batch_t, axis=0) + batch_t = [t_vals] + + rgb, _ = render_rgb_depth( + nerf_model, batched_flat, batched_t, rand=False, train=False + ) + + temp_rgb = [np.clip(255 * img, 0.0, 255.0).astype(np.uint8) for img in rgb] + + rgb_frames = rgb_frames + temp_rgb + else: + batch_flat.append(rays_flat) + batch_t.append(t_vals) + +rgb_video = "rgb_video.mp4" +imageio.mimwrite(rgb_video, rgb_frames, fps=30, quality=7, macro_block_size=None) + +""" +### Visualize the video + +Here we can see the rendered 360 degree view of the scene. The model +has successfully learned the entire volumetric space through the +sparse set of images in **only 20 epochs**. You can view the +rendered video saved locally, named `rgb_video.mp4`. + +![rendered-video](https://i.imgur.com/j2sIkzW.gif) +""" + +""" +## Conclusion + +We have produced a minimal implementation of NeRF to provide an intuition of its +core ideas and methodology. This method has been used in various +other works in the computer graphics space. + +We would like to encourage our readers to use this code as an example +and play with the hyperparameters and visualize the outputs. Below we +have also provided the outputs of the model trained for more epochs. 
+ +| Epochs | GIF of the training step | +| :--- | :---: | +| **100** | ![100-epoch-training](https://i.imgur.com/2k9p8ez.gif) | +| **200** | ![200-epoch-training](https://i.imgur.com/l3rG4HQ.gif) | + +## Way forward + +If anyone is interested to go deeper into NeRF, we have built a 3-part blog +series at [PyImageSearch](https://pyimagesearch.com/). + +- [Prerequisites of NeRF](https://www.pyimagesearch.com/2021/11/10/computer-graphics-and-deep-learning-with-nerf-using-tensorflow-and-keras-part-1/) +- [Concepts of NeRF](https://www.pyimagesearch.com/2021/11/17/computer-graphics-and-deep-learning-with-nerf-using-tensorflow-and-keras-part-2/) +- [Implementing NeRF](https://www.pyimagesearch.com/2021/11/24/computer-graphics-and-deep-learning-with-nerf-using-tensorflow-and-keras-part-3/) + +## Reference + +- [NeRF repository](https://github.com/bmild/nerf): The official + repository for NeRF. +- [NeRF paper](https://arxiv.org/abs/2003.08934): The paper on NeRF. +- [Manim Repository](https://github.com/3b1b/manim): We have used + manim to build all the animations. +- [Mathworks](https://www.mathworks.com/help/vision/ug/camera-calibration.html): + Mathworks for the camera calibration article. +- [Mathew's video](https://www.youtube.com/watch?v=dPWLybp4LL0): A + great video on NeRF. + +You can try the model on [Hugging Face Spaces](https://huggingface.co/spaces/keras-io/NeRF). +""" diff --git a/knowledge_base/vision/nl_image_search.py b/knowledge_base/vision/nl_image_search.py new file mode 100644 index 0000000000000000000000000000000000000000..942d808656141c926855de3806000dcbec6b8565 --- /dev/null +++ b/knowledge_base/vision/nl_image_search.py @@ -0,0 +1,594 @@ +""" +Title: Natural language image search with a Dual Encoder +Author: [Khalid Salama](https://www.linkedin.com/in/khalid-salama-24403144/) +Date created: 2021/01/30 +Last modified: 2021/01/30 +Description: Implementation of a dual encoder model for retrieving images that match natural language queries. +Accelerator: GPU +""" + +""" +## Introduction + +The example demonstrates how to build a dual encoder (also known as two-tower) neural network +model to search for images using natural language. The model is inspired by +the [CLIP](https://openai.com/blog/clip/) +approach, introduced by Alec Radford et al. The idea is to train a vision encoder and a text +encoder jointly to project the representation of images and their captions into the same embedding +space, such that the caption embeddings are located near the embeddings of the images they describe. + +This example requires TensorFlow 2.4 or higher. +In addition, [TensorFlow Hub](https://www.tensorflow.org/hub) +and [TensorFlow Text](https://www.tensorflow.org/tutorials/tensorflow_text/intro) +are required for the BERT model, and [TensorFlow Addons](https://www.tensorflow.org/addons) +is required for the AdamW optimizer. 
These libraries can be installed using the +following command: + +```python +pip install -q -U tensorflow-hub tensorflow-text tensorflow-addons +``` +""" + +""" +## Setup +""" + +import os +import collections +import json +import numpy as np +import tensorflow as tf +from tensorflow import keras +from tensorflow.keras import layers +import tensorflow_hub as hub +import tensorflow_text as text +import tensorflow_addons as tfa +import matplotlib.pyplot as plt +import matplotlib.image as mpimg +from tqdm import tqdm + +# Suppressing tf.hub warnings +tf.get_logger().setLevel("ERROR") + +""" +## Prepare the data + +We will use the [MS-COCO](https://cocodataset.org/#home) dataset to train our +dual encoder model. MS-COCO contains over 82,000 images, each of which has at least +5 different caption annotations. The dataset is usually used for +[image captioning](https://www.tensorflow.org/tutorials/text/image_captioning) +tasks, but we can repurpose the image-caption pairs to train our dual encoder +model for image search. + +### +Download and extract the data + +First, let's download the dataset, which consists of two compressed folders: +one with images, and the otherโ€”with associated image captions. +Note that the compressed images folder is 13GB in size. +""" + +root_dir = "datasets" +annotations_dir = os.path.join(root_dir, "annotations") +images_dir = os.path.join(root_dir, "train2014") +tfrecords_dir = os.path.join(root_dir, "tfrecords") +annotation_file = os.path.join(annotations_dir, "captions_train2014.json") + +# Download caption annotation files +if not os.path.exists(annotations_dir): + annotation_zip = tf.keras.utils.get_file( + "captions.zip", + cache_dir=os.path.abspath("."), + origin="http://images.cocodataset.org/annotations/annotations_trainval2014.zip", + extract=True, + ) + os.remove(annotation_zip) + +# Download image files +if not os.path.exists(images_dir): + image_zip = tf.keras.utils.get_file( + "train2014.zip", + cache_dir=os.path.abspath("."), + origin="http://images.cocodataset.org/zips/train2014.zip", + extract=True, + ) + os.remove(image_zip) + +print("Dataset is downloaded and extracted successfully.") + +with open(annotation_file, "r") as f: + annotations = json.load(f)["annotations"] + +image_path_to_caption = collections.defaultdict(list) +for element in annotations: + caption = f"{element['caption'].lower().rstrip('.')}" + image_path = images_dir + "/COCO_train2014_" + "%012d.jpg" % (element["image_id"]) + image_path_to_caption[image_path].append(caption) + +image_paths = list(image_path_to_caption.keys()) +print(f"Number of images: {len(image_paths)}") + +""" +### Process and save the data to TFRecord files + +You can change the `sample_size` parameter to control many image-caption pairs +will be used for training the dual encoder model. +In this example we set `train_size` to 30,000 images, +which is about 35% of the dataset. We use 2 captions for each +image, thus producing 60,000 image-caption pairs. The size of the training set +affects the quality of the produced encoders, but more examples would lead to +longer training time. 
+""" + +train_size = 30000 +valid_size = 5000 +captions_per_image = 2 +images_per_file = 2000 + +train_image_paths = image_paths[:train_size] +num_train_files = int(np.ceil(train_size / images_per_file)) +train_files_prefix = os.path.join(tfrecords_dir, "train") + +valid_image_paths = image_paths[-valid_size:] +num_valid_files = int(np.ceil(valid_size / images_per_file)) +valid_files_prefix = os.path.join(tfrecords_dir, "valid") + +tf.io.gfile.makedirs(tfrecords_dir) + + +def bytes_feature(value): + return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value])) + + +def create_example(image_path, caption): + feature = { + "caption": bytes_feature(caption.encode()), + "raw_image": bytes_feature(tf.io.read_file(image_path).numpy()), + } + return tf.train.Example(features=tf.train.Features(feature=feature)) + + +def write_tfrecords(file_name, image_paths): + caption_list = [] + image_path_list = [] + for image_path in image_paths: + captions = image_path_to_caption[image_path][:captions_per_image] + caption_list.extend(captions) + image_path_list.extend([image_path] * len(captions)) + + with tf.io.TFRecordWriter(file_name) as writer: + for example_idx in range(len(image_path_list)): + example = create_example( + image_path_list[example_idx], caption_list[example_idx] + ) + writer.write(example.SerializeToString()) + return example_idx + 1 + + +def write_data(image_paths, num_files, files_prefix): + example_counter = 0 + for file_idx in tqdm(range(num_files)): + file_name = files_prefix + "-%02d.tfrecord" % (file_idx) + start_idx = images_per_file * file_idx + end_idx = start_idx + images_per_file + example_counter += write_tfrecords(file_name, image_paths[start_idx:end_idx]) + return example_counter + + +train_example_count = write_data(train_image_paths, num_train_files, train_files_prefix) +print(f"{train_example_count} training examples were written to tfrecord files.") + +valid_example_count = write_data(valid_image_paths, num_valid_files, valid_files_prefix) +print(f"{valid_example_count} evaluation examples were written to tfrecord files.") + +""" +### Create `tf.data.Dataset` for training and evaluation +""" + + +feature_description = { + "caption": tf.io.FixedLenFeature([], tf.string), + "raw_image": tf.io.FixedLenFeature([], tf.string), +} + + +def read_example(example): + features = tf.io.parse_single_example(example, feature_description) + raw_image = features.pop("raw_image") + features["image"] = tf.image.resize( + tf.image.decode_jpeg(raw_image, channels=3), size=(299, 299) + ) + return features + + +def get_dataset(file_pattern, batch_size): + return ( + tf.data.TFRecordDataset(tf.data.Dataset.list_files(file_pattern)) + .map( + read_example, + num_parallel_calls=tf.data.AUTOTUNE, + deterministic=False, + ) + .shuffle(batch_size * 10) + .prefetch(buffer_size=tf.data.AUTOTUNE) + .batch(batch_size) + ) + + +""" +## Implement the projection head + +The projection head is used to transform the image and the text embeddings to +the same embedding space with the same dimensionality. 
+""" + + +def project_embeddings( + embeddings, num_projection_layers, projection_dims, dropout_rate +): + projected_embeddings = layers.Dense(units=projection_dims)(embeddings) + for _ in range(num_projection_layers): + x = tf.nn.gelu(projected_embeddings) + x = layers.Dense(projection_dims)(x) + x = layers.Dropout(dropout_rate)(x) + x = layers.Add()([projected_embeddings, x]) + projected_embeddings = layers.LayerNormalization()(x) + return projected_embeddings + + +""" +## Implement the vision encoder + +In this example, we use [Xception](https://keras.io/api/applications/xception/) +from [Keras Applications](https://keras.io/api/applications/) as the base for the +vision encoder. +""" + + +def create_vision_encoder( + num_projection_layers, projection_dims, dropout_rate, trainable=False +): + # Load the pre-trained Xception model to be used as the base encoder. + xception = keras.applications.Xception( + include_top=False, weights="imagenet", pooling="avg" + ) + # Set the trainability of the base encoder. + for layer in xception.layers: + layer.trainable = trainable + # Receive the images as inputs. + inputs = layers.Input(shape=(299, 299, 3), name="image_input") + # Preprocess the input image. + xception_input = tf.keras.applications.xception.preprocess_input(inputs) + # Generate the embeddings for the images using the xception model. + embeddings = xception(xception_input) + # Project the embeddings produced by the model. + outputs = project_embeddings( + embeddings, num_projection_layers, projection_dims, dropout_rate + ) + # Create the vision encoder model. + return keras.Model(inputs, outputs, name="vision_encoder") + + +""" +## Implement the text encoder + +We use [BERT](https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-12_H-256_A-4/1) +from [TensorFlow Hub](https://tfhub.dev) as the text encoder +""" + + +def create_text_encoder( + num_projection_layers, projection_dims, dropout_rate, trainable=False +): + # Load the BERT preprocessing module. + preprocess = hub.KerasLayer( + "https://tfhub.dev/tensorflow/bert_en_uncased_preprocess/2", + name="text_preprocessing", + ) + # Load the pre-trained BERT model to be used as the base encoder. + bert = hub.KerasLayer( + "https://tfhub.dev/tensorflow/small_bert/bert_en_uncased_L-4_H-512_A-8/1", + "bert", + ) + # Set the trainability of the base encoder. + bert.trainable = trainable + # Receive the text as inputs. + inputs = layers.Input(shape=(), dtype=tf.string, name="text_input") + # Preprocess the text. + bert_inputs = preprocess(inputs) + # Generate embeddings for the preprocessed text using the BERT model. + embeddings = bert(bert_inputs)["pooled_output"] + # Project the embeddings produced by the model. + outputs = project_embeddings( + embeddings, num_projection_layers, projection_dims, dropout_rate + ) + # Create the text encoder model. + return keras.Model(inputs, outputs, name="text_encoder") + + +""" +## Implement the dual encoder + +To calculate the loss, we compute the pairwise dot-product similarity between +each `caption_i` and `images_j` in the batch as the predictions. +The target similarity between `caption_i` and `image_j` is computed as +the average of the (dot-product similarity between `caption_i` and `caption_j`) +and (the dot-product similarity between `image_i` and `image_j`). +Then, we use crossentropy to compute the loss between the targets and the predictions. 
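+
+As a standalone sketch, here is what the loss looks like for a toy batch of
+randomly generated embeddings. It mirrors `compute_loss` in the `DualEncoder`
+class below; the batch size, embedding size, and unit-normalization of the toy
+embeddings are illustrative, while the temperature matches the value used later:
+
+```python
+import tensorflow as tf
+from tensorflow import keras
+
+temperature = 0.05
+caption_embeddings = tf.math.l2_normalize(tf.random.normal((4, 8)), axis=-1)
+image_embeddings = tf.math.l2_normalize(tf.random.normal((4, 8)), axis=-1)
+
+# Predictions: pairwise caption-to-image similarities.
+logits = (
+    tf.matmul(caption_embeddings, image_embeddings, transpose_b=True) / temperature
+)
+# Soft targets: average of caption-to-caption and image-to-image similarities.
+targets = keras.activations.softmax(
+    (
+        tf.matmul(caption_embeddings, caption_embeddings, transpose_b=True)
+        + tf.matmul(image_embeddings, image_embeddings, transpose_b=True)
+    )
+    / (2 * temperature)
+)
+captions_loss = keras.losses.categorical_crossentropy(targets, logits, from_logits=True)
+images_loss = keras.losses.categorical_crossentropy(
+    tf.transpose(targets), tf.transpose(logits), from_logits=True
+)
+print(tf.reduce_mean((captions_loss + images_loss) / 2))
+```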
+"""
+
+
+class DualEncoder(keras.Model):
+    def __init__(self, text_encoder, image_encoder, temperature=1.0, **kwargs):
+        super().__init__(**kwargs)
+        self.text_encoder = text_encoder
+        self.image_encoder = image_encoder
+        self.temperature = temperature
+        self.loss_tracker = keras.metrics.Mean(name="loss")
+
+    @property
+    def metrics(self):
+        return [self.loss_tracker]
+
+    def call(self, features, training=False):
+        # Place each encoder on a separate GPU (if available).
+        # TF will fall back to available devices if there are fewer than 2 GPUs.
+        with tf.device("/gpu:0"):
+            # Get the embeddings for the captions.
+            caption_embeddings = self.text_encoder(
+                features["caption"], training=training
+            )
+        with tf.device("/gpu:1"):
+            # Get the embeddings for the images.
+            image_embeddings = self.image_encoder(features["image"], training=training)
+        return caption_embeddings, image_embeddings
+
+    def compute_loss(self, caption_embeddings, image_embeddings):
+        # logits[i][j] is the dot_similarity(caption_i, image_j).
+        logits = (
+            tf.matmul(caption_embeddings, image_embeddings, transpose_b=True)
+            / self.temperature
+        )
+        # images_similarity[i][j] is the dot_similarity(image_i, image_j).
+        images_similarity = tf.matmul(
+            image_embeddings, image_embeddings, transpose_b=True
+        )
+        # captions_similarity[i][j] is the dot_similarity(caption_i, caption_j).
+        captions_similarity = tf.matmul(
+            caption_embeddings, caption_embeddings, transpose_b=True
+        )
+        # targets[i][j] is the average of dot_similarity(caption_i, caption_j)
+        # and dot_similarity(image_i, image_j).
+        targets = keras.activations.softmax(
+            (captions_similarity + images_similarity) / (2 * self.temperature)
+        )
+        # Compute the loss for the captions using crossentropy.
+        captions_loss = keras.losses.categorical_crossentropy(
+            y_true=targets, y_pred=logits, from_logits=True
+        )
+        # Compute the loss for the images using crossentropy.
+        images_loss = keras.losses.categorical_crossentropy(
+            y_true=tf.transpose(targets), y_pred=tf.transpose(logits), from_logits=True
+        )
+        # Return the average of the caption and image losses.
+        return (captions_loss + images_loss) / 2
+
+    def train_step(self, features):
+        with tf.GradientTape() as tape:
+            # Forward pass
+            caption_embeddings, image_embeddings = self(features, training=True)
+            loss = self.compute_loss(caption_embeddings, image_embeddings)
+        # Backward pass
+        gradients = tape.gradient(loss, self.trainable_variables)
+        self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
+        # Monitor loss
+        self.loss_tracker.update_state(loss)
+        return {"loss": self.loss_tracker.result()}
+
+    def test_step(self, features):
+        caption_embeddings, image_embeddings = self(features, training=False)
+        loss = self.compute_loss(caption_embeddings, image_embeddings)
+        self.loss_tracker.update_state(loss)
+        return {"loss": self.loss_tracker.result()}
+
+
+"""
+## Train the dual encoder model
+
+In this experiment, we freeze the base encoders for text and images, and make only
+the projection head trainable.
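+
+If you have the compute budget, you can also fine-tune the base encoders by creating
+them with `trainable=True`, for example
+`create_vision_encoder(num_projection_layers=1, projection_dims=256, dropout_rate=0.1, trainable=True)`;
+expect a much larger memory footprint and slower epochs.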
+""" + +num_epochs = 5 # In practice, train for at least 30 epochs +batch_size = 256 + +vision_encoder = create_vision_encoder( + num_projection_layers=1, projection_dims=256, dropout_rate=0.1 +) +text_encoder = create_text_encoder( + num_projection_layers=1, projection_dims=256, dropout_rate=0.1 +) +dual_encoder = DualEncoder(text_encoder, vision_encoder, temperature=0.05) +dual_encoder.compile( + optimizer=tfa.optimizers.AdamW(learning_rate=0.001, weight_decay=0.001) +) + +""" +Note that training the model with 60,000 image-caption pairs, with a batch size of 256, +takes around 12 minutes per epoch using a V100 GPU accelerator. If 2 GPUs are available, +the epoch takes around 8 minutes. +""" + +print(f"Number of GPUs: {len(tf.config.list_physical_devices('GPU'))}") +print(f"Number of examples (caption-image pairs): {train_example_count}") +print(f"Batch size: {batch_size}") +print(f"Steps per epoch: {int(np.ceil(train_example_count / batch_size))}") +train_dataset = get_dataset(os.path.join(tfrecords_dir, "train-*.tfrecord"), batch_size) +valid_dataset = get_dataset(os.path.join(tfrecords_dir, "valid-*.tfrecord"), batch_size) +# Create a learning rate scheduler callback. +reduce_lr = keras.callbacks.ReduceLROnPlateau( + monitor="val_loss", factor=0.2, patience=3 +) +# Create an early stopping callback. +early_stopping = tf.keras.callbacks.EarlyStopping( + monitor="val_loss", patience=5, restore_best_weights=True +) +history = dual_encoder.fit( + train_dataset, + epochs=num_epochs, + validation_data=valid_dataset, + callbacks=[reduce_lr, early_stopping], +) +print("Training completed. Saving vision and text encoders...") +vision_encoder.save("vision_encoder") +text_encoder.save("text_encoder") +print("Models are saved.") + +""" +Plotting the training loss: +""" + +plt.plot(history.history["loss"]) +plt.plot(history.history["val_loss"]) +plt.ylabel("Loss") +plt.xlabel("Epoch") +plt.legend(["train", "valid"], loc="upper right") +plt.show() + +""" +## Search for images using natural language queries + +We can then retrieve images corresponding to natural language queries via +the following steps: + +1. Generate embeddings for the images by feeding them into the `vision_encoder`. +2. Feed the natural language query to the `text_encoder` to generate a query embedding. +3. Compute the similarity between the query embedding and the image embeddings +in the index to retrieve the indices of the top matches. +4. Look up the paths of the top matching images to display them. + +Note that, after training the `dual encoder`, only the fine-tuned `vision_encoder` +and `text_encoder` models will be used, while the `dual_encoder` model will be discarded. +""" + +""" +### Generate embeddings for the images + +We load the images and feed them into the `vision_encoder` to generate their embeddings. +In large scale systems, this step is performed using a parallel data processing framework, +such as [Apache Spark](https://spark.apache.org) or [Apache Beam](https://beam.apache.org). +Generating the image embeddings may take several minutes. 
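+
+Since the embeddings only need to be computed once per trained model, it can be
+convenient to persist them; a minimal sketch (assuming you keep the `image_embeddings`
+array produced below) would be:
+
+```python
+# Illustrative only: cache the image embeddings so the index can be reused later.
+np.save("image_embeddings.npy", image_embeddings)
+# ...then reload them without re-running the vision encoder.
+image_embeddings = np.load("image_embeddings.npy")
+```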
+""" +print("Loading vision and text encoders...") +vision_encoder = keras.models.load_model("vision_encoder") +text_encoder = keras.models.load_model("text_encoder") +print("Models are loaded.") + + +def read_image(image_path): + image_array = tf.image.decode_jpeg(tf.io.read_file(image_path), channels=3) + return tf.image.resize(image_array, (299, 299)) + + +print(f"Generating embeddings for {len(image_paths)} images...") +image_embeddings = vision_encoder.predict( + tf.data.Dataset.from_tensor_slices(image_paths).map(read_image).batch(batch_size), + verbose=1, +) +print(f"Image embeddings shape: {image_embeddings.shape}.") + +""" +### Retrieve relevant images + +In this example, we use exact matching by computing the dot product similarity +between the input query embedding and the image embeddings, and retrieve the top k +matches. However, *approximate* similarity matching, using frameworks like +[ScaNN](https://github.com/google-research/google-research/tree/master/scann), +[Annoy](https://github.com/spotify/annoy), or [Faiss](https://github.com/facebookresearch/faiss) +is preferred in real-time use cases to scale with a large number of images. +""" + + +def find_matches(image_embeddings, queries, k=9, normalize=True): + # Get the embedding for the query. + query_embedding = text_encoder(tf.convert_to_tensor(queries)) + # Normalize the query and the image embeddings. + if normalize: + image_embeddings = tf.math.l2_normalize(image_embeddings, axis=1) + query_embedding = tf.math.l2_normalize(query_embedding, axis=1) + # Compute the dot product between the query and the image embeddings. + dot_similarity = tf.matmul(query_embedding, image_embeddings, transpose_b=True) + # Retrieve top k indices. + results = tf.math.top_k(dot_similarity, k).indices.numpy() + # Return matching image paths. + return [[image_paths[idx] for idx in indices] for indices in results] + + +""" +Set the `query` variable to the type of images you want to search for. +Try things like: 'a plate of healthy food', +'a woman wearing a hat is walking down a sidewalk', +'a bird sits near to the water', or 'wild animals are standing in a field'. +""" + +query = "a family standing next to the ocean on a sandy beach with a surf board" +matches = find_matches(image_embeddings, [query], normalize=True)[0] + +plt.figure(figsize=(20, 20)) +for i in range(9): + ax = plt.subplot(3, 3, i + 1) + plt.imshow(mpimg.imread(matches[i])) + plt.axis("off") + + +""" +## Evaluate the retrieval quality + +To evaluate the dual encoder model, we use the captions as queries. +We use the out-of-training-sample images and captions to evaluate the retrieval quality, +using top k accuracy. A true prediction is counted if, for a given caption, its associated image +is retrieved within the top k matches. 
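+In information-retrieval terms, this is recall@k with a single relevant image per
+query: the score is the fraction of captions whose paired image appears among the
+top k retrieved images.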
+""" + + +def compute_top_k_accuracy(image_paths, k=100): + hits = 0 + num_batches = int(np.ceil(len(image_paths) / batch_size)) + for idx in tqdm(range(num_batches)): + start_idx = idx * batch_size + end_idx = start_idx + batch_size + current_image_paths = image_paths[start_idx:end_idx] + queries = [ + image_path_to_caption[image_path][0] for image_path in current_image_paths + ] + result = find_matches(image_embeddings, queries, k) + hits += sum( + [ + image_path in matches + for (image_path, matches) in list(zip(current_image_paths, result)) + ] + ) + + return hits / len(image_paths) + + +print("Scoring training data...") +train_accuracy = compute_top_k_accuracy(train_image_paths) +print(f"Train accuracy: {round(train_accuracy * 100, 3)}%") + +print("Scoring evaluation data...") +eval_accuracy = compute_top_k_accuracy(image_paths[train_size:]) +print(f"Eval accuracy: {round(eval_accuracy * 100, 3)}%") + + +""" +## Final remarks + +You can obtain better results by increasing the size of the training sample, +train for more epochs, explore other base encoders for images and text, +set the base encoders to be trainable, and tune the hyperparameters, +especially the `temperature` for the softmax in the loss computation. + +Example available on HuggingFace + +| Trained Model | Demo | +| :--: | :--: | +| [![Generic badge](https://img.shields.io/badge/%F0%9F%A4%97%20Model-nl%20image%20search-black.svg)](https://huggingface.co/keras-io/dual-encoder-image-search) | [![Generic badge](https://img.shields.io/badge/%F0%9F%A4%97%20Spaces-nl%20image%20search-black.svg)](https://huggingface.co/spaces/keras-io/dual-encoder-image-search) | +""" diff --git a/knowledge_base/vision/nnclr.py b/knowledge_base/vision/nnclr.py new file mode 100644 index 0000000000000000000000000000000000000000..68c1443928aa331101692777f9f797f718d9856c --- /dev/null +++ b/knowledge_base/vision/nnclr.py @@ -0,0 +1,527 @@ +""" +Title: Self-supervised contrastive learning with NNCLR +Author: [Rishit Dagli](https://twitter.com/rishit_dagli) +Date created: 2021/09/13 +Last modified: 2024/01/22 +Description: Implementation of NNCLR, a self-supervised learning method for computer vision. +Accelerator: GPU +""" + +""" +## Introduction + +### Self-supervised learning + +Self-supervised representation learning aims to obtain robust representations of samples +from raw data without expensive labels or annotations. Early methods in this field +focused on defining pretraining tasks which involved a surrogate task on a domain with ample +weak supervision labels. Encoders trained to solve such tasks are expected to +learn general features that might be useful for other downstream tasks requiring +expensive annotations like image classification. + +### Contrastive Learning + +A broad category of self-supervised learning techniques are those that use *contrastive +losses*, which have been used in a wide range of computer vision applications like +[image similarity](https://www.jmlr.org/papers/v11/chechik10a.html), +[dimensionality reduction (DrLIM)](http://yann.lecun.com/exdb/publis/pdf/hadsell-chopra-lecun-06.pdf) +and [face verification/identification](https://openaccess.thecvf.com/content_cvpr_2015/html/Schroff_FaceNet_A_Unified_2015_CVPR_paper.html). +These methods learn a latent space that clusters positive samples together while +pushing apart negative samples. 
+
+### NNCLR
+
+In this example, we implement NNCLR as proposed in the paper
+[With a Little Help from My Friends: Nearest-Neighbor Contrastive Learning of Visual Representations](https://arxiv.org/abs/2104.14548),
+by Google Research and DeepMind.
+
+NNCLR learns self-supervised representations that go beyond single-instance positives, which
+allows for learning better features that are invariant to different viewpoints, deformations,
+and even intra-class variations.
+Clustering-based methods offer a great approach to go beyond single-instance positives,
+but assuming the entire cluster to be positives could hurt performance due to early
+over-generalization. Instead, NNCLR uses nearest neighbors in the learned representation
+space as positives.
+In addition, NNCLR increases the performance of existing contrastive learning methods like
+[SimCLR](https://arxiv.org/abs/2002.05709) ([Keras Example](https://keras.io/examples/vision/semisupervised_simclr))
+and reduces the reliance of self-supervised methods on data augmentation strategies.
+
+Here is a great visualization by the paper authors showing how NNCLR builds on ideas from
+SimCLR:
+
+![](https://i.imgur.com/p2DbZJJ.png)
+
+We can see that SimCLR uses two views of the same image as the positive pair. These two
+views, which are produced using random data augmentations, are fed through an encoder to
+obtain the positive embedding pair; in other words, two augmentations of the same image
+are used. NNCLR instead keeps a _support set_ of embeddings representing the full data
+distribution, and forms the positive pairs using nearest neighbours. The support set is
+used as memory during training, similar to a first-in-first-out queue, as in
+[MoCo](https://arxiv.org/abs/1911.05722).
+
+This example requires `tensorflow_datasets`, which can
+be installed with this command:
+"""
+
+"""shell
+pip install tensorflow-datasets
+"""
+
+"""
+## Setup
+"""
+
+import matplotlib.pyplot as plt
+import tensorflow as tf
+import tensorflow_datasets as tfds
+import os
+
+os.environ["KERAS_BACKEND"] = "tensorflow"
+import keras
+import keras_cv
+from keras import ops
+from keras import layers
+
+"""
+## Hyperparameters
+
+A greater `queue_size` most likely means better performance, as shown in the original
+paper, but introduces significant computational overhead. The authors show that the best
+results of NNCLR are achieved with a queue size of 98,304 (the largest `queue_size` they
+experimented with). Here we use 10,000 to show a working example.
+"""
+
+AUTOTUNE = tf.data.AUTOTUNE
+shuffle_buffer = 5000
+# The below two values are taken from https://www.tensorflow.org/datasets/catalog/stl10
+labelled_train_images = 5000
+unlabelled_images = 100000
+
+temperature = 0.1
+queue_size = 10000
+contrastive_augmenter = {
+    "brightness": 0.5,
+    "name": "contrastive_augmenter",
+    "scale": (0.2, 1.0),
+}
+classification_augmenter = {
+    "brightness": 0.2,
+    "name": "classification_augmenter",
+    "scale": (0.5, 1.0),
+}
+input_shape = (96, 96, 3)
+width = 128
+num_epochs = 5  # Use 25 for better results
+steps_per_epoch = 50  # Use 200 for better results
+
+"""
+## Load the Dataset
+
+We load the [STL-10](http://ai.stanford.edu/~acoates/stl10/) dataset from
+TensorFlow Datasets, an image recognition dataset for developing unsupervised
+feature learning, deep learning, and self-taught learning algorithms. It is inspired by
+the CIFAR-10 dataset, with some modifications.
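+
+STL-10 provides 5,000 labeled training images, 100,000 unlabeled images, and 8,000
+test images, all at a 96x96 resolution, which is where the `labelled_train_images`,
+`unlabelled_images`, and `input_shape` values above come from. To verify these numbers
+yourself, a quick (optional) check with TFDS would be:
+
+```python
+# Illustrative only: inspect the split sizes reported by TFDS.
+builder = tfds.builder("stl10")
+print(builder.info.splits["train"].num_examples)  # 5000
+print(builder.info.splits["unlabelled"].num_examples)  # 100000
+print(builder.info.splits["test"].num_examples)  # 8000
+```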
+""" + +dataset_name = "stl10" + + +def prepare_dataset(): + unlabeled_batch_size = unlabelled_images // steps_per_epoch + labeled_batch_size = labelled_train_images // steps_per_epoch + batch_size = unlabeled_batch_size + labeled_batch_size + + unlabeled_train_dataset = ( + tfds.load( + dataset_name, split="unlabelled", as_supervised=True, shuffle_files=True + ) + .shuffle(buffer_size=shuffle_buffer) + .batch(unlabeled_batch_size, drop_remainder=True) + ) + labeled_train_dataset = ( + tfds.load(dataset_name, split="train", as_supervised=True, shuffle_files=True) + .shuffle(buffer_size=shuffle_buffer) + .batch(labeled_batch_size, drop_remainder=True) + ) + test_dataset = ( + tfds.load(dataset_name, split="test", as_supervised=True) + .batch(batch_size) + .prefetch(buffer_size=AUTOTUNE) + ) + train_dataset = tf.data.Dataset.zip( + (unlabeled_train_dataset, labeled_train_dataset) + ).prefetch(buffer_size=AUTOTUNE) + + return batch_size, train_dataset, labeled_train_dataset, test_dataset + + +batch_size, train_dataset, labeled_train_dataset, test_dataset = prepare_dataset() + +""" +## Augmentations + +Other self-supervised techniques like [SimCLR](https://arxiv.org/abs/2002.05709), +[BYOL](https://arxiv.org/abs/2006.07733), [SwAV](https://arxiv.org/abs/2006.09882) etc. +rely heavily on a well-designed data augmentation pipeline to get the best performance. +However, NNCLR is _less_ dependent on complex augmentations as nearest-neighbors already +provide richness in sample variations. A few common techniques often included +augmentation pipelines are: + +- Random resized crops +- Multiple color distortions +- Gaussian blur + +Since NNCLR is less dependent on complex augmentations, we will only use random +crops and random brightness for augmenting the input images. +""" + + +""" +### Prepare augmentation module +""" + + +def augmenter(brightness, name, scale): + return keras.Sequential( + [ + layers.Input(shape=input_shape), + layers.Rescaling(1 / 255), + layers.RandomFlip("horizontal"), + keras_cv.layers.RandomCropAndResize( + target_size=(input_shape[0], input_shape[1]), + crop_area_factor=scale, + aspect_ratio_factor=(3 / 4, 4 / 3), + ), + keras_cv.layers.RandomBrightness(factor=brightness, value_range=(0.0, 1.0)), + ], + name=name, + ) + + +""" +### Encoder architecture + +Using a ResNet-50 as the encoder architecture +is standard in the literature. In the original paper, the authors use ResNet-50 as +the encoder architecture and spatially average the outputs of ResNet-50. However, keep in +mind that more powerful models will not only increase training time but will also +require more memory and will limit the maximal batch size you can use. For the purpose of +this example, we just use four convolutional layers. +""" + + +def encoder(): + return keras.Sequential( + [ + layers.Input(shape=input_shape), + layers.Conv2D(width, kernel_size=3, strides=2, activation="relu"), + layers.Conv2D(width, kernel_size=3, strides=2, activation="relu"), + layers.Conv2D(width, kernel_size=3, strides=2, activation="relu"), + layers.Conv2D(width, kernel_size=3, strides=2, activation="relu"), + layers.Flatten(), + layers.Dense(width, activation="relu"), + ], + name="encoder", + ) + + +""" +## The NNCLR model for contrastive pre-training + +We train an encoder on unlabeled images with a contrastive loss. A nonlinear projection +head is attached to the top of the encoder, as it improves the quality of representations +of the encoder. 
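+
+The key NNCLR ingredient lives in the `nearest_neighbour()` method of the class below:
+each projection is swapped for its nearest neighbour from the support queue when
+computing the loss, while a straight-through trick keeps the gradient flowing to the
+original projection. A minimal sketch of that operation (names loosely follow the code
+below):
+
+```python
+# Illustrative only: nearest-neighbour lookup with a straight-through gradient.
+similarities = ops.matmul(projections, ops.transpose(feature_queue))  # (batch, queue_size)
+nn_projections = ops.take(feature_queue, ops.argmax(similarities, axis=1), axis=0)
+# The forward value is the neighbour; the gradient flows back to `projections`.
+output = projections + ops.stop_gradient(nn_projections - projections)
+```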
+""" + + +class NNCLR(keras.Model): + def __init__( + self, + temperature, + queue_size, + ): + super().__init__() + self.probe_accuracy = keras.metrics.SparseCategoricalAccuracy() + self.correlation_accuracy = keras.metrics.SparseCategoricalAccuracy() + self.contrastive_accuracy = keras.metrics.SparseCategoricalAccuracy() + self.probe_loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True) + + self.contrastive_augmenter = augmenter(**contrastive_augmenter) + self.classification_augmenter = augmenter(**classification_augmenter) + self.encoder = encoder() + self.projection_head = keras.Sequential( + [ + layers.Input(shape=(width,)), + layers.Dense(width, activation="relu"), + layers.Dense(width), + ], + name="projection_head", + ) + self.linear_probe = keras.Sequential( + [layers.Input(shape=(width,)), layers.Dense(10)], name="linear_probe" + ) + self.temperature = temperature + + feature_dimensions = self.encoder.output_shape[1] + self.feature_queue = keras.Variable( + keras.utils.normalize( + keras.random.normal(shape=(queue_size, feature_dimensions)), + axis=1, + order=2, + ), + trainable=False, + ) + + def compile(self, contrastive_optimizer, probe_optimizer, **kwargs): + super().compile(**kwargs) + self.contrastive_optimizer = contrastive_optimizer + self.probe_optimizer = probe_optimizer + + def nearest_neighbour(self, projections): + support_similarities = ops.matmul( + projections, ops.transpose(self.feature_queue) + ) + nn_projections = ops.take( + self.feature_queue, ops.argmax(support_similarities, axis=1), axis=0 + ) + return projections + ops.stop_gradient(nn_projections - projections) + + def update_contrastive_accuracy(self, features_1, features_2): + features_1 = keras.utils.normalize(features_1, axis=1, order=2) + features_2 = keras.utils.normalize(features_2, axis=1, order=2) + similarities = ops.matmul(features_1, ops.transpose(features_2)) + batch_size = ops.shape(features_1)[0] + contrastive_labels = ops.arange(batch_size) + self.contrastive_accuracy.update_state( + ops.concatenate([contrastive_labels, contrastive_labels], axis=0), + ops.concatenate([similarities, ops.transpose(similarities)], axis=0), + ) + + def update_correlation_accuracy(self, features_1, features_2): + features_1 = (features_1 - ops.mean(features_1, axis=0)) / ops.std( + features_1, axis=0 + ) + features_2 = (features_2 - ops.mean(features_2, axis=0)) / ops.std( + features_2, axis=0 + ) + + batch_size = ops.shape(features_1)[0] + cross_correlation = ( + ops.matmul(ops.transpose(features_1), features_2) / batch_size + ) + + feature_dim = ops.shape(features_1)[1] + correlation_labels = ops.arange(feature_dim) + self.correlation_accuracy.update_state( + ops.concatenate([correlation_labels, correlation_labels], axis=0), + ops.concatenate( + [cross_correlation, ops.transpose(cross_correlation)], axis=0 + ), + ) + + def contrastive_loss(self, projections_1, projections_2): + projections_1 = keras.utils.normalize(projections_1, axis=1, order=2) + projections_2 = keras.utils.normalize(projections_2, axis=1, order=2) + + similarities_1_2_1 = ( + ops.matmul( + self.nearest_neighbour(projections_1), ops.transpose(projections_2) + ) + / self.temperature + ) + similarities_1_2_2 = ( + ops.matmul( + projections_2, ops.transpose(self.nearest_neighbour(projections_1)) + ) + / self.temperature + ) + + similarities_2_1_1 = ( # + ops.matmul( + self.nearest_neighbour(projections_2), ops.transpose(projections_1) + ) + / self.temperature + ) + similarities_2_1_2 = ( + ops.matmul( + projections_1, 
ops.transpose(self.nearest_neighbour(projections_2)) + ) + / self.temperature + ) + + batch_size = ops.shape(projections_1)[0] + contrastive_labels = ops.arange(batch_size) + loss = keras.losses.sparse_categorical_crossentropy( + ops.concatenate( + [ + contrastive_labels, + contrastive_labels, + contrastive_labels, + contrastive_labels, + ], + axis=0, + ), + ops.concatenate( + [ + similarities_1_2_1, + similarities_1_2_2, + similarities_2_1_1, + similarities_2_1_2, + ], + axis=0, + ), + from_logits=True, + ) + + self.feature_queue.assign( + ops.concatenate([projections_1, self.feature_queue[:-batch_size]], axis=0) + ) + return loss + + def train_step(self, data): + (unlabeled_images, _), (labeled_images, labels) = data + images = ops.concatenate((unlabeled_images, labeled_images), axis=0) + augmented_images_1 = self.contrastive_augmenter(images) + augmented_images_2 = self.contrastive_augmenter(images) + + with tf.GradientTape() as tape: + features_1 = self.encoder(augmented_images_1) + features_2 = self.encoder(augmented_images_2) + projections_1 = self.projection_head(features_1) + projections_2 = self.projection_head(features_2) + contrastive_loss = self.contrastive_loss(projections_1, projections_2) + gradients = tape.gradient( + contrastive_loss, + self.encoder.trainable_weights + self.projection_head.trainable_weights, + ) + self.contrastive_optimizer.apply_gradients( + zip( + gradients, + self.encoder.trainable_weights + self.projection_head.trainable_weights, + ) + ) + self.update_contrastive_accuracy(features_1, features_2) + self.update_correlation_accuracy(features_1, features_2) + preprocessed_images = self.classification_augmenter(labeled_images) + + with tf.GradientTape() as tape: + features = self.encoder(preprocessed_images) + class_logits = self.linear_probe(features) + probe_loss = self.probe_loss(labels, class_logits) + gradients = tape.gradient(probe_loss, self.linear_probe.trainable_weights) + self.probe_optimizer.apply_gradients( + zip(gradients, self.linear_probe.trainable_weights) + ) + self.probe_accuracy.update_state(labels, class_logits) + + return { + "c_loss": contrastive_loss, + "c_acc": self.contrastive_accuracy.result(), + "r_acc": self.correlation_accuracy.result(), + "p_loss": probe_loss, + "p_acc": self.probe_accuracy.result(), + } + + def test_step(self, data): + labeled_images, labels = data + + preprocessed_images = self.classification_augmenter( + labeled_images, training=False + ) + features = self.encoder(preprocessed_images, training=False) + class_logits = self.linear_probe(features, training=False) + probe_loss = self.probe_loss(labels, class_logits) + + self.probe_accuracy.update_state(labels, class_logits) + return {"p_loss": probe_loss, "p_acc": self.probe_accuracy.result()} + + +""" +## Pre-train NNCLR + +We train the network using a `temperature` of 0.1 as suggested in the paper and +a `queue_size` of 10,000 as explained earlier. We use Adam as our contrastive and probe +optimizer. For this example we train the model for only 30 epochs but it should be +trained for more epochs for better performance. 
+ +The following two metrics can be used for monitoring the pretraining performance +which we also log (taken from +[this Keras example](https://keras.io/examples/vision/semisupervised_simclr/#selfsupervised-model-for-contrastive-pretraining)): + +- Contrastive accuracy: self-supervised metric, the ratio of cases in which the +representation of an image is more similar to its differently augmented version's one, +than to the representation of any other image in the current batch. Self-supervised +metrics can be used for hyperparameter tuning even in the case when there are no labeled +examples. +- Linear probing accuracy: linear probing is a popular metric to evaluate self-supervised +classifiers. It is computed as the accuracy of a logistic regression classifier trained +on top of the encoder's features. In our case, this is done by training a single dense +layer on top of the frozen encoder. Note that contrary to traditional approach where the +classifier is trained after the pretraining phase, in this example we train it during +pretraining. This might slightly decrease its accuracy, but that way we can monitor its +value during training, which helps with experimentation and debugging. +""" + +model = NNCLR(temperature=temperature, queue_size=queue_size) +model.compile( + contrastive_optimizer=keras.optimizers.Adam(), + probe_optimizer=keras.optimizers.Adam(), + jit_compile=False, +) +pretrain_history = model.fit( + train_dataset, epochs=num_epochs, validation_data=test_dataset +) + +""" +## Evaluate our model + +A popular way to evaluate a SSL method in computer vision or for that fact any other +pre-training method as such is to learn a linear classifier on the frozen features of the +trained backbone model and evaluate the classifier on unseen images. Other methods often +include fine-tuning on the source dataset or even a target dataset with 5% or 10% labels +present. You can use the backbone we just trained for any downstream task such as image +classification (like we do here) or segmentation or detection, where the backbone models +are usually pre-trained with supervised learning. +""" + +finetuning_model = keras.Sequential( + [ + layers.Input(shape=input_shape), + augmenter(**classification_augmenter), + model.encoder, + layers.Dense(10), + ], + name="finetuning_model", +) +finetuning_model.compile( + optimizer=keras.optimizers.Adam(), + loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), + metrics=[keras.metrics.SparseCategoricalAccuracy(name="acc")], + jit_compile=False, +) + +finetuning_history = finetuning_model.fit( + labeled_train_dataset, epochs=num_epochs, validation_data=test_dataset +) + +""" +Self supervised learning is particularly helpful when you do only have access to very +limited labeled training data but you can manage to build a large corpus of unlabeled +data as shown by previous methods like [SEER](https://arxiv.org/abs/2103.01988), +[SimCLR](https://arxiv.org/abs/2002.05709), [SwAV](https://arxiv.org/abs/2006.09882) and +more. 
+ +You should also take a look at the blog posts for these papers which neatly show that it is +possible to achieve good results with few class labels by first pretraining on a large +unlabeled dataset and then fine-tuning on a smaller labeled dataset: + +- [Advancing Self-Supervised and Semi-Supervised Learning with SimCLR](https://ai.googleblog.com/2020/04/advancing-self-supervised-and-semi.html) +- [High-performance self-supervised image classification with contrastive clustering](https://ai.facebook.com/blog/high-performance-self-supervised-image-classification-with-contrastive-clustering/) +- [Self-supervised learning: The dark matter of intelligence](https://ai.facebook.com/blog/self-supervised-learning-the-dark-matter-of-intelligence/) + +You are also advised to check out the [original paper](https://arxiv.org/abs/2104.14548). + +*Many thanks to [Debidatta Dwibedi](https://twitter.com/debidatta) (Google Research), +primary author of the NNCLR paper for his super-insightful reviews for this example. +This example also takes inspiration from the [SimCLR Keras Example](https://keras.io/examples/vision/semisupervised_simclr/).* +""" diff --git a/knowledge_base/vision/object_detection_using_vision_transformer.py b/knowledge_base/vision/object_detection_using_vision_transformer.py new file mode 100644 index 0000000000000000000000000000000000000000..cb93f23a4673f394f87d7e4cc4310584caf78214 --- /dev/null +++ b/knowledge_base/vision/object_detection_using_vision_transformer.py @@ -0,0 +1,521 @@ +""" +Title: Object detection with Vision Transformers +Author: [Karan V. Dave](https://www.linkedin.com/in/karan-dave-811413164/) +Date created: 2022/03/27 +Last modified: 2023/11/20 +Description: A simple Keras implementation of object detection using Vision Transformers. +Accelerator: GPU +""" + +""" +## Introduction + +The article +[Vision Transformer (ViT)](https://arxiv.org/abs/2010.11929) +architecture by Alexey Dosovitskiy et al. +demonstrates that a pure transformer applied directly to sequences of image +patches can perform well on object detection tasks. + +In this Keras example, we implement an object detection ViT +and we train it on the +[Caltech 101 dataset](http://www.vision.caltech.edu/datasets/) +to detect an airplane in the given image. +""" + +""" +## Imports and setup +""" + +import os + +os.environ["KERAS_BACKEND"] = "jax" # @param ["tensorflow", "jax", "torch"] + + +import numpy as np +import keras +from keras import layers +from keras import ops +import matplotlib.pyplot as plt +import numpy as np +import cv2 +import os +import scipy.io +import shutil + +""" +## Prepare dataset + +We use the [Caltech 101 Dataset](https://data.caltech.edu/records/mzrjq-6wc02). +""" + +# Path to images and annotations +path_images = "./101_ObjectCategories/airplanes/" +path_annot = "./Annotations/Airplanes_Side_2/" + +path_to_downloaded_file = keras.utils.get_file( + fname="caltech_101_zipped", + origin="https://data.caltech.edu/records/mzrjq-6wc02/files/caltech-101.zip", + extract=True, + archive_format="zip", # downloaded file format + cache_dir="/", # cache and extract in current directory +) +download_base_dir = os.path.dirname(path_to_downloaded_file) + +# Extracting tar files found inside main zip file +shutil.unpack_archive( + os.path.join(download_base_dir, "caltech-101", "101_ObjectCategories.tar.gz"), "." +) +shutil.unpack_archive( + os.path.join(download_base_dir, "caltech-101", "Annotations.tar"), "." 
+) + +# list of paths to images and annotations +image_paths = [ + f for f in os.listdir(path_images) if os.path.isfile(os.path.join(path_images, f)) +] +annot_paths = [ + f for f in os.listdir(path_annot) if os.path.isfile(os.path.join(path_annot, f)) +] + +image_paths.sort() +annot_paths.sort() + +image_size = 224 # resize input images to this size + +images, targets = [], [] + +# loop over the annotations and images, preprocess them and store in lists +for i in range(0, len(annot_paths)): + # Access bounding box coordinates + annot = scipy.io.loadmat(path_annot + annot_paths[i])["box_coord"][0] + + top_left_x, top_left_y = annot[2], annot[0] + bottom_right_x, bottom_right_y = annot[3], annot[1] + + image = keras.utils.load_img( + path_images + image_paths[i], + ) + (w, h) = image.size[:2] + + # resize images + image = image.resize((image_size, image_size)) + + # convert image to array and append to list + images.append(keras.utils.img_to_array(image)) + + # apply relative scaling to bounding boxes as per given image and append to list + targets.append( + ( + float(top_left_x) / w, + float(top_left_y) / h, + float(bottom_right_x) / w, + float(bottom_right_y) / h, + ) + ) + +# Convert the list to numpy array, split to train and test dataset +(x_train), (y_train) = ( + np.asarray(images[: int(len(images) * 0.8)]), + np.asarray(targets[: int(len(targets) * 0.8)]), +) +(x_test), (y_test) = ( + np.asarray(images[int(len(images) * 0.8) :]), + np.asarray(targets[int(len(targets) * 0.8) :]), +) + +""" +## Implement multilayer-perceptron (MLP) + +We use the code from the Keras example +[Image classification with Vision Transformer](https://keras.io/examples/vision/image_classification_with_vision_transformer/) +as a reference. +""" + + +def mlp(x, hidden_units, dropout_rate): + for units in hidden_units: + x = layers.Dense(units, activation=keras.activations.gelu)(x) + x = layers.Dropout(dropout_rate)(x) + return x + + +""" +## Implement the patch creation layer +""" + + +class Patches(layers.Layer): + def __init__(self, patch_size): + super().__init__() + self.patch_size = patch_size + + def call(self, images): + input_shape = ops.shape(images) + batch_size = input_shape[0] + height = input_shape[1] + width = input_shape[2] + channels = input_shape[3] + num_patches_h = height // self.patch_size + num_patches_w = width // self.patch_size + patches = keras.ops.image.extract_patches(images, size=self.patch_size) + patches = ops.reshape( + patches, + ( + batch_size, + num_patches_h * num_patches_w, + self.patch_size * self.patch_size * channels, + ), + ) + return patches + + def get_config(self): + config = super().get_config() + config.update({"patch_size": self.patch_size}) + return config + + +""" +## Display patches for an input image +""" + +patch_size = 32 # Size of the patches to be extracted from the input images + +plt.figure(figsize=(4, 4)) +plt.imshow(x_train[0].astype("uint8")) +plt.axis("off") + +patches = Patches(patch_size)(np.expand_dims(x_train[0], axis=0)) +print(f"Image size: {image_size} X {image_size}") +print(f"Patch size: {patch_size} X {patch_size}") +print(f"{patches.shape[1]} patches per image \n{patches.shape[-1]} elements per patch") + + +n = int(np.sqrt(patches.shape[1])) +plt.figure(figsize=(4, 4)) +for i, patch in enumerate(patches[0]): + ax = plt.subplot(n, n, i + 1) + patch_img = ops.reshape(patch, (patch_size, patch_size, 3)) + plt.imshow(ops.convert_to_numpy(patch_img).astype("uint8")) + plt.axis("off") + +""" +## Implement the patch encoding layer + +The 
`PatchEncoder` layer linearly transforms a patch by projecting it into a +vector of size `projection_dim`. It also adds a learnable position +embedding to the projected vector. +""" + + +class PatchEncoder(layers.Layer): + def __init__(self, num_patches, projection_dim): + super().__init__() + self.num_patches = num_patches + self.projection = layers.Dense(units=projection_dim) + self.position_embedding = layers.Embedding( + input_dim=num_patches, output_dim=projection_dim + ) + + # Override function to avoid error while saving model + def get_config(self): + config = super().get_config().copy() + config.update( + { + "input_shape": input_shape, + "patch_size": patch_size, + "num_patches": num_patches, + "projection_dim": projection_dim, + "num_heads": num_heads, + "transformer_units": transformer_units, + "transformer_layers": transformer_layers, + "mlp_head_units": mlp_head_units, + } + ) + return config + + def call(self, patch): + positions = ops.expand_dims( + ops.arange(start=0, stop=self.num_patches, step=1), axis=0 + ) + projected_patches = self.projection(patch) + encoded = projected_patches + self.position_embedding(positions) + return encoded + + +""" +## Build the ViT model + +The ViT model has multiple Transformer blocks. +The `MultiHeadAttention` layer is used for self-attention, +applied to the sequence of image patches. The encoded patches (skip connection) +and self-attention layer outputs are normalized and fed into a +multilayer perceptron (MLP). +The model outputs four dimensions representing +the bounding box coordinates of an object. +""" + + +def create_vit_object_detector( + input_shape, + patch_size, + num_patches, + projection_dim, + num_heads, + transformer_units, + transformer_layers, + mlp_head_units, +): + inputs = keras.Input(shape=input_shape) + # Create patches + patches = Patches(patch_size)(inputs) + # Encode patches + encoded_patches = PatchEncoder(num_patches, projection_dim)(patches) + + # Create multiple layers of the Transformer block. + for _ in range(transformer_layers): + # Layer normalization 1. + x1 = layers.LayerNormalization(epsilon=1e-6)(encoded_patches) + # Create a multi-head attention layer. + attention_output = layers.MultiHeadAttention( + num_heads=num_heads, key_dim=projection_dim, dropout=0.1 + )(x1, x1) + # Skip connection 1. + x2 = layers.Add()([attention_output, encoded_patches]) + # Layer normalization 2. + x3 = layers.LayerNormalization(epsilon=1e-6)(x2) + # MLP + x3 = mlp(x3, hidden_units=transformer_units, dropout_rate=0.1) + # Skip connection 2. + encoded_patches = layers.Add()([x3, x2]) + + # Create a [batch_size, projection_dim] tensor. + representation = layers.LayerNormalization(epsilon=1e-6)(encoded_patches) + representation = layers.Flatten()(representation) + representation = layers.Dropout(0.3)(representation) + # Add MLP. + features = mlp(representation, hidden_units=mlp_head_units, dropout_rate=0.3) + + bounding_box = layers.Dense(4)( + features + ) # Final four neurons that output bounding box + + # return Keras model. + return keras.Model(inputs=inputs, outputs=bounding_box) + + +""" +## Run the experiment +""" + + +def run_experiment(model, learning_rate, weight_decay, batch_size, num_epochs): + optimizer = keras.optimizers.AdamW( + learning_rate=learning_rate, weight_decay=weight_decay + ) + + # Compile model. 
+ model.compile(optimizer=optimizer, loss=keras.losses.MeanSquaredError()) + + checkpoint_filepath = "vit_object_detector.weights.h5" + checkpoint_callback = keras.callbacks.ModelCheckpoint( + checkpoint_filepath, + monitor="val_loss", + save_best_only=True, + save_weights_only=True, + ) + + history = model.fit( + x=x_train, + y=y_train, + batch_size=batch_size, + epochs=num_epochs, + validation_split=0.1, + callbacks=[ + checkpoint_callback, + keras.callbacks.EarlyStopping(monitor="val_loss", patience=10), + ], + ) + + return history + + +input_shape = (image_size, image_size, 3) # input image shape +learning_rate = 0.001 +weight_decay = 0.0001 +batch_size = 32 +num_epochs = 100 +num_patches = (image_size // patch_size) ** 2 +projection_dim = 64 +num_heads = 4 +# Size of the transformer layers +transformer_units = [ + projection_dim * 2, + projection_dim, +] +transformer_layers = 4 +mlp_head_units = [2048, 1024, 512, 64, 32] # Size of the dense layers + + +history = [] +num_patches = (image_size // patch_size) ** 2 + +vit_object_detector = create_vit_object_detector( + input_shape, + patch_size, + num_patches, + projection_dim, + num_heads, + transformer_units, + transformer_layers, + mlp_head_units, +) + +# Train model +history = run_experiment( + vit_object_detector, learning_rate, weight_decay, batch_size, num_epochs +) + + +def plot_history(item): + plt.plot(history.history[item], label=item) + plt.plot(history.history["val_" + item], label="val_" + item) + plt.xlabel("Epochs") + plt.ylabel(item) + plt.title("Train and Validation {} Over Epochs".format(item), fontsize=14) + plt.legend() + plt.grid() + plt.show() + + +plot_history("loss") + + +""" +## Evaluate the model +""" + +import matplotlib.patches as patches + +# Saves the model in current path +vit_object_detector.save("vit_object_detector.keras") + + +# To calculate IoU (intersection over union, given two bounding boxes) +def bounding_box_intersection_over_union(box_predicted, box_truth): + # get (x, y) coordinates of intersection of bounding boxes + top_x_intersect = max(box_predicted[0], box_truth[0]) + top_y_intersect = max(box_predicted[1], box_truth[1]) + bottom_x_intersect = min(box_predicted[2], box_truth[2]) + bottom_y_intersect = min(box_predicted[3], box_truth[3]) + + # calculate area of the intersection bb (bounding box) + intersection_area = max(0, bottom_x_intersect - top_x_intersect + 1) * max( + 0, bottom_y_intersect - top_y_intersect + 1 + ) + + # calculate area of the prediction bb and ground-truth bb + box_predicted_area = (box_predicted[2] - box_predicted[0] + 1) * ( + box_predicted[3] - box_predicted[1] + 1 + ) + box_truth_area = (box_truth[2] - box_truth[0] + 1) * ( + box_truth[3] - box_truth[1] + 1 + ) + + # calculate intersection over union by taking intersection + # area and dividing it by the sum of predicted bb and ground truth + # bb areas subtracted by the interesection area + + # return ioU + return intersection_area / float( + box_predicted_area + box_truth_area - intersection_area + ) + + +i, mean_iou = 0, 0 + +# Compare results for 10 images in the test set +for input_image in x_test[:10]: + fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 15)) + im = input_image + + # Display the image + ax1.imshow(im.astype("uint8")) + ax2.imshow(im.astype("uint8")) + + input_image = cv2.resize( + input_image, (image_size, image_size), interpolation=cv2.INTER_AREA + ) + input_image = np.expand_dims(input_image, axis=0) + preds = vit_object_detector.predict(input_image)[0] + + (h, w) = (im).shape[0:2] + + 
top_left_x, top_left_y = int(preds[0] * w), int(preds[1] * h) + + bottom_right_x, bottom_right_y = int(preds[2] * w), int(preds[3] * h) + + box_predicted = [top_left_x, top_left_y, bottom_right_x, bottom_right_y] + # Create the bounding box + rect = patches.Rectangle( + (top_left_x, top_left_y), + bottom_right_x - top_left_x, + bottom_right_y - top_left_y, + facecolor="none", + edgecolor="red", + linewidth=1, + ) + # Add the bounding box to the image + ax1.add_patch(rect) + ax1.set_xlabel( + "Predicted: " + + str(top_left_x) + + ", " + + str(top_left_y) + + ", " + + str(bottom_right_x) + + ", " + + str(bottom_right_y) + ) + + top_left_x, top_left_y = int(y_test[i][0] * w), int(y_test[i][1] * h) + + bottom_right_x, bottom_right_y = int(y_test[i][2] * w), int(y_test[i][3] * h) + + box_truth = top_left_x, top_left_y, bottom_right_x, bottom_right_y + + mean_iou += bounding_box_intersection_over_union(box_predicted, box_truth) + # Create the bounding box + rect = patches.Rectangle( + (top_left_x, top_left_y), + bottom_right_x - top_left_x, + bottom_right_y - top_left_y, + facecolor="none", + edgecolor="red", + linewidth=1, + ) + # Add the bounding box to the image + ax2.add_patch(rect) + ax2.set_xlabel( + "Target: " + + str(top_left_x) + + ", " + + str(top_left_y) + + ", " + + str(bottom_right_x) + + ", " + + str(bottom_right_y) + + "\n" + + "IoU" + + str(bounding_box_intersection_over_union(box_predicted, box_truth)) + ) + i = i + 1 + +print("mean_iou: " + str(mean_iou / len(x_test[:10]))) +plt.show() + +""" +This example demonstrates that a pure Transformer can be trained +to predict the bounding boxes of an object in a given image, +thus extending the use of Transformers to object detection tasks. +The model can be improved further by tuning hyper-parameters and pre-training. +""" diff --git a/knowledge_base/vision/oxford_pets_image_segmentation.py b/knowledge_base/vision/oxford_pets_image_segmentation.py new file mode 100644 index 0000000000000000000000000000000000000000..f624a18d86754b524bab14fa348719750aaf097b --- /dev/null +++ b/knowledge_base/vision/oxford_pets_image_segmentation.py @@ -0,0 +1,270 @@ +""" +Title: Image segmentation with a U-Net-like architecture +Author: [fchollet](https://twitter.com/fchollet) +Date created: 2019/03/20 +Last modified: 2020/04/20 +Description: Image segmentation model trained from scratch on the Oxford Pets dataset. 
+Accelerator: GPU +""" + +""" +## Download the data +""" + +"""shell +!wget https://www.robots.ox.ac.uk/~vgg/data/pets/data/images.tar.gz +!wget https://www.robots.ox.ac.uk/~vgg/data/pets/data/annotations.tar.gz + +curl -O https://thor.robots.ox.ac.uk/datasets/pets/images.tar.gz +curl -O https://thor.robots.ox.ac.uk/datasets/pets/annotations.tar.gz + +tar -xf images.tar.gz +tar -xf annotations.tar.gz +""" + +""" +## Prepare paths of input images and target segmentation masks +""" + +import os + +input_dir = "images/" +target_dir = "annotations/trimaps/" +img_size = (160, 160) +num_classes = 3 +batch_size = 32 + +input_img_paths = sorted( + [ + os.path.join(input_dir, fname) + for fname in os.listdir(input_dir) + if fname.endswith(".jpg") + ] +) +target_img_paths = sorted( + [ + os.path.join(target_dir, fname) + for fname in os.listdir(target_dir) + if fname.endswith(".png") and not fname.startswith(".") + ] +) + +print("Number of samples:", len(input_img_paths)) + +for input_path, target_path in zip(input_img_paths[:10], target_img_paths[:10]): + print(input_path, "|", target_path) + +""" +## What does one input image and corresponding segmentation mask look like? +""" + +from IPython.display import Image, display +from keras.utils import load_img +from PIL import ImageOps + +# Display input image #7 +display(Image(filename=input_img_paths[9])) + +# Display auto-contrast version of corresponding target (per-pixel categories) +img = ImageOps.autocontrast(load_img(target_img_paths[9])) +display(img) + +""" +## Prepare dataset to load & vectorize batches of data +""" + +import keras +import numpy as np +from tensorflow import data as tf_data +from tensorflow import image as tf_image +from tensorflow import io as tf_io + + +def get_dataset( + batch_size, + img_size, + input_img_paths, + target_img_paths, + max_dataset_len=None, +): + """Returns a TF Dataset.""" + + def load_img_masks(input_img_path, target_img_path): + input_img = tf_io.read_file(input_img_path) + input_img = tf_io.decode_png(input_img, channels=3) + input_img = tf_image.resize(input_img, img_size) + input_img = tf_image.convert_image_dtype(input_img, "float32") + + target_img = tf_io.read_file(target_img_path) + target_img = tf_io.decode_png(target_img, channels=1) + target_img = tf_image.resize(target_img, img_size, method="nearest") + target_img = tf_image.convert_image_dtype(target_img, "uint8") + + # Ground truth labels are 1, 2, 3. Subtract one to make them 0, 1, 2: + target_img -= 1 + return input_img, target_img + + # For faster debugging, limit the size of data + if max_dataset_len: + input_img_paths = input_img_paths[:max_dataset_len] + target_img_paths = target_img_paths[:max_dataset_len] + dataset = tf_data.Dataset.from_tensor_slices((input_img_paths, target_img_paths)) + dataset = dataset.map(load_img_masks, num_parallel_calls=tf_data.AUTOTUNE) + return dataset.batch(batch_size) + + +""" +## Prepare U-Net Xception-style model +""" + +from keras import layers + + +def get_model(img_size, num_classes): + inputs = keras.Input(shape=img_size + (3,)) + + ### [First half of the network: downsampling inputs] ### + + # Entry block + x = layers.Conv2D(32, 3, strides=2, padding="same")(inputs) + x = layers.BatchNormalization()(x) + x = layers.Activation("relu")(x) + + previous_block_activation = x # Set aside residual + + # Blocks 1, 2, 3 are identical apart from the feature depth. 
+ for filters in [64, 128, 256]: + x = layers.Activation("relu")(x) + x = layers.SeparableConv2D(filters, 3, padding="same")(x) + x = layers.BatchNormalization()(x) + + x = layers.Activation("relu")(x) + x = layers.SeparableConv2D(filters, 3, padding="same")(x) + x = layers.BatchNormalization()(x) + + x = layers.MaxPooling2D(3, strides=2, padding="same")(x) + + # Project residual + residual = layers.Conv2D(filters, 1, strides=2, padding="same")( + previous_block_activation + ) + x = layers.add([x, residual]) # Add back residual + previous_block_activation = x # Set aside next residual + + ### [Second half of the network: upsampling inputs] ### + + for filters in [256, 128, 64, 32]: + x = layers.Activation("relu")(x) + x = layers.Conv2DTranspose(filters, 3, padding="same")(x) + x = layers.BatchNormalization()(x) + + x = layers.Activation("relu")(x) + x = layers.Conv2DTranspose(filters, 3, padding="same")(x) + x = layers.BatchNormalization()(x) + + x = layers.UpSampling2D(2)(x) + + # Project residual + residual = layers.UpSampling2D(2)(previous_block_activation) + residual = layers.Conv2D(filters, 1, padding="same")(residual) + x = layers.add([x, residual]) # Add back residual + previous_block_activation = x # Set aside next residual + + # Add a per-pixel classification layer + outputs = layers.Conv2D(num_classes, 3, activation="softmax", padding="same")(x) + + # Define the model + model = keras.Model(inputs, outputs) + return model + + +# Build model +model = get_model(img_size, num_classes) +model.summary() + +""" +## Set aside a validation split +""" + +import random + +# Split our img paths into a training and a validation set +val_samples = 1000 +random.Random(1337).shuffle(input_img_paths) +random.Random(1337).shuffle(target_img_paths) +train_input_img_paths = input_img_paths[:-val_samples] +train_target_img_paths = target_img_paths[:-val_samples] +val_input_img_paths = input_img_paths[-val_samples:] +val_target_img_paths = target_img_paths[-val_samples:] + +# Instantiate dataset for each split +# Limit input files in `max_dataset_len` for faster epoch training time. +# Remove the `max_dataset_len` arg when running with full dataset. +train_dataset = get_dataset( + batch_size, + img_size, + train_input_img_paths, + train_target_img_paths, + max_dataset_len=1000, +) +valid_dataset = get_dataset( + batch_size, img_size, val_input_img_paths, val_target_img_paths +) + +""" +## Train the model +""" + +# Configure the model for training. +# We use the "sparse" version of categorical_crossentropy +# because our target data is integers. +model.compile( + optimizer=keras.optimizers.Adam(1e-4), loss="sparse_categorical_crossentropy" +) + +callbacks = [ + keras.callbacks.ModelCheckpoint("oxford_segmentation.keras", save_best_only=True) +] + +# Train the model, doing validation at the end of each epoch. 
+epochs = 50 +model.fit( + train_dataset, + epochs=epochs, + validation_data=valid_dataset, + callbacks=callbacks, + verbose=2, +) + +""" +## Visualize predictions +""" + +# Generate predictions for all images in the validation set + +val_dataset = get_dataset( + batch_size, img_size, val_input_img_paths, val_target_img_paths +) +val_preds = model.predict(val_dataset) + + +def display_mask(i): + """Quick utility to display a model's prediction.""" + mask = np.argmax(val_preds[i], axis=-1) + mask = np.expand_dims(mask, axis=-1) + img = ImageOps.autocontrast(keras.utils.array_to_img(mask)) + display(img) + + +# Display results for validation image #10 +i = 10 + +# Display input image +display(Image(filename=val_input_img_paths[i])) + +# Display ground-truth target mask +img = ImageOps.autocontrast(load_img(val_target_img_paths[i])) +display(img) + +# Display mask predicted by our model +display_mask(i) # Note that the model only sees inputs at 150x150. diff --git a/knowledge_base/vision/patch_convnet.py b/knowledge_base/vision/patch_convnet.py new file mode 100644 index 0000000000000000000000000000000000000000..19e364bc4ce1f969980bb318772b63661637619e --- /dev/null +++ b/knowledge_base/vision/patch_convnet.py @@ -0,0 +1,690 @@ +""" +Title: Augmenting convnets with aggregated attention +Author: [Aritra Roy Gosthipaty](https://twitter.com/ariG23498) +Date created: 2022/01/22 +Last modified: 2022/01/22 +Description: Building a patch-convnet architecture and visualizing its attention maps. +Accelerator: GPU +""" + +""" +## Introduction + +Vision transformers ([Dosovitskiy et. al](https://arxiv.org/abs/2010.11929)) +have emerged as a powerful alternative to Convolutional Neural Networks. +ViTs process the images in a patch-based manner. The image information +is then aggregated into a `CLASS` token. This token correlates to the +most important patches of the image for a particular classification decision. + +The interaction between the `CLASS` token and the patches can be visualized +to help explain a classification decision. In the academic paper +[Augmenting convolutional networks with attention-based aggregation](https://arxiv.org/abs/2112.13692) +by Touvron et. al, the authors propose to set up an equivalent visualization for +convnets. They propose to substitute the global average pooling layer +of a convnet with a Transformer layer. The self-attention layer of the +Transformer would produce attention maps that correspond to the +most attended patches of the image for the classification decision. + +In this example, we minimally implement the ideas of +[Augmenting Convolutional networks with attention-based aggregation](https://arxiv.org/abs/2112.13692). +The main goal of this example is to cover the following ideas, with +minor modifications (to adjust the implementation with CIFAR10): + +- The simple design for the attention-based pooling layer, such that + it explicitly provides the weights (importance) of the different + patches. +- The novel architecture of convnet is called the **PatchConvNet** which + deviates from the age old pyramidal architecture. 
+""" + +""" +## Setup and Imports + +This example requires TensorFlow Addons, which can be installed using +the following command: + +```shell +pip install -U tensorflow-addons +``` +""" + +import math +import numpy as np +import tensorflow as tf +from tensorflow import keras +import matplotlib.pyplot as plt +import keras +from keras import layers +from keras import ops +from tensorflow import data as tf_data + +# Set seed for reproducibiltiy +SEED = 42 +keras.utils.set_random_seed(SEED) + +""" +## Hyperparameters +""" + +# DATA +BATCH_SIZE = 128 +BUFFER_SIZE = BATCH_SIZE * 2 +AUTO = tf_data.AUTOTUNE +INPUT_SHAPE = (32, 32, 3) +NUM_CLASSES = 10 # for CIFAR 10 + +# AUGMENTATION +IMAGE_SIZE = 48 # We will resize input images to this size. + +# ARCHITECTURE +DIMENSIONS = 256 +SE_RATIO = 8 +TRUNK_DEPTH = 2 + +# OPTIMIZER +LEARNING_RATE = 1e-3 +WEIGHT_DECAY = 1e-4 + +# PRETRAINING +EPOCHS = 50 + +""" +## Load the CIFAR10 dataset +""" + +(x_train, y_train), (x_test, y_test) = keras.datasets.cifar10.load_data() +(x_train, y_train), (x_val, y_val) = ( + (x_train[:40000], y_train[:40000]), + (x_train[40000:], y_train[40000:]), +) +print(f"Training samples: {len(x_train)}") +print(f"Validation samples: {len(x_val)}") +print(f"Testing samples: {len(x_test)}") + +train_ds = tf_data.Dataset.from_tensor_slices((x_train, y_train)) +train_ds = train_ds.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(AUTO) + +val_ds = tf_data.Dataset.from_tensor_slices((x_val, y_val)) +val_ds = val_ds.batch(BATCH_SIZE).prefetch(AUTO) + +test_ds = tf_data.Dataset.from_tensor_slices((x_test, y_test)) +test_ds = test_ds.batch(BATCH_SIZE).prefetch(AUTO) + +""" +## Augmentation layers +""" + + +def get_preprocessing(): + model = keras.Sequential( + [ + layers.Rescaling(1 / 255.0), + layers.Resizing(IMAGE_SIZE, IMAGE_SIZE), + ], + name="preprocessing", + ) + return model + + +def get_train_augmentation_model(): + model = keras.Sequential( + [ + layers.Rescaling(1 / 255.0), + layers.Resizing(INPUT_SHAPE[0] + 20, INPUT_SHAPE[0] + 20), + layers.RandomCrop(IMAGE_SIZE, IMAGE_SIZE), + layers.RandomFlip("horizontal"), + ], + name="train_data_augmentation", + ) + return model + + +""" +## Convolutional stem + +The stem of the model is a lightweight preprocessing module that +maps images pixels to a set of vectors (patches). +""" + + +def build_convolutional_stem(dimensions): + """Build the convolutional stem. + + Args: + dimensions: The embedding dimension of the patches (d in paper). + + Returs: + The convolutional stem as a keras seqeuntial + model. + """ + config = { + "kernel_size": (3, 3), + "strides": (2, 2), + "activation": ops.gelu, + "padding": "same", + } + + convolutional_stem = keras.Sequential( + [ + layers.Conv2D(filters=dimensions // 2, **config), + layers.Conv2D(filters=dimensions, **config), + ], + name="convolutional_stem", + ) + + return convolutional_stem + + +""" +## Convolutional trunk + +The trunk of the model is the most compute-intesive part. It consists +of `N` stacked residual convolutional blocks. +""" + + +class SqueezeExcite(layers.Layer): + """Applies squeeze and excitation to input feature maps as seen in + https://arxiv.org/abs/1709.01507. + + Args: + ratio: The ratio with which the feature map needs to be reduced in + the reduction phase. + + Inputs: + Convolutional features. + + Outputs: + Attention modified feature maps. 
+ """ + + def __init__(self, ratio, **kwargs): + super().__init__(**kwargs) + self.ratio = ratio + + def get_config(self): + config = super().get_config() + config.update({"ratio": self.ratio}) + return config + + def build(self, input_shape): + filters = input_shape[-1] + self.squeeze = layers.GlobalAveragePooling2D(keepdims=True) + self.reduction = layers.Dense( + units=filters // self.ratio, + activation="relu", + use_bias=False, + ) + self.excite = layers.Dense(units=filters, activation="sigmoid", use_bias=False) + self.multiply = layers.Multiply() + + def call(self, x): + shortcut = x + x = self.squeeze(x) + x = self.reduction(x) + x = self.excite(x) + x = self.multiply([shortcut, x]) + return x + + +class Trunk(layers.Layer): + """Convolutional residual trunk as in the https://arxiv.org/abs/2112.13692 + + Args: + depth: Number of trunk residual blocks + dimensions: Dimnesion of the model (denoted by d in the paper) + ratio: The Squeeze-Excitation ratio + + Inputs: + Convolutional features extracted from the conv stem. + + Outputs: + Flattened patches. + """ + + def __init__(self, depth, dimensions, ratio, **kwargs): + super().__init__(**kwargs) + self.ratio = ratio + self.dimensions = dimensions + self.depth = depth + + def get_config(self): + config = super().get_config() + config.update( + { + "ratio": self.ratio, + "dimensions": self.dimensions, + "depth": self.depth, + } + ) + return config + + def build(self, input_shape): + config = { + "filters": self.dimensions, + "activation": ops.gelu, + "padding": "same", + } + + trunk_block = [ + layers.LayerNormalization(epsilon=1e-6), + layers.Conv2D(kernel_size=(1, 1), **config), + layers.Conv2D(kernel_size=(3, 3), **config), + SqueezeExcite(ratio=self.ratio), + layers.Conv2D(kernel_size=(1, 1), filters=self.dimensions, padding="same"), + ] + + self.trunk_blocks = [keras.Sequential(trunk_block) for _ in range(self.depth)] + self.add = layers.Add() + self.flatten_spatial = layers.Reshape((-1, self.dimensions)) + + def call(self, x): + # Remember the input. + shortcut = x + for trunk_block in self.trunk_blocks: + output = trunk_block(x) + shortcut = self.add([output, shortcut]) + x = shortcut + # Flatten the patches. + x = self.flatten_spatial(x) + return x + + +""" +## Attention Pooling + +The output of the convolutional trunk is attended with a trainable +_query_ class token. The resulting attention map is the weight of +every patch of the image for a classification decision. +""" + + +class AttentionPooling(layers.Layer): + """Applies attention to the patches extracted form the + trunk with the CLS token. + + Args: + dimensions: The dimension of the whole architecture. + num_classes: The number of classes in the dataset. + + Inputs: + Flattened patches from the trunk. + + Outputs: + The modifies CLS token. 
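+
+    Example (a sketch):
+
+    ```python
+    pool = AttentionPooling(dimensions=256, num_classes=10)
+    logits, attention_map = pool(ops.ones((2, 144, 256)))
+    # logits: (2, 10); attention_map: (2, 144), one weight per patch
+    ```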
+ """ + + def __init__(self, dimensions, num_classes, **kwargs): + super().__init__(**kwargs) + self.dimensions = dimensions + self.num_classes = num_classes + self.cls = keras.Variable(ops.zeros((1, 1, dimensions))) + + def get_config(self): + config = super().get_config() + config.update( + { + "dimensions": self.dimensions, + "num_classes": self.num_classes, + "cls": self.cls.numpy(), + } + ) + return config + + def build(self, input_shape): + self.attention = layers.MultiHeadAttention( + num_heads=1, + key_dim=self.dimensions, + dropout=0.2, + ) + self.layer_norm1 = layers.LayerNormalization(epsilon=1e-6) + self.layer_norm2 = layers.LayerNormalization(epsilon=1e-6) + self.layer_norm3 = layers.LayerNormalization(epsilon=1e-6) + self.mlp = keras.Sequential( + [ + layers.Dense(units=self.dimensions, activation=ops.gelu), + layers.Dropout(0.2), + layers.Dense(units=self.dimensions, activation=ops.gelu), + ] + ) + self.dense = layers.Dense(units=self.num_classes) + self.flatten = layers.Flatten() + + def call(self, x): + batch_size = ops.shape(x)[0] + # Expand the class token batch number of times. + class_token = ops.repeat(self.cls, repeats=batch_size, axis=0) + # Concat the input with the trainable class token. + x = ops.concatenate([class_token, x], axis=1) + # Apply attention to x. + x = self.layer_norm1(x) + x, viz_weights = self.attention( + query=x[:, 0:1], key=x, value=x, return_attention_scores=True + ) + class_token = class_token + x + class_token = self.layer_norm2(class_token) + class_token = self.flatten(class_token) + class_token = self.layer_norm3(class_token) + class_token = class_token + self.mlp(class_token) + # Build the logits + logits = self.dense(class_token) + return logits, ops.squeeze(viz_weights)[..., 1:] + + +""" +## Patch convnet + +The patch-convnet is shown in the figure below. + +| ![image model](https://i.imgur.com/NHiQeac.png) | +| :--: | +| [Source](https://arxiv.org/abs/2112.13692) | + +All the modules in the architecture are built in the earlier seciton. +In this section, we stack all of the different modules together. +""" + + +class PatchConvNet(keras.Model): + def __init__( + self, + stem, + trunk, + attention_pooling, + preprocessing_model, + train_augmentation_model, + **kwargs, + ): + super().__init__(**kwargs) + self.stem = stem + self.trunk = trunk + self.attention_pooling = attention_pooling + self.train_augmentation_model = train_augmentation_model + self.preprocessing_model = preprocessing_model + + def get_config(self): + config = super().get_config() + config.update( + { + "stem": self.stem, + "trunk": self.trunk, + "attention_pooling": self.attention_pooling, + "train_augmentation_model": self.train_augmentation_model, + "preprocessing_model": self.preprocessing_model, + } + ) + return config + + def _calculate_loss(self, inputs, test=False): + images, labels = inputs + # Augment the input images. + if test: + augmented_images = self.preprocessing_model(images) + else: + augmented_images = self.train_augmentation_model(images) + # Pass through the stem. + x = self.stem(augmented_images) + # Pass through the trunk. + x = self.trunk(x) + # Pass through the attention pooling block. + logits, _ = self.attention_pooling(x) + # Compute the total loss. + total_loss = self.compiled_loss(labels, logits) + return total_loss, logits + + def train_step(self, inputs): + with tf.GradientTape() as tape: + total_loss, logits = self._calculate_loss(inputs) + # Apply gradients. 
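+        # The stem, trunk and attention-pooling block are trained jointly:
+        # their variables are gathered below and flattened into a single
+        # (gradient, variable) list for one optimizer update.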
+ train_vars = [ + self.stem.trainable_variables, + self.trunk.trainable_variables, + self.attention_pooling.trainable_variables, + ] + grads = tape.gradient(total_loss, train_vars) + trainable_variable_list = [] + for grad, var in zip(grads, train_vars): + for g, v in zip(grad, var): + trainable_variable_list.append((g, v)) + self.optimizer.apply_gradients(trainable_variable_list) + # Report progress. + _, labels = inputs + self.compiled_metrics.update_state(labels, logits) + return {m.name: m.result() for m in self.metrics} + + def test_step(self, inputs): + total_loss, logits = self._calculate_loss(inputs, test=True) + # Report progress. + _, labels = inputs + self.compiled_metrics.update_state(labels, logits) + return {m.name: m.result() for m in self.metrics} + + def call(self, images): + # Augment the input images. + augmented_images = self.preprocessing_model(images) + # Pass through the stem. + x = self.stem(augmented_images) + # Pass through the trunk. + x = self.trunk(x) + # Pass through the attention pooling block. + logits, viz_weights = self.attention_pooling(x) + return logits, viz_weights + + +""" +## Callbacks + +This callback will plot the image and the attention map overlayed on +the image. +""" + +# Taking a batch of test inputs to measure model's progress. +test_images, test_labels = next(iter(test_ds)) + + +class TrainMonitor(keras.callbacks.Callback): + def __init__(self, epoch_interval=None): + self.epoch_interval = epoch_interval + + def on_epoch_end(self, epoch, logs=None): + if self.epoch_interval and epoch % self.epoch_interval == 4: + test_augmented_images = self.model.preprocessing_model(test_images) + # Pass through the stem. + test_x = self.model.stem(test_augmented_images) + # Pass through the trunk. + test_x = self.model.trunk(test_x) + # Pass through the attention pooling block. + _, test_viz_weights = self.model.attention_pooling(test_x) + # Reshape the vizualization weights + num_patches = ops.shape(test_viz_weights)[-1] + height = width = int(math.sqrt(num_patches)) + test_viz_weights = layers.Reshape((height, width))(test_viz_weights) + # Take a random image and its attention weights. + index = np.random.randint(low=0, high=ops.shape(test_augmented_images)[0]) + selected_image = test_augmented_images[index] + selected_weight = test_viz_weights[index] + # Plot the images and the overlayed attention map. 
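+            # The (12, 12) attention map is stretched over the 48x48
+            # augmented image via `extent`, so no explicit resizing is needed.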
+ fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 5)) + ax[0].imshow(selected_image) + ax[0].set_title(f"Original: {epoch:03d}") + ax[0].axis("off") + img = ax[1].imshow(selected_image) + ax[1].imshow( + selected_weight, cmap="inferno", alpha=0.6, extent=img.get_extent() + ) + ax[1].set_title(f"Attended: {epoch:03d}") + ax[1].axis("off") + plt.axis("off") + plt.show() + plt.close() + + +""" +## Learning rate schedule +""" + + +class WarmUpCosine(keras.optimizers.schedules.LearningRateSchedule): + def __init__( + self, learning_rate_base, total_steps, warmup_learning_rate, warmup_steps + ): + super().__init__() + self.learning_rate_base = learning_rate_base + self.total_steps = total_steps + self.warmup_learning_rate = warmup_learning_rate + self.warmup_steps = warmup_steps + self.pi = np.pi + + def __call__(self, step): + if self.total_steps < self.warmup_steps: + raise ValueError("Total_steps must be larger or equal to warmup_steps.") + cos_annealed_lr = ops.cos( + self.pi + * (ops.cast(step, "float32") - self.warmup_steps) + / float(self.total_steps - self.warmup_steps) + ) + learning_rate = 0.5 * self.learning_rate_base * (1 + cos_annealed_lr) + if self.warmup_steps > 0: + if self.learning_rate_base < self.warmup_learning_rate: + raise ValueError( + "Learning_rate_base must be larger or equal to " + "warmup_learning_rate." + ) + slope = ( + self.learning_rate_base - self.warmup_learning_rate + ) / self.warmup_steps + warmup_rate = slope * ops.cast(step, "float32") + self.warmup_learning_rate + learning_rate = ops.where( + step < self.warmup_steps, warmup_rate, learning_rate + ) + return ops.where( + step > self.total_steps, + 0.0, + learning_rate, + ) + + +total_steps = int((len(x_train) / BATCH_SIZE) * EPOCHS) +warmup_epoch_percentage = 0.15 +warmup_steps = int(total_steps * warmup_epoch_percentage) +scheduled_lrs = WarmUpCosine( + learning_rate_base=LEARNING_RATE, + total_steps=total_steps, + warmup_learning_rate=0.0, + warmup_steps=warmup_steps, +) + +""" +## Training + +We build the model, compile it, and train it. +""" + +train_augmentation_model = get_train_augmentation_model() +preprocessing_model = get_preprocessing() +conv_stem = build_convolutional_stem(dimensions=DIMENSIONS) +conv_trunk = Trunk(depth=TRUNK_DEPTH, dimensions=DIMENSIONS, ratio=SE_RATIO) +attention_pooling = AttentionPooling(dimensions=DIMENSIONS, num_classes=NUM_CLASSES) + +patch_conv_net = PatchConvNet( + stem=conv_stem, + trunk=conv_trunk, + attention_pooling=attention_pooling, + train_augmentation_model=train_augmentation_model, + preprocessing_model=preprocessing_model, +) + +# Assemble the callbacks. +train_callbacks = [TrainMonitor(epoch_interval=5)] +# Get the optimizer. +optimizer = keras.optimizers.AdamW( + learning_rate=scheduled_lrs, weight_decay=WEIGHT_DECAY +) +# Compile and pretrain the model. +patch_conv_net.compile( + optimizer=optimizer, + loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), + metrics=[ + keras.metrics.SparseCategoricalAccuracy(name="accuracy"), + keras.metrics.SparseTopKCategoricalAccuracy(5, name="top-5-accuracy"), + ], +) +history = patch_conv_net.fit( + train_ds, + epochs=EPOCHS, + validation_data=val_ds, + callbacks=train_callbacks, +) + +# Evaluate the model with the test dataset. 
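+# `evaluate` returns the loss followed by the compiled metrics, in order:
+# top-1 accuracy and top-5 accuracy.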
+loss, acc_top1, acc_top5 = patch_conv_net.evaluate(test_ds) +print(f"Loss: {loss:0.2f}") +print(f"Top 1 test accuracy: {acc_top1*100:0.2f}%") +print(f"Top 5 test accuracy: {acc_top5*100:0.2f}%") + +""" +## Inference + +Here, we use the trained model to plot the attention map. +""" + + +def plot_attention(image): + """Plots the attention map on top of the image. + + Args: + image: A numpy image of arbitrary size. + """ + # Resize the image to a (32, 32) dim. + image = ops.image.resize(image, (32, 32)) + image = image[np.newaxis, ...] + test_augmented_images = patch_conv_net.preprocessing_model(image) + # Pass through the stem. + test_x = patch_conv_net.stem(test_augmented_images) + # Pass through the trunk. + test_x = patch_conv_net.trunk(test_x) + # Pass through the attention pooling block. + _, test_viz_weights = patch_conv_net.attention_pooling(test_x) + test_viz_weights = test_viz_weights[np.newaxis, ...] + # Reshape the vizualization weights. + num_patches = ops.shape(test_viz_weights)[-1] + height = width = int(math.sqrt(num_patches)) + test_viz_weights = layers.Reshape((height, width))(test_viz_weights) + selected_image = test_augmented_images[0] + selected_weight = test_viz_weights[0] + # Plot the images. + fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 5)) + ax[0].imshow(selected_image) + ax[0].set_title(f"Original") + ax[0].axis("off") + img = ax[1].imshow(selected_image) + ax[1].imshow(selected_weight, cmap="inferno", alpha=0.6, extent=img.get_extent()) + ax[1].set_title(f"Attended") + ax[1].axis("off") + plt.axis("off") + plt.show() + plt.close() + + +url = "http://farm9.staticflickr.com/8017/7140384795_385b1f48df_z.jpg" +image_name = keras.utils.get_file(fname="image.jpg", origin=url) +image = keras.utils.load_img(image_name) +image = keras.utils.img_to_array(image) +plot_attention(image) + +""" +## Conclusions + +The attention map corresponding to the trainable `CLASS` +token and the patches of the image helps explain the classificaiton +decision. One should also note that the attention maps gradually get +better. In the initial training regime, the attention is scattered all +around while at a later stage, it focuses more on the objects of the +image. + +The non-pyramidal convnet achieves an accuracy of ~84-85% top-1 test +accuracy. + +I would like to thank [JarvisLabs.ai](https://jarvislabs.ai/) for +providing GPU credits for this project. +""" diff --git a/knowledge_base/vision/perceiver_image_classification.py b/knowledge_base/vision/perceiver_image_classification.py new file mode 100644 index 0000000000000000000000000000000000000000..95428396d961de8ea3afcaa427c15f58f7d703b3 --- /dev/null +++ b/knowledge_base/vision/perceiver_image_classification.py @@ -0,0 +1,469 @@ +""" +Title: Image classification with Perceiver +Author: [Khalid Salama](https://www.linkedin.com/in/khalid-salama-24403144/) +Date created: 2021/04/30 +Last modified: 2023/12/30 +Description: Implementing the Perceiver model for image classification. +Accelerator: GPU +""" + +""" +## Introduction + +This example implements the +[Perceiver: General Perception with Iterative Attention](https://arxiv.org/abs/2103.03206) +model by Andrew Jaegle et al. for image classification, +and demonstrates it on the CIFAR-100 dataset. + +The Perceiver model leverages an asymmetric attention mechanism to iteratively +distill inputs into a tight latent bottleneck, +allowing it to scale to handle very large inputs. + +In other words: let's assume that your input data array (e.g. image) has `M` elements (i.e. 
patches), where `M` is large. +In a standard Transformer model, a self-attention operation is performed for the `M` elements. +The complexity of this operation is `O(M^2)`. +However, the Perceiver model creates a latent array of size `N` elements, where `N << M`, +and performs two operations iteratively: + +1. Cross-attention Transformer between the latent array and the data array - The complexity of this operation is `O(M.N)`. +2. Self-attention Transformer on the latent array - The complexity of this operation is `O(N^2)`. + +This example requires Keras 3.0 or higher. +""" + +""" +## Setup +""" + +import keras +from keras import layers, activations, ops + +""" +## Prepare the data +""" + +num_classes = 100 +input_shape = (32, 32, 3) + +(x_train, y_train), (x_test, y_test) = keras.datasets.cifar100.load_data() + +print(f"x_train shape: {x_train.shape} - y_train shape: {y_train.shape}") +print(f"x_test shape: {x_test.shape} - y_test shape: {y_test.shape}") + +""" +## Configure the hyperparameters +""" + +learning_rate = 0.001 +weight_decay = 0.0001 +batch_size = 64 +num_epochs = 2 # It is recommended to run 50 epochs to observe improvements in accuracy +dropout_rate = 0.2 +image_size = 64 # We'll resize input images to this size. +patch_size = 2 # Size of the patches to be extract from the input images. +num_patches = (image_size // patch_size) ** 2 # Size of the data array. +latent_dim = 256 # Size of the latent array. +projection_dim = 256 # Embedding size of each element in the data and latent arrays. +num_heads = 8 # Number of Transformer heads. +ffn_units = [ + projection_dim, + projection_dim, +] # Size of the Transformer Feedforward network. +num_transformer_blocks = 4 +num_iterations = 2 # Repetitions of the cross-attention and Transformer modules. +classifier_units = [ + projection_dim, + num_classes, +] # Size of the Feedforward network of the final classifier. + +print(f"Image size: {image_size} X {image_size} = {image_size ** 2}") +print(f"Patch size: {patch_size} X {patch_size} = {patch_size ** 2} ") +print(f"Patches per image: {num_patches}") +print(f"Elements per patch (3 channels): {(patch_size ** 2) * 3}") +print(f"Latent array shape: {latent_dim} X {projection_dim}") +print(f"Data array shape: {num_patches} X {projection_dim}") + +""" +Note that, in order to use each pixel as an individual input in the data array, +set `patch_size` to 1. +""" + +""" +## Use data augmentation +""" + +data_augmentation = keras.Sequential( + [ + layers.Normalization(), + layers.Resizing(image_size, image_size), + layers.RandomFlip("horizontal"), + layers.RandomZoom(height_factor=0.2, width_factor=0.2), + ], + name="data_augmentation", +) +# Compute the mean and the variance of the training data for normalization. 
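+# `adapt` must be called before training so that the Normalization layer's
+# mean and variance are estimated from the raw training images.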
+data_augmentation.layers[0].adapt(x_train) + +""" +## Implement Feedforward network (FFN) +""" + + +def create_ffn(hidden_units, dropout_rate): + ffn_layers = [] + for units in hidden_units[:-1]: + ffn_layers.append(layers.Dense(units, activation=activations.gelu)) + + ffn_layers.append(layers.Dense(units=hidden_units[-1])) + ffn_layers.append(layers.Dropout(dropout_rate)) + + ffn = keras.Sequential(ffn_layers) + return ffn + + +""" +## Implement patch creation as a layer +""" + + +class Patches(layers.Layer): + def __init__(self, patch_size): + super().__init__() + self.patch_size = patch_size + + def call(self, images): + batch_size = ops.shape(images)[0] + patches = ops.image.extract_patches( + images=images, + size=(self.patch_size, self.patch_size), + strides=(self.patch_size, self.patch_size), + dilation_rate=1, + padding="valid", + ) + patch_dims = patches.shape[-1] + patches = ops.reshape(patches, [batch_size, -1, patch_dims]) + return patches + + +""" +## Implement the patch encoding layer + +The `PatchEncoder` layer will linearly transform a patch by projecting it into +a vector of size `latent_dim`. In addition, it adds a learnable position embedding +to the projected vector. + +Note that the orginal Perceiver paper uses the Fourier feature positional encodings. +""" + + +class PatchEncoder(layers.Layer): + def __init__(self, num_patches, projection_dim): + super().__init__() + self.num_patches = num_patches + self.projection = layers.Dense(units=projection_dim) + self.position_embedding = layers.Embedding( + input_dim=num_patches, output_dim=projection_dim + ) + + def call(self, patches): + positions = ops.arange(start=0, stop=self.num_patches, step=1) + encoded = self.projection(patches) + self.position_embedding(positions) + return encoded + + +""" +## Build the Perceiver model + +The Perceiver consists of two modules: a cross-attention +module and a standard Transformer with self-attention. +""" + +""" +### Cross-attention module + +The cross-attention expects a `(latent_dim, projection_dim)` latent array, +and the `(data_dim, projection_dim)` data array as inputs, +to produce a `(latent_dim, projection_dim)` latent array as an output. +To apply cross-attention, the `query` vectors are generated from the latent array, +while the `key` and `value` vectors are generated from the encoded image. + +Note that the data array in this example is the image, +where the `data_dim` is set to the `num_patches`. +""" + + +def create_cross_attention_module( + latent_dim, data_dim, projection_dim, ffn_units, dropout_rate +): + inputs = { + # Recieve the latent array as an input of shape [1, latent_dim, projection_dim]. + "latent_array": layers.Input( + shape=(latent_dim, projection_dim), name="latent_array" + ), + # Recieve the data_array (encoded image) as an input of shape [batch_size, data_dim, projection_dim]. + "data_array": layers.Input(shape=(data_dim, projection_dim), name="data_array"), + } + + # Apply layer norm to the inputs + latent_array = layers.LayerNormalization(epsilon=1e-6)(inputs["latent_array"]) + data_array = layers.LayerNormalization(epsilon=1e-6)(inputs["data_array"]) + + # Create query tensor: [1, latent_dim, projection_dim]. + query = layers.Dense(units=projection_dim)(latent_array) + # Create key tensor: [batch_size, data_dim, projection_dim]. + key = layers.Dense(units=projection_dim)(data_array) + # Create value tensor: [batch_size, data_dim, projection_dim]. 
+ value = layers.Dense(units=projection_dim)(data_array) + + # Generate cross-attention outputs: [batch_size, latent_dim, projection_dim]. + attention_output = layers.Attention(use_scale=True, dropout=0.1)( + [query, key, value], return_attention_scores=False + ) + # Skip connection 1. + attention_output = layers.Add()([attention_output, latent_array]) + + # Apply layer norm. + attention_output = layers.LayerNormalization(epsilon=1e-6)(attention_output) + # Apply Feedforward network. + ffn = create_ffn(hidden_units=ffn_units, dropout_rate=dropout_rate) + outputs = ffn(attention_output) + # Skip connection 2. + outputs = layers.Add()([outputs, attention_output]) + + # Create the Keras model. + model = keras.Model(inputs=inputs, outputs=outputs) + return model + + +""" +### Transformer module + +The Transformer expects the output latent vector from the cross-attention module +as an input, applies multi-head self-attention to its `latent_dim` elements, +followed by feedforward network, to produce another `(latent_dim, projection_dim)` latent array. +""" + + +def create_transformer_module( + latent_dim, + projection_dim, + num_heads, + num_transformer_blocks, + ffn_units, + dropout_rate, +): + # input_shape: [1, latent_dim, projection_dim] + inputs = layers.Input(shape=(latent_dim, projection_dim)) + + x0 = inputs + # Create multiple layers of the Transformer block. + for _ in range(num_transformer_blocks): + # Apply layer normalization 1. + x1 = layers.LayerNormalization(epsilon=1e-6)(x0) + # Create a multi-head self-attention layer. + attention_output = layers.MultiHeadAttention( + num_heads=num_heads, key_dim=projection_dim, dropout=0.1 + )(x1, x1) + # Skip connection 1. + x2 = layers.Add()([attention_output, x0]) + # Apply layer normalization 2. + x3 = layers.LayerNormalization(epsilon=1e-6)(x2) + # Apply Feedforward network. + ffn = create_ffn(hidden_units=ffn_units, dropout_rate=dropout_rate) + x3 = ffn(x3) + # Skip connection 2. + x0 = layers.Add()([x3, x2]) + + # Create the Keras model. + model = keras.Model(inputs=inputs, outputs=x0) + return model + + +""" +### Perceiver model + +The Perceiver model repeats the cross-attention and Transformer modules +`num_iterations` timesโ€”with shared weights and skip connectionsโ€”to allow +the latent array to iteratively extract information from the input image as it is needed. +""" + + +class Perceiver(keras.Model): + def __init__( + self, + patch_size, + data_dim, + latent_dim, + projection_dim, + num_heads, + num_transformer_blocks, + ffn_units, + dropout_rate, + num_iterations, + classifier_units, + ): + super().__init__() + + self.latent_dim = latent_dim + self.data_dim = data_dim + self.patch_size = patch_size + self.projection_dim = projection_dim + self.num_heads = num_heads + self.num_transformer_blocks = num_transformer_blocks + self.ffn_units = ffn_units + self.dropout_rate = dropout_rate + self.num_iterations = num_iterations + self.classifier_units = classifier_units + + def build(self, input_shape): + # Create latent array. + self.latent_array = self.add_weight( + shape=(self.latent_dim, self.projection_dim), + initializer="random_normal", + trainable=True, + ) + + # Create patching module. + self.patcher = Patches(self.patch_size) + + # Create patch encoder. + self.patch_encoder = PatchEncoder(self.data_dim, self.projection_dim) + + # Create cross-attenion module. 
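+        # (Built once and reused at every iteration in `call`, so the
+        # cross-attention weights are shared across iterations.)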
+ self.cross_attention = create_cross_attention_module( + self.latent_dim, + self.data_dim, + self.projection_dim, + self.ffn_units, + self.dropout_rate, + ) + + # Create Transformer module. + self.transformer = create_transformer_module( + self.latent_dim, + self.projection_dim, + self.num_heads, + self.num_transformer_blocks, + self.ffn_units, + self.dropout_rate, + ) + + # Create global average pooling layer. + self.global_average_pooling = layers.GlobalAveragePooling1D() + + # Create a classification head. + self.classification_head = create_ffn( + hidden_units=self.classifier_units, dropout_rate=self.dropout_rate + ) + + super().build(input_shape) + + def call(self, inputs): + # Augment data. + augmented = data_augmentation(inputs) + # Create patches. + patches = self.patcher(augmented) + # Encode patches. + encoded_patches = self.patch_encoder(patches) + # Prepare cross-attention inputs. + cross_attention_inputs = { + "latent_array": ops.expand_dims(self.latent_array, 0), + "data_array": encoded_patches, + } + # Apply the cross-attention and the Transformer modules iteratively. + for _ in range(self.num_iterations): + # Apply cross-attention from the latent array to the data array. + latent_array = self.cross_attention(cross_attention_inputs) + # Apply self-attention Transformer to the latent array. + latent_array = self.transformer(latent_array) + # Set the latent array of the next iteration. + cross_attention_inputs["latent_array"] = latent_array + + # Apply global average pooling to generate a [batch_size, projection_dim] repesentation tensor. + representation = self.global_average_pooling(latent_array) + # Generate logits. + logits = self.classification_head(representation) + return logits + + +""" +## Compile, train, and evaluate the mode +""" + + +def run_experiment(model): + # Create ADAM instead of LAMB optimizer with weight decay. (LAMB isn't supported yet) + optimizer = keras.optimizers.Adam(learning_rate=learning_rate) + + # Compile the model. + model.compile( + optimizer=optimizer, + loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), + metrics=[ + keras.metrics.SparseCategoricalAccuracy(name="acc"), + keras.metrics.SparseTopKCategoricalAccuracy(5, name="top5-acc"), + ], + ) + + # Create a learning rate scheduler callback. + reduce_lr = keras.callbacks.ReduceLROnPlateau( + monitor="val_loss", factor=0.2, patience=3 + ) + + # Create an early stopping callback. + early_stopping = keras.callbacks.EarlyStopping( + monitor="val_loss", patience=15, restore_best_weights=True + ) + + # Fit the model. + history = model.fit( + x=x_train, + y=y_train, + batch_size=batch_size, + epochs=num_epochs, + validation_split=0.1, + callbacks=[early_stopping, reduce_lr], + ) + + _, accuracy, top_5_accuracy = model.evaluate(x_test, y_test) + print(f"Test accuracy: {round(accuracy * 100, 2)}%") + print(f"Test top 5 accuracy: {round(top_5_accuracy * 100, 2)}%") + + # Return history to plot learning curves. + return history + + +""" +Note that training the perceiver model with the current settings on a V100 GPUs takes +around 200 seconds. +""" + +perceiver_classifier = Perceiver( + patch_size, + num_patches, + latent_dim, + projection_dim, + num_heads, + num_transformer_blocks, + ffn_units, + dropout_rate, + num_iterations, + classifier_units, +) + + +history = run_experiment(perceiver_classifier) + +""" +After 40 epochs, the Perceiver model achieves around 53% accuracy and 81% top-5 accuracy on the test data. 
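+
+You can also plot the learning curves from the `history` object returned by
+`run_experiment`; a minimal sketch (assuming `matplotlib` is available, and
+using the `acc` metric name defined above) would be:
+
+```python
+import matplotlib.pyplot as plt
+
+plt.plot(history.history["acc"], label="train top-1 accuracy")
+plt.plot(history.history["val_acc"], label="validation top-1 accuracy")
+plt.xlabel("Epoch")
+plt.ylabel("Accuracy")
+plt.legend()
+plt.show()
+```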
+ +As mentioned in the ablations of the [Perceiver](https://arxiv.org/abs/2103.03206) paper, +you can obtain better results by increasing the latent array size, +increasing the (projection) dimensions of the latent array and data array elements, +increasing the number of blocks in the Transformer module, and increasing the number of iterations of applying +the cross-attention and the latent Transformer modules. You may also try to increase the size the input images +and use different patch sizes. + +The Perceiver benefits from inceasing the model size. However, larger models needs bigger accelerators +to fit in and train efficiently. This is why in the Perceiver paper they used 32 TPU cores to run the experiments. +""" diff --git a/knowledge_base/vision/pointnet.py b/knowledge_base/vision/pointnet.py new file mode 100644 index 0000000000000000000000000000000000000000..af637f41c32640916a9dd47dc2d3496a7ded8e65 --- /dev/null +++ b/knowledge_base/vision/pointnet.py @@ -0,0 +1,300 @@ +""" +Title: Point cloud classification with PointNet +Author: [David Griffiths](https://dgriffiths3.github.io) +Date created: 2020/05/25 +Last modified: 2024/01/09 +Description: Implementation of PointNet for ModelNet10 classification. +Accelerator: GPU +""" + +""" +# Point cloud classification +""" + +""" +## Introduction + +Classification, detection and segmentation of unordered 3D point sets i.e. point clouds +is a core problem in computer vision. This example implements the seminal point cloud +deep learning paper [PointNet (Qi et al., 2017)](https://arxiv.org/abs/1612.00593). For a +detailed intoduction on PointNet see [this blog +post](https://medium.com/@luis_gonzales/an-in-depth-look-at-pointnet-111d7efdaa1a). +""" + +""" +## Setup + +If using colab first install trimesh with `!pip install trimesh`. +""" + + +import os +import glob +import trimesh +import numpy as np +from tensorflow import data as tf_data +from keras import ops +import keras +from keras import layers +from matplotlib import pyplot as plt + +keras.utils.set_random_seed(seed=42) + +""" +## Load dataset + +We use the ModelNet10 model dataset, the smaller 10 class version of the ModelNet40 +dataset. First download the data: +""" + +DATA_DIR = keras.utils.get_file( + "modelnet.zip", + "http://3dvision.princeton.edu/projects/2014/3DShapeNets/ModelNet10.zip", + extract=True, +) +DATA_DIR = os.path.join(os.path.dirname(DATA_DIR), "ModelNet10") + +""" +We can use the `trimesh` package to read and visualize the `.off` mesh files. +""" + +mesh = trimesh.load(os.path.join(DATA_DIR, "chair/train/chair_0001.off")) +mesh.show() + +""" +To convert a mesh file to a point cloud we first need to sample points on the mesh +surface. `.sample()` performs a uniform random sampling. Here we sample at 2048 locations +and visualize in `matplotlib`. +""" + +points = mesh.sample(2048) + +fig = plt.figure(figsize=(5, 5)) +ax = fig.add_subplot(111, projection="3d") +ax.scatter(points[:, 0], points[:, 1], points[:, 2]) +ax.set_axis_off() +plt.show() + +""" +To generate a `tf.data.Dataset()` we need to first parse through the ModelNet data +folders. Each mesh is loaded and sampled into a point cloud before being added to a +standard python list and converted to a `numpy` array. We also store the current +enumerate index value as the object label and use a dictionary to recall this later. 
+""" + + +def parse_dataset(num_points=2048): + train_points = [] + train_labels = [] + test_points = [] + test_labels = [] + class_map = {} + folders = glob.glob(os.path.join(DATA_DIR, "[!README]*")) + + for i, folder in enumerate(folders): + print("processing class: {}".format(os.path.basename(folder))) + # store folder name with ID so we can retrieve later + class_map[i] = folder.split("/")[-1] + # gather all files + train_files = glob.glob(os.path.join(folder, "train/*")) + test_files = glob.glob(os.path.join(folder, "test/*")) + + for f in train_files: + train_points.append(trimesh.load(f).sample(num_points)) + train_labels.append(i) + + for f in test_files: + test_points.append(trimesh.load(f).sample(num_points)) + test_labels.append(i) + + return ( + np.array(train_points), + np.array(test_points), + np.array(train_labels), + np.array(test_labels), + class_map, + ) + + +""" +Set the number of points to sample and batch size and parse the dataset. This can take +~5minutes to complete. +""" + +NUM_POINTS = 2048 +NUM_CLASSES = 10 +BATCH_SIZE = 32 + +train_points, test_points, train_labels, test_labels, CLASS_MAP = parse_dataset( + NUM_POINTS +) + +""" +Our data can now be read into a `tf.data.Dataset()` object. We set the shuffle buffer +size to the entire size of the dataset as prior to this the data is ordered by class. +Data augmentation is important when working with point cloud data. We create a +augmentation function to jitter and shuffle the train dataset. +""" + + +def augment(points, label): + # jitter points + points += keras.random.uniform(points.shape, -0.005, 0.005, dtype="float64") + # shuffle points + points = keras.random.shuffle(points) + return points, label + + +train_size = 0.8 +dataset = tf_data.Dataset.from_tensor_slices((train_points, train_labels)) +test_dataset = tf_data.Dataset.from_tensor_slices((test_points, test_labels)) +train_dataset_size = int(len(dataset) * train_size) + +dataset = dataset.shuffle(len(train_points)).map(augment) +test_dataset = test_dataset.shuffle(len(test_points)).batch(BATCH_SIZE) + +train_dataset = dataset.take(train_dataset_size).batch(BATCH_SIZE) +validation_dataset = dataset.skip(train_dataset_size).batch(BATCH_SIZE) + +""" +### Build a model + +Each convolution and fully-connected layer (with exception for end layers) consists of +Convolution / Dense -> Batch Normalization -> ReLU Activation. +""" + + +def conv_bn(x, filters): + x = layers.Conv1D(filters, kernel_size=1, padding="valid")(x) + x = layers.BatchNormalization(momentum=0.0)(x) + return layers.Activation("relu")(x) + + +def dense_bn(x, filters): + x = layers.Dense(filters)(x) + x = layers.BatchNormalization(momentum=0.0)(x) + return layers.Activation("relu")(x) + + +""" +PointNet consists of two core components. The primary MLP network, and the transformer +net (T-net). The T-net aims to learn an affine transformation matrix by its own mini +network. The T-net is used twice. The first time to transform the input features (n, 3) +into a canonical representation. The second is an affine transformation for alignment in +feature space (n, 3). As per the original paper we constrain the transformation to be +close to an orthogonal matrix (i.e. ||X*X^T - I|| = 0). 
+""" + + +class OrthogonalRegularizer(keras.regularizers.Regularizer): + def __init__(self, num_features, l2reg=0.001): + self.num_features = num_features + self.l2reg = l2reg + self.eye = ops.eye(num_features) + + def __call__(self, x): + x = ops.reshape(x, (-1, self.num_features, self.num_features)) + xxt = ops.tensordot(x, x, axes=(2, 2)) + xxt = ops.reshape(xxt, (-1, self.num_features, self.num_features)) + return ops.sum(self.l2reg * ops.square(xxt - self.eye)) + + +""" + We can then define a general function to build T-net layers. +""" + + +def tnet(inputs, num_features): + # Initialise bias as the identity matrix + bias = keras.initializers.Constant(np.eye(num_features).flatten()) + reg = OrthogonalRegularizer(num_features) + + x = conv_bn(inputs, 32) + x = conv_bn(x, 64) + x = conv_bn(x, 512) + x = layers.GlobalMaxPooling1D()(x) + x = dense_bn(x, 256) + x = dense_bn(x, 128) + x = layers.Dense( + num_features * num_features, + kernel_initializer="zeros", + bias_initializer=bias, + activity_regularizer=reg, + )(x) + feat_T = layers.Reshape((num_features, num_features))(x) + # Apply affine transformation to input features + return layers.Dot(axes=(2, 1))([inputs, feat_T]) + + +""" +The main network can be then implemented in the same manner where the t-net mini models +can be dropped in a layers in the graph. Here we replicate the network architecture +published in the original paper but with half the number of weights at each layer as we +are using the smaller 10 class ModelNet dataset. +""" + +inputs = keras.Input(shape=(NUM_POINTS, 3)) + +x = tnet(inputs, 3) +x = conv_bn(x, 32) +x = conv_bn(x, 32) +x = tnet(x, 32) +x = conv_bn(x, 32) +x = conv_bn(x, 64) +x = conv_bn(x, 512) +x = layers.GlobalMaxPooling1D()(x) +x = dense_bn(x, 256) +x = layers.Dropout(0.3)(x) +x = dense_bn(x, 128) +x = layers.Dropout(0.3)(x) + +outputs = layers.Dense(NUM_CLASSES, activation="softmax")(x) + +model = keras.Model(inputs=inputs, outputs=outputs, name="pointnet") +model.summary() + +""" +### Train model + +Once the model is defined it can be trained like any other standard classification model +using `.compile()` and `.fit()`. +""" + +model.compile( + loss="sparse_categorical_crossentropy", + optimizer=keras.optimizers.Adam(learning_rate=0.001), + metrics=["sparse_categorical_accuracy"], +) + +model.fit(train_dataset, epochs=20, validation_data=validation_dataset) + +""" +## Visualize predictions + +We can use matplotlib to visualize our trained model performance. +""" + +data = test_dataset.take(1) + +points, labels = list(data)[0] +points = points[:8, ...] +labels = labels[:8, ...] 
+ +# run test data through model +preds = model.predict(points) +preds = ops.argmax(preds, -1) + +points = points.numpy() + +# plot points with predicted class and label +fig = plt.figure(figsize=(15, 10)) +for i in range(8): + ax = fig.add_subplot(2, 4, i + 1, projection="3d") + ax.scatter(points[i, :, 0], points[i, :, 1], points[i, :, 2]) + ax.set_title( + "pred: {:}, label: {:}".format( + CLASS_MAP[preds[i].numpy()], CLASS_MAP[labels.numpy()[i]] + ) + ) + ax.set_axis_off() +plt.show() diff --git a/knowledge_base/vision/pointnet_segmentation.py b/knowledge_base/vision/pointnet_segmentation.py new file mode 100644 index 0000000000000000000000000000000000000000..9a66f42c57707b9ecf0257dbe0a0cdce4e8f1297 --- /dev/null +++ b/knowledge_base/vision/pointnet_segmentation.py @@ -0,0 +1,609 @@ +""" +Title: Point cloud segmentation with PointNet +Author: [Soumik Rakshit](https://github.com/soumik12345), [Sayak Paul](https://github.com/sayakpaul) +Date created: 2020/10/23 +Last modified: 2020/10/24 +Description: Implementation of a PointNet-based model for segmenting point clouds. +Accelerator: GPU +""" + +""" +## Introduction + +A "point cloud" is an important type of data structure for storing geometric shape data. +Due to its irregular format, it's often transformed into +regular 3D voxel grids or collections of images before being used in deep learning applications, +a step which makes the data unnecessarily large. +The PointNet family of models solves this problem by directly consuming point clouds, respecting +the permutation-invariance property of the point data. The PointNet family of +models provides a simple, unified architecture +for applications ranging from **object classification**, **part segmentation**, to +**scene semantic parsing**. + +In this example, we demonstrate the implementation of the PointNet architecture +for shape segmentation. + +### References + +- [PointNet: Deep Learning on Point Sets for 3D Classification and Segmentation](https://arxiv.org/abs/1612.00593) +- [Point cloud classification with PointNet](https://keras.io/examples/vision/pointnet/) +- [Spatial Transformer Networks](https://arxiv.org/abs/1506.02025) +""" + +""" +## Imports +""" + +import os +import json +import random +import numpy as np +import pandas as pd +from tqdm import tqdm +from glob import glob + +import tensorflow as tf # For tf.data +import keras +from keras import layers + +import matplotlib.pyplot as plt + +""" +## Downloading Dataset + +The [ShapeNet dataset](https://shapenet.org/) is an ongoing effort to establish a richly-annotated, +large-scale dataset of 3D shapes. **ShapeNetCore** is a subset of the full ShapeNet +dataset with clean single 3D models and manually verified category and alignment +annotations. It covers 55 common object categories, with about 51,300 unique 3D models. + +For this example, we use one of the 12 object categories of +[PASCAL 3D+](http://cvgl.stanford.edu/projects/pascal3d.html), +included as part of the ShapenetCore dataset. +""" + +dataset_url = "https://git.io/JiY4i" + +dataset_path = keras.utils.get_file( + fname="shapenet.zip", + origin=dataset_url, + cache_subdir="datasets", + hash_algorithm="auto", + extract=True, + archive_format="auto", + cache_dir="datasets", +) + +""" +## Loading the dataset + +We parse the dataset metadata in order to easily map model categories to their +respective directories and segmentation classes to colors for the purpose of +visualization. 
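+
+Concretely, the code below only relies on the metadata being a nested
+dictionary with a `directory`, a `lables` list and a `colors` list per
+category, along the lines of this purely illustrative (made-up) entry:
+
+```python
+{
+    "Airplane": {
+        "directory": "<points folder id>",
+        "lables": ["<part name>", "..."],       # note the spelling used by the file
+        "colors": ["<matplotlib color>", "..."],
+    },
+    # ... one entry per category
+}
+```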
+""" + +with open("/tmp/.keras/datasets/PartAnnotation/metadata.json") as json_file: + metadata = json.load(json_file) + +print(metadata) + +""" +In this example, we train PointNet to segment the parts of an `Airplane` model. +""" + +points_dir = "/tmp/.keras/datasets/PartAnnotation/{}/points".format( + metadata["Airplane"]["directory"] +) +labels_dir = "/tmp/.keras/datasets/PartAnnotation/{}/points_label".format( + metadata["Airplane"]["directory"] +) +LABELS = metadata["Airplane"]["lables"] +COLORS = metadata["Airplane"]["colors"] + +VAL_SPLIT = 0.2 +NUM_SAMPLE_POINTS = 1024 +BATCH_SIZE = 32 +EPOCHS = 60 +INITIAL_LR = 1e-3 + +""" +## Structuring the dataset + +We generate the following in-memory data structures from the Airplane point clouds and +their labels: + +- `point_clouds` is a list of `np.array` objects that represent the point cloud data in +the form of x, y and z coordinates. Axis 0 represents the number of points in the +point cloud, while axis 1 represents the coordinates. `all_labels` is the list +that represents the label of each coordinate as a string (needed mainly for +visualization purposes). +- `test_point_clouds` is in the same format as `point_clouds`, but doesn't have +corresponding the labels of the point clouds. +- `all_labels` is a list of `np.array` objects that represent the point cloud labels +for each coordinate, corresponding to the `point_clouds` list. +- `point_cloud_labels` is a list of `np.array` objects that represent the point cloud +labels for each coordinate in one-hot encoded form, corresponding to the `point_clouds` +list. +""" + +point_clouds, test_point_clouds = [], [] +point_cloud_labels, all_labels = [], [] + +points_files = glob(os.path.join(points_dir, "*.pts")) +for point_file in tqdm(points_files): + point_cloud = np.loadtxt(point_file) + if point_cloud.shape[0] < NUM_SAMPLE_POINTS: + continue + + # Get the file-id of the current point cloud for parsing its + # labels. + file_id = point_file.split("/")[-1].split(".")[0] + label_data, num_labels = {}, 0 + for label in LABELS: + label_file = os.path.join(labels_dir, label, file_id + ".seg") + if os.path.exists(label_file): + label_data[label] = np.loadtxt(label_file).astype("float32") + num_labels = len(label_data[label]) + + # Point clouds having labels will be our training samples. + try: + label_map = ["none"] * num_labels + for label in LABELS: + for i, data in enumerate(label_data[label]): + label_map[i] = label if data == 1 else label_map[i] + label_data = [ + LABELS.index(label) if label != "none" else len(LABELS) + for label in label_map + ] + # Apply one-hot encoding to the dense label representation. + label_data = keras.utils.to_categorical(label_data, num_classes=len(LABELS) + 1) + + point_clouds.append(point_cloud) + point_cloud_labels.append(label_data) + all_labels.append(label_map) + except KeyError: + test_point_clouds.append(point_cloud) + +""" +Next, we take a look at some samples from the in-memory arrays we just generated: +""" + +for _ in range(5): + i = random.randint(0, len(point_clouds) - 1) + print(f"point_clouds[{i}].shape:", point_clouds[0].shape) + print(f"point_cloud_labels[{i}].shape:", point_cloud_labels[0].shape) + for j in range(5): + print( + f"all_labels[{i}][{j}]:", + all_labels[i][j], + f"\tpoint_cloud_labels[{i}][{j}]:", + point_cloud_labels[i][j], + "\n", + ) + +""" +Now, let's visualize some of the point clouds along with their labels. 
+""" + + +def visualize_data(point_cloud, labels): + df = pd.DataFrame( + data={ + "x": point_cloud[:, 0], + "y": point_cloud[:, 1], + "z": point_cloud[:, 2], + "label": labels, + } + ) + fig = plt.figure(figsize=(15, 10)) + ax = plt.axes(projection="3d") + for index, label in enumerate(LABELS): + c_df = df[df["label"] == label] + try: + ax.scatter( + c_df["x"], c_df["y"], c_df["z"], label=label, alpha=0.5, c=COLORS[index] + ) + except IndexError: + pass + ax.legend() + plt.show() + + +visualize_data(point_clouds[0], all_labels[0]) +visualize_data(point_clouds[300], all_labels[300]) + +""" +### Preprocessing + +Note that all the point clouds that we have loaded consist of a variable number of points, +which makes it difficult for us to batch them together. In order to overcome this problem, we +randomly sample a fixed number of points from each point cloud. We also normalize the +point clouds in order to make the data scale-invariant. +""" + +for index in tqdm(range(len(point_clouds))): + current_point_cloud = point_clouds[index] + current_label_cloud = point_cloud_labels[index] + current_labels = all_labels[index] + num_points = len(current_point_cloud) + # Randomly sampling respective indices. + sampled_indices = random.sample(list(range(num_points)), NUM_SAMPLE_POINTS) + # Sampling points corresponding to sampled indices. + sampled_point_cloud = np.array([current_point_cloud[i] for i in sampled_indices]) + # Sampling corresponding one-hot encoded labels. + sampled_label_cloud = np.array([current_label_cloud[i] for i in sampled_indices]) + # Sampling corresponding labels for visualization. + sampled_labels = np.array([current_labels[i] for i in sampled_indices]) + # Normalizing sampled point cloud. + norm_point_cloud = sampled_point_cloud - np.mean(sampled_point_cloud, axis=0) + norm_point_cloud /= np.max(np.linalg.norm(norm_point_cloud, axis=1)) + point_clouds[index] = norm_point_cloud + point_cloud_labels[index] = sampled_label_cloud + all_labels[index] = sampled_labels + +""" +Let's visualize the sampled and normalized point clouds along with their corresponding +labels. +""" + +visualize_data(point_clouds[0], all_labels[0]) +visualize_data(point_clouds[300], all_labels[300]) + +""" +### Creating TensorFlow datasets + +We create `tf.data.Dataset` objects for the training and validation data. +We also augment the training point clouds by applying random jitter to them. 
+""" + + +def load_data(point_cloud_batch, label_cloud_batch): + point_cloud_batch.set_shape([NUM_SAMPLE_POINTS, 3]) + label_cloud_batch.set_shape([NUM_SAMPLE_POINTS, len(LABELS) + 1]) + return point_cloud_batch, label_cloud_batch + + +def augment(point_cloud_batch, label_cloud_batch): + noise = tf.random.uniform( + tf.shape(label_cloud_batch), -0.001, 0.001, dtype=tf.float64 + ) + point_cloud_batch += noise[:, :, :3] + return point_cloud_batch, label_cloud_batch + + +def generate_dataset(point_clouds, label_clouds, is_training=True): + dataset = tf.data.Dataset.from_tensor_slices((point_clouds, label_clouds)) + dataset = dataset.shuffle(BATCH_SIZE * 100) if is_training else dataset + dataset = dataset.map(load_data, num_parallel_calls=tf.data.AUTOTUNE) + dataset = dataset.batch(batch_size=BATCH_SIZE) + dataset = ( + dataset.map(augment, num_parallel_calls=tf.data.AUTOTUNE) + if is_training + else dataset + ) + return dataset + + +split_index = int(len(point_clouds) * (1 - VAL_SPLIT)) +train_point_clouds = point_clouds[:split_index] +train_label_cloud = point_cloud_labels[:split_index] +total_training_examples = len(train_point_clouds) + +val_point_clouds = point_clouds[split_index:] +val_label_cloud = point_cloud_labels[split_index:] + +print("Num train point clouds:", len(train_point_clouds)) +print("Num train point cloud labels:", len(train_label_cloud)) +print("Num val point clouds:", len(val_point_clouds)) +print("Num val point cloud labels:", len(val_label_cloud)) + +train_dataset = generate_dataset(train_point_clouds, train_label_cloud) +val_dataset = generate_dataset(val_point_clouds, val_label_cloud, is_training=False) + +print("Train Dataset:", train_dataset) +print("Validation Dataset:", val_dataset) + +""" +## PointNet model + +The figure below depicts the internals of the PointNet model family: + +![](https://i.imgur.com/qFLNw5L.png) + +Given that PointNet is meant to consume an ***unordered set*** of coordinates as its input data, +its architecture needs to match the following characteristic properties +of point cloud data: + +### Permutation invariance + +Given the unstructured nature of point cloud data, a scan made up of `n` points has `n!` +permutations. The subsequent data processing must be invariant to the different +representations. In order to make PointNet invariant to input permutations, we use a +symmetric function (such as max-pooling) once the `n` input points are mapped to +higher-dimensional space. The result is a **global feature vector** that aims to capture +an aggregate signature of the `n` input points. The global feature vector is used alongside +local point features for segmentation. + +![](https://i.imgur.com/0mrvvjb.png) + +### Transformation invariance + +Segmentation outputs should be unchanged if the object undergoes certain transformations, +such as translation or scaling. For a given input point cloud, we apply an appropriate +rigid or affine transformation to achieve pose normalization. Because each of the `n` input +points are represented as a vector and are mapped to the embedding spaces independently, +applying a geometric transformation simply amounts to matrix multiplying each point with +a transformation matrix. This is motivated by the concept of +[Spatial Transformer Networks](https://arxiv.org/abs/1506.02025). + +The operations comprising the T-Net are motivated by the higher-level architecture of +PointNet. 
MLPs (or fully-connected layers) are used to map the input points independently +and identically to a higher-dimensional space; max-pooling is used to encode a global +feature vector whose dimensionality is then reduced with fully-connected layers. The +input-dependent features at the final fully-connected layer are then combined with +globally trainable weights and biases, resulting in a 3-by-3 transformation matrix. + +![](https://i.imgur.com/aEj3GYi.png) + +### Point interactions + +The interaction between neighboring points often carries useful information (i.e., a +single point should not be treated in isolation). Whereas classification need only make +use of global features, segmentation must be able to leverage local point features along +with global point features. + + +**Note**: The figures presented in this section have been taken from the +[original paper](https://arxiv.org/abs/1612.00593). +""" + +""" +Now that we know the pieces that compose the PointNet model, we can implement the model. +We start by implementing the basic blocks i.e., the convolutional block and the multi-layer +perceptron block. +""" + + +def conv_block(x, filters, name): + x = layers.Conv1D(filters, kernel_size=1, padding="valid", name=f"{name}_conv")(x) + x = layers.BatchNormalization(name=f"{name}_batch_norm")(x) + return layers.Activation("relu", name=f"{name}_relu")(x) + + +def mlp_block(x, filters, name): + x = layers.Dense(filters, name=f"{name}_dense")(x) + x = layers.BatchNormalization(name=f"{name}_batch_norm")(x) + return layers.Activation("relu", name=f"{name}_relu")(x) + + +""" +We implement a regularizer (taken from +[this example](https://keras.io/examples/vision/pointnet/#build-a-model)) +to enforce orthogonality in the feature space. This is needed to ensure +that the magnitudes of the transformed features do not vary too much. +""" + + +class OrthogonalRegularizer(keras.regularizers.Regularizer): + """Reference: https://keras.io/examples/vision/pointnet/#build-a-model""" + + def __init__(self, num_features, l2reg=0.001): + self.num_features = num_features + self.l2reg = l2reg + self.identity = keras.ops.eye(num_features) + + def __call__(self, x): + x = keras.ops.reshape(x, (-1, self.num_features, self.num_features)) + xxt = keras.ops.tensordot(x, x, axes=(2, 2)) + xxt = keras.ops.reshape(xxt, (-1, self.num_features, self.num_features)) + return keras.ops.sum(self.l2reg * keras.ops.square(xxt - self.identity)) + + def get_config(self): + config = super().get_config() + config.update({"num_features": self.num_features, "l2reg_strength": self.l2reg}) + return config + + +""" +The next piece is the transformation network which we explained earlier. +""" + + +def transformation_net(inputs, num_features, name): + """ + Reference: https://keras.io/examples/vision/pointnet/#build-a-model. + + The `filters` values come from the original paper: + https://arxiv.org/abs/1612.00593. 
+ """ + x = conv_block(inputs, filters=64, name=f"{name}_1") + x = conv_block(x, filters=128, name=f"{name}_2") + x = conv_block(x, filters=1024, name=f"{name}_3") + x = layers.GlobalMaxPooling1D()(x) + x = mlp_block(x, filters=512, name=f"{name}_1_1") + x = mlp_block(x, filters=256, name=f"{name}_2_1") + return layers.Dense( + num_features * num_features, + kernel_initializer="zeros", + bias_initializer=keras.initializers.Constant(np.eye(num_features).flatten()), + activity_regularizer=OrthogonalRegularizer(num_features), + name=f"{name}_final", + )(x) + + +def transformation_block(inputs, num_features, name): + transformed_features = transformation_net(inputs, num_features, name=name) + transformed_features = layers.Reshape((num_features, num_features))( + transformed_features + ) + return layers.Dot(axes=(2, 1), name=f"{name}_mm")([inputs, transformed_features]) + + +""" +Finally, we piece the above blocks together and implement the segmentation model. +""" + + +def get_shape_segmentation_model(num_points, num_classes): + input_points = keras.Input(shape=(None, 3)) + + # PointNet Classification Network. + transformed_inputs = transformation_block( + input_points, num_features=3, name="input_transformation_block" + ) + features_64 = conv_block(transformed_inputs, filters=64, name="features_64") + features_128_1 = conv_block(features_64, filters=128, name="features_128_1") + features_128_2 = conv_block(features_128_1, filters=128, name="features_128_2") + transformed_features = transformation_block( + features_128_2, num_features=128, name="transformed_features" + ) + features_512 = conv_block(transformed_features, filters=512, name="features_512") + features_2048 = conv_block(features_512, filters=2048, name="pre_maxpool_block") + global_features = layers.MaxPool1D(pool_size=num_points, name="global_features")( + features_2048 + ) + global_features = keras.ops.tile(global_features, [1, num_points, 1]) + + # Segmentation head. + segmentation_input = layers.Concatenate(name="segmentation_input")( + [ + features_64, + features_128_1, + features_128_2, + transformed_features, + features_512, + global_features, + ] + ) + segmentation_features = conv_block( + segmentation_input, filters=128, name="segmentation_features" + ) + outputs = layers.Conv1D( + num_classes, kernel_size=1, activation="softmax", name="segmentation_head" + )(segmentation_features) + return keras.Model(input_points, outputs) + + +""" +## Instantiate the model +""" + +x, y = next(iter(train_dataset)) + +num_points = x.shape[1] +num_classes = y.shape[-1] + +segmentation_model = get_shape_segmentation_model(num_points, num_classes) +segmentation_model.summary() + +""" +## Training + +For the training the authors recommend using a learning rate schedule that decays the +initial learning rate by half every 20 epochs. In this example, we use 5 epochs. +""" + +steps_per_epoch = total_training_examples // BATCH_SIZE +total_training_steps = steps_per_epoch * EPOCHS +print(f"Steps per epoch: {steps_per_epoch}.") +print(f"Total training steps: {total_training_steps}.") + +lr_schedule = keras.optimizers.schedules.ExponentialDecay( + initial_learning_rate=0.003, + decay_steps=steps_per_epoch * 5, + decay_rate=0.5, + staircase=True, +) + +steps = range(total_training_steps) +lrs = [lr_schedule(step) for step in steps] + +plt.plot(lrs) +plt.xlabel("Steps") +plt.ylabel("Learning Rate") +plt.show() + +""" +Finally, we implement a utility for running our experiments and launch model training. 
+""" + + +def run_experiment(epochs): + segmentation_model = get_shape_segmentation_model(num_points, num_classes) + segmentation_model.compile( + optimizer=keras.optimizers.Adam(learning_rate=lr_schedule), + loss=keras.losses.CategoricalCrossentropy(), + metrics=["accuracy"], + ) + + checkpoint_filepath = "checkpoint.weights.h5" + checkpoint_callback = keras.callbacks.ModelCheckpoint( + checkpoint_filepath, + monitor="val_loss", + save_best_only=True, + save_weights_only=True, + ) + + history = segmentation_model.fit( + train_dataset, + validation_data=val_dataset, + epochs=epochs, + callbacks=[checkpoint_callback], + ) + + segmentation_model.load_weights(checkpoint_filepath) + return segmentation_model, history + + +segmentation_model, history = run_experiment(epochs=EPOCHS) + +""" +## Visualize the training landscape +""" + + +def plot_result(item): + plt.plot(history.history[item], label=item) + plt.plot(history.history["val_" + item], label="val_" + item) + plt.xlabel("Epochs") + plt.ylabel(item) + plt.title("Train and Validation {} Over Epochs".format(item), fontsize=14) + plt.legend() + plt.grid() + plt.show() + + +plot_result("loss") +plot_result("accuracy") + +""" +## Inference +""" + +validation_batch = next(iter(val_dataset)) +val_predictions = segmentation_model.predict(validation_batch[0]) +print(f"Validation prediction shape: {val_predictions.shape}") + + +def visualize_single_point_cloud(point_clouds, label_clouds, idx): + label_map = LABELS + ["none"] + point_cloud = point_clouds[idx] + label_cloud = label_clouds[idx] + visualize_data(point_cloud, [label_map[np.argmax(label)] for label in label_cloud]) + + +idx = np.random.choice(len(validation_batch[0])) +print(f"Index selected: {idx}") + +# Plotting with ground-truth. +visualize_single_point_cloud(validation_batch[0], validation_batch[1], idx) + +# Plotting with predicted labels. +visualize_single_point_cloud(validation_batch[0], val_predictions, idx) + +""" +## Final notes + +If you are interested in learning more about this topic, you may find +[this repository](https://github.com/soumik12345/point-cloud-segmentation) +useful. +""" diff --git a/knowledge_base/vision/probing_vits.py b/knowledge_base/vision/probing_vits.py new file mode 100644 index 0000000000000000000000000000000000000000..16f7b1a8815c50226a9178fab3e21f0d1bef9cb9 --- /dev/null +++ b/knowledge_base/vision/probing_vits.py @@ -0,0 +1,706 @@ +""" +Title: Investigating Vision Transformer representations +Authors: [Aritra Roy Gosthipaty](https://twitter.com/ariG23498), [Sayak Paul](https://twitter.com/RisingSayak) (equal contribution) +Date created: 2022/04/12 +Last modified: 2023/11/20 +Description: Looking into the representations learned by different Vision Transformers variants. +Accelerator: None +""" + +""" +## Introduction + +In this example, we look into the representations learned by different Vision +Transformer (ViT) models. Our main goal with this example is to provide insights into +what empowers ViTs to learn from image data. In particular, the example discusses +implementations of a few different ViT analysis tools. + +**Note:** when we say "Vision Transformer", we refer to a computer vision architecture that +involves Transformer blocks ([Vaswani et al.](https://arxiv.org/abs/1706.03762)) and not +necessarily the original Vision Transformer model +([Dosovitskiy et al.](https://arxiv.org/abs/2010.11929)). 
+ +## Models considered + +Since the inception of the original Vision Transformer, the computer vision community has +seen a number of different ViT variants improving upon the original in various ways: +training improvements, architecture improvements, and so on. +In this example, we consider the following ViT model families: + +* ViTs trained using supervised pretraining with the ImageNet-1k and ImageNet-21k +datasets ([Dosovitskiy et al.](https://arxiv.org/abs/2010.11929)) +* ViTs trained using supervised pretraining but only with the ImageNet-1k dataset with +more regularization and distillation ([Touvron et al.](https://arxiv.org/abs/2012.12877)) +(DeiT). +* ViTs trained using self-supervised pretraining ([Caron et al.](https://arxiv.org/abs/2104.14294)) +(DINO). + +Since the pretrained models are not implemented in Keras, we first implemented them as +faithfully as possible. We then populated them with the official pretrained parameters. +Finally, we evaluated our implementations on the ImageNet-1k validation set to ensure the +evaluation numbers were matching with the original implementations. The details of our implementations +are available in [this repository](https://github.com/sayakpaul/probing-vits). + +To keep the example concise, we won't exhaustively pair each model with the analysis +methods. We'll provide notes in the respective sections so that you can pick up the +pieces. + +To run this example on Google Colab, we need to update the `gdown` library like so: + +```shell +pip install -U gdown -q +``` +""" + +""" +## Imports +""" +import os + +os.environ["KERAS_BACKEND"] = "tensorflow" + +import zipfile +from io import BytesIO + +import cv2 +import matplotlib.pyplot as plt +import numpy as np +import requests + +from PIL import Image +from sklearn.preprocessing import MinMaxScaler +import keras +from keras import ops + +""" +## Constants +""" + +RESOLUTION = 224 +PATCH_SIZE = 16 +GITHUB_RELEASE = "https://github.com/sayakpaul/probing-vits/releases/download/v1.0.0/probing_vits.zip" +FNAME = "probing_vits.zip" +MODELS_ZIP = { + "vit_dino_base16": "Probing_ViTs/vit_dino_base16.zip", + "vit_b16_patch16_224": "Probing_ViTs/vit_b16_patch16_224.zip", + "vit_b16_patch16_224-i1k_pretrained": "Probing_ViTs/vit_b16_patch16_224-i1k_pretrained.zip", +} + +""" +## Data utilities + +For the original ViT models, the input images need to be scaled to the range `[-1, 1]`. For +the other model families mentioned at the beginning, we need to normalize the images with +channel-wise mean and standard deviation of the ImageNet-1k training set. + + +""" + +crop_layer = keras.layers.CenterCrop(RESOLUTION, RESOLUTION) +norm_layer = keras.layers.Normalization( + mean=[0.485 * 255, 0.456 * 255, 0.406 * 255], + variance=[(0.229 * 255) ** 2, (0.224 * 255) ** 2, (0.225 * 255) ** 2], +) +rescale_layer = keras.layers.Rescaling(scale=1.0 / 127.5, offset=-1) + + +def preprocess_image(image, model_type, size=RESOLUTION): + # Turn the image into a numpy array and add batch dim. + image = np.array(image) + image = ops.expand_dims(image, 0) + + # If model type is vit rescale the image to [-1, 1]. + if model_type == "original_vit": + image = rescale_layer(image) + + # Resize the image using bicubic interpolation. + resize_size = int((256 / 224) * size) + image = ops.image.resize(image, (resize_size, resize_size), interpolation="bicubic") + + # Crop the image. + image = crop_layer(image) + + # If model type is DeiT or DINO normalize the image. 
+ if model_type != "original_vit": + image = norm_layer(image) + + return ops.convert_to_numpy(image) + + +def load_image_from_url(url, model_type): + # Credit: Willi Gierke + response = requests.get(url) + image = Image.open(BytesIO(response.content)) + preprocessed_image = preprocess_image(image, model_type) + return image, preprocessed_image + + +""" +## Load a test image and display it +""" + +# ImageNet-1k label mapping file and load it. + +mapping_file = keras.utils.get_file( + origin="https://storage.googleapis.com/bit_models/ilsvrc2012_wordnet_lemmas.txt" +) + +with open(mapping_file, "r") as f: + lines = f.readlines() +imagenet_int_to_str = [line.rstrip() for line in lines] + +img_url = "https://dl.fbaipublicfiles.com/dino/img.png" +image, preprocessed_image = load_image_from_url(img_url, model_type="original_vit") + +plt.imshow(image) +plt.axis("off") +plt.show() + +""" +## Load a model +""" + + +zip_path = keras.utils.get_file( + fname=FNAME, + origin=GITHUB_RELEASE, +) + +with zipfile.ZipFile(zip_path, "r") as zip_ref: + zip_ref.extractall("./") + +os.rename("Probing ViTs", "Probing_ViTs") + + +def load_model(model_path: str) -> keras.Model: + with zipfile.ZipFile(model_path, "r") as zip_ref: + zip_ref.extractall("Probing_ViTs/") + model_name = model_path.split(".")[0] + + inputs = keras.Input((RESOLUTION, RESOLUTION, 3)) + model = keras.layers.TFSMLayer(model_name, call_endpoint="serving_default") + outputs = model(inputs, training=False) + + return keras.Model(inputs, outputs=outputs) + + +vit_base_i21k_patch16_224 = load_model(MODELS_ZIP["vit_b16_patch16_224-i1k_pretrained"]) +print("Model loaded.") + +""" +**More about the model**: + +This model was pretrained on the ImageNet-21k dataset and was then fine-tuned on the +ImageNet-1k dataset. To learn more about how we developed this model in TensorFlow +(with pretrained weights from +[this source](https://github.com/google-research/vision_transformer/)) refer to +[this notebook](https://github.com/sayakpaul/probing-vits/blob/main/notebooks/load-jax-weights-vitb16.ipynb). +""" + +""" +## Running regular inference with the model + +We now run inference with the loaded model on our test image. +""" + + +def split_prediction_and_attention_scores(outputs): + predictions = outputs["output_1"] + attention_score_dict = {} + for key, value in outputs.items(): + if key.startswith("output_2_"): + attention_score_dict[key[len("output_2_") :]] = value + return predictions, attention_score_dict + + +predictions, attention_score_dict = split_prediction_and_attention_scores( + vit_base_i21k_patch16_224.predict(preprocessed_image) +) +predicted_label = imagenet_int_to_str[int(np.argmax(predictions))] +print(predicted_label) + +""" +`attention_score_dict` contains the attention scores (softmaxed outputs) from each +attention head of each Transformer block. +""" + +""" +## Method I: Mean attention distance + +[Dosovitskiy et al.](https://arxiv.org/abs/2010.11929) and +[Raghu et al.](https://arxiv.org/abs/2108.08810) use a measure called +"mean attention distance" from each attention head of different +Transformer blocks to understand how local and global information flows +into Vision Transformers. + +Mean attention distance is defined as the distance between query tokens and the other +tokens times attention weights. So, for a single image + +* we take individual patches (tokens) extracted from the image, +* calculate their geometric distance, and +* multiply that with the attention scores. 
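+
+For intuition, here is a minimal NumPy sketch of that computation on a toy 2 x 2 grid
+of patches with a uniform attention distribution (the numbers are illustrative only,
+not taken from the model):
+
+```python
+import numpy as np
+
+patch_size, length = 16, 2  # toy 2 x 2 grid of 16 x 16 patches
+num_patches = length**2
+
+# Geometric distance (in pixels) between every pair of patch centers.
+distances = np.zeros((num_patches, num_patches))
+for i in range(num_patches):
+    for j in range(num_patches):
+        yi, xi = divmod(i, length)
+        yj, xj = divmod(j, length)
+        distances[i, j] = patch_size * np.hypot(xi - xj, yi - yj)
+
+attention = np.full((num_patches, num_patches), 1.0 / num_patches)  # rows sum to 1
+mean_distance = (attention * distances).sum(axis=-1).mean()
+print(mean_distance)  # ~13.66 pixels for this toy setup
+```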
+ +Attention scores are calculated here after forward passing the image in inference mode +through the network. The following figure may help you understand the process a +little bit better. + + + +This animation is created by [Ritwik Raha](https://twitter.com/ritwik_raha). +""" + + +def compute_distance_matrix(patch_size, num_patches, length): + distance_matrix = np.zeros((num_patches, num_patches)) + for i in range(num_patches): + for j in range(num_patches): + if i == j: # zero distance + continue + + xi, yi = (int(i / length)), (i % length) + xj, yj = (int(j / length)), (j % length) + distance_matrix[i, j] = patch_size * np.linalg.norm([xi - xj, yi - yj]) + + return distance_matrix + + +def compute_mean_attention_dist(patch_size, attention_weights, model_type): + num_cls_tokens = 2 if "distilled" in model_type else 1 + + # The attention_weights shape = (batch, num_heads, num_patches, num_patches) + attention_weights = attention_weights[ + ..., num_cls_tokens:, num_cls_tokens: + ] # Removing the CLS token + num_patches = attention_weights.shape[-1] + length = int(np.sqrt(num_patches)) + assert length**2 == num_patches, "Num patches is not perfect square" + + distance_matrix = compute_distance_matrix(patch_size, num_patches, length) + h, w = distance_matrix.shape + + distance_matrix = distance_matrix.reshape((1, 1, h, w)) + # The attention_weights along the last axis adds to 1 + # this is due to the fact that they are softmax of the raw logits + # summation of the (attention_weights * distance_matrix) + # should result in an average distance per token. + mean_distances = attention_weights * distance_matrix + mean_distances = np.sum( + mean_distances, axis=-1 + ) # Sum along last axis to get average distance per token + mean_distances = np.mean( + mean_distances, axis=-1 + ) # Now average across all the tokens + + return mean_distances + + +""" +Thanks to [Simon Kornblith](https://scholar.google.com/citations?user=1O3RPmsAAAAJ&hl=en) +from Google who helped us with this code snippet. It can be found +[here](https://gist.github.com/simonster/155894d48aef2bd36bd2dd8267e62391). Let's now use +these utilities to generate a plot of attention distances with our loaded model and test +image. +""" + +# Build the mean distances for every Transformer block. +mean_distances = { + f"{name}_mean_dist": compute_mean_attention_dist( + patch_size=PATCH_SIZE, + attention_weights=attention_weight, + model_type="original_vit", + ) + for name, attention_weight in attention_score_dict.items() +} + +# Get the number of heads from the mean distance output. +num_heads = mean_distances["transformer_block_0_att_mean_dist"].shape[-1] + +# Print the shapes +print(f"Num Heads: {num_heads}.") + +plt.figure(figsize=(9, 9)) + +for idx in range(len(mean_distances)): + mean_distance = mean_distances[f"transformer_block_{idx}_att_mean_dist"] + x = [idx] * num_heads + y = mean_distance[0, :] + plt.scatter(x=x, y=y, label=f"transformer_block_{idx}") + +plt.legend(loc="lower right") +plt.xlabel("Attention Head", fontsize=14) +plt.ylabel("Attention Distance", fontsize=14) +plt.title("vit_base_i21k_patch16_224", fontsize=14) +plt.grid() +plt.show() + +""" +### Inspecting the plots + +**How does self-attention span across the input space? Do they attend +input regions locally or globally?** + +The promise of self-attention is to enable the learning of contextual dependencies +so that a model can attend to the regions of inputs which are the most salient w.r.t +the objective. 
From the above plots we can notice that different attention heads yield
+different attention distances, suggesting that the model uses both local and global
+information from an image. As we go deeper into the Transformer blocks, however, the
+heads tend to focus more on global, aggregate information.
+
+Inspired by [Raghu et al.](https://arxiv.org/abs/2108.08810), we computed mean attention
+distances over 1000 images randomly sampled from the ImageNet-1k validation set and
+repeated the process for all the models mentioned at the beginning. Interestingly, we
+noticed the following:
+
+* Pretraining with a larger dataset helps with more global attention spans:
+
+
+| Pretrained on ImageNet-21k
Fine-tuned on ImageNet-1k | Pretrained on ImageNet-1k | +| :--: | :--: | +| ![](https://drive.google.com/uc?export=view&id=1aFob5Cj0FkRyVhH4Iw7Dh5SFxQH3rpYs) | ![](https://drive.google.com/uc?export=view&id=13Y-3Ypi58PPRqd-pqP1oHkyNkRHCYypH) | + + +* When distilled from a CNN ViTs tend to have less global attention spans: + + +| No distillation (ViT B-16 from DeiT) | Distilled ViT B-16 from DeiT | +| :--: | :--: | +| ![](https://drive.google.com/uc?export=view&id=1yH4cPQcMFCnuo3-IW3S9Baszr_d0o2Se) | ![](https://drive.google.com/uc?export=view&id=1m_nG12kq7E_zIEkxhsi7U0Xr_VDXhJYE) | + +To reproduce these plots, please refer to +[this notebook](https://github.com/sayakpaul/probing-vits/blob/main/notebooks/mean-attention-distance-1k.ipynb). + +""" + +""" +## Method II: Attention Rollout + +[Abnar et al.](https://arxiv.org/abs/2005.00928) introduce "Attention rollout" for +quantifying how information flow through self-attention layers of Transformer blocks. +Original ViT authors use this method to investigate the learned representations, stating: + +> Briefly, we averaged attention weights of ViTL/16 across all heads and then recursively +multiplied the weight matrices of all layers. This accounts for the mixing of attention +across tokens through all layers. + +We used +[this notebook](https://colab.research.google.com/github/jeonsworld/ViT-pytorch/blob/main/visualize_attention_map.ipynb) +and modified the attention rollout code from it for compatibility with our models. +""" + + +def attention_rollout_map(image, attention_score_dict, model_type): + num_cls_tokens = 2 if "distilled" in model_type else 1 + + # Stack the individual attention matrices from individual Transformer blocks. + attn_mat = ops.stack([attention_score_dict[k] for k in attention_score_dict.keys()]) + attn_mat = ops.squeeze(attn_mat, axis=1) + + # Average the attention weights across all heads. + attn_mat = ops.mean(attn_mat, axis=1) + + # To account for residual connections, we add an identity matrix to the + # attention matrix and re-normalize the weights. + residual_attn = ops.eye(attn_mat.shape[1]) + aug_attn_mat = attn_mat + residual_attn + aug_attn_mat = aug_attn_mat / ops.sum(aug_attn_mat, axis=-1)[..., None] + aug_attn_mat = ops.convert_to_numpy(aug_attn_mat) + + # Recursively multiply the weight matrices. + joint_attentions = np.zeros(aug_attn_mat.shape) + joint_attentions[0] = aug_attn_mat[0] + + for n in range(1, aug_attn_mat.shape[0]): + joint_attentions[n] = np.matmul(aug_attn_mat[n], joint_attentions[n - 1]) + + # Attention from the output token to the input space. + v = joint_attentions[-1] + grid_size = int(np.sqrt(aug_attn_mat.shape[-1])) + mask = v[0, num_cls_tokens:].reshape(grid_size, grid_size) + mask = cv2.resize(mask / mask.max(), image.size)[..., np.newaxis] + result = (mask * image).astype("uint8") + return result + + +""" +Let's now use these utilities to generate an attention plot based on our previous results +from the "Running regular inference with the model" section. 
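+Before generating the plot, here is a minimal toy sketch (random 3 x 3 matrices, not
+the model's real attention maps) of the core rollout recursion implemented above:
+
+```python
+import numpy as np
+
+rng = np.random.default_rng(0)
+attn = rng.random((2, 3, 3))                    # two "layers", head-averaged
+attn = attn / attn.sum(axis=-1, keepdims=True)  # rows sum to 1
+
+aug = attn + np.eye(3)                          # add the residual connection
+aug = aug / aug.sum(axis=-1, keepdims=True)     # re-normalize the rows
+
+rollout = aug[0]
+for layer_attn in aug[1:]:
+    rollout = layer_attn @ rollout              # recursively mix across layers
+print(rollout.sum(axis=-1))                     # rows still sum to ~1.0
+```
+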
Following are the links to +download each individual model: + +* [Original ViT model (pretrained on ImageNet-21k)](https://drive.google.com/file/d/1mbtnliT3jRb3yJUHhbItWw8unfYZw8KJ/view?usp=sharing) +* [Original ViT model (pretrained on ImageNet-1k)](https://drive.google.com/file/d/1ApOdYe4NXxhPhJABefgZ3KVvqsQzhCL7/view?usp=sharing) +* [DINO model (pretrained on ImageNet-1k)](https://drive.google.com/file/d/16_1oDm0PeCGJ_KGBG5UKVN7TsAtiRNrN/view?usp=sharing) +* [DeiT models (pretrained on ImageNet-1k including distilled and non-distilled ones)](https://tfhub.dev/sayakpaul/collections/deit/1) +""" + +attn_rollout_result = attention_rollout_map( + image, attention_score_dict, model_type="original_vit" +) + +fig, (ax1, ax2) = plt.subplots(ncols=2, figsize=(8, 10)) +fig.suptitle(f"Predicted label: {predicted_label}.", fontsize=20) + +_ = ax1.imshow(image) +_ = ax2.imshow(attn_rollout_result) +ax1.set_title("Input Image", fontsize=16) +ax2.set_title("Attention Map", fontsize=16) +ax1.axis("off") +ax2.axis("off") + +fig.tight_layout() +fig.subplots_adjust(top=1.35) +fig.show() + +""" +### Inspecting the plots + +**How can we quanitfy the information flow that propagates through the +attention layers?** + +We notice that the model is able to focus its attention on the +salient parts of the input image. We encourage you to apply this +method to the other models we mentioned and compare the results. The +attention rollout plots will differ according to the tasks and +augmentation the model was trained with. We observe that DeiT has the +best rollout plot, likely due to its augmentation regime. +""" + +""" +## Method III: Attention heatmaps + +A simple yet useful way to probe into the representation of a Vision Transformer is to +visualise the attention maps overlayed on the input images. This helps form an intuition +about what the model attends to. We use the DINO model for this purpose, because it +yields better attention heatmaps. +""" + +# Load the model. +vit_dino_base16 = load_model(MODELS_ZIP["vit_dino_base16"]) +print("Model loaded.") + +# Preprocess the same image but with normlization. +img_url = "https://dl.fbaipublicfiles.com/dino/img.png" +image, preprocessed_image = load_image_from_url(img_url, model_type="dino") + +# Grab the predictions. +predictions, attention_score_dict = split_prediction_and_attention_scores( + vit_dino_base16.predict(preprocessed_image) +) + +""" +A Transformer block consists of multiple heads. Each head in a Transformer block projects +the input data to different sub-spaces. This helps each individual head to attend to +different parts of the image. Therefore, it makes sense to visualize each attention head +map seperately, to make sense of what each heads looks at. + +**Notes**: + +* The following code has been copy-modified from the +[original DINO codebase](https://github.com/facebookresearch/dino/blob/main/visualize_attention.py). +* Here we grab the attention maps of the last Transformer block. +* [DINO](https://arxiv.org/abs/2104.14294) was pretrained using a self-supervised +objective. +""" + + +def attention_heatmap(attention_score_dict, image, model_type="dino"): + num_tokens = 2 if "distilled" in model_type else 1 + + # Sort the Transformer blocks in order of their depth. + attention_score_list = list(attention_score_dict.keys()) + attention_score_list.sort(key=lambda x: int(x.split("_")[-2]), reverse=True) + + # Process the attention maps for overlay. 
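+    # `w_featmap`/`h_featmap` below are the number of patches along each image axis.
+    # We take the deepest Transformer block, grab the attention paid by the CLS token
+    # to every patch token in each head, and reshape it back into that patch grid.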
+ w_featmap = image.shape[2] // PATCH_SIZE + h_featmap = image.shape[1] // PATCH_SIZE + attention_scores = attention_score_dict[attention_score_list[0]] + + # Taking the representations from CLS token. + attentions = attention_scores[0, :, 0, num_tokens:].reshape(num_heads, -1) + + # Reshape the attention scores to resemble mini patches. + attentions = attentions.reshape(num_heads, w_featmap, h_featmap) + attentions = attentions.transpose((1, 2, 0)) + + # Resize the attention patches to 224x224 (224: 14x16). + attentions = ops.image.resize( + attentions, size=(h_featmap * PATCH_SIZE, w_featmap * PATCH_SIZE) + ) + return attentions + + +""" +We can use the same image we used for inference with DINO and the `attention_score_dict` +we extracted from the results. +""" + +# De-normalize the image for visual clarity. +in1k_mean = np.array([0.485 * 255, 0.456 * 255, 0.406 * 255]) +in1k_std = np.array([0.229 * 255, 0.224 * 255, 0.225 * 255]) +preprocessed_img_orig = (preprocessed_image * in1k_std) + in1k_mean +preprocessed_img_orig = preprocessed_img_orig / 255.0 +preprocessed_img_orig = ops.convert_to_numpy(ops.clip(preprocessed_img_orig, 0.0, 1.0)) + +# Generate the attention heatmaps. +attentions = attention_heatmap(attention_score_dict, preprocessed_img_orig) + +# Plot the maps. +fig, axes = plt.subplots(nrows=3, ncols=4, figsize=(13, 13)) +img_count = 0 + +for i in range(3): + for j in range(4): + if img_count < len(attentions): + axes[i, j].imshow(preprocessed_img_orig[0]) + axes[i, j].imshow(attentions[..., img_count], cmap="inferno", alpha=0.6) + axes[i, j].title.set_text(f"Attention head: {img_count}") + axes[i, j].axis("off") + img_count += 1 + +""" +### Inspecting the plots + +**How can we qualitatively evaluate the attention weights?** + +The attention weights of a Transformer block are computed between the +key and the query. The weights quantifies how important is the key to the query. +In the ViTs the key and the query comes from the same image, hence +the weights determine which part of the image is important. + +Plotting the attention weigths overlayed on the image gives us a great +intuition about the parts of the image that are important to the Transformer. +This plot qualitatively evaluates the purpose of the attention weights. +""" + +""" +## Method IV: Visualizing the learned projection filters + +After extracting non-overlapping patches, ViTs flatten those patches across their +saptial dimensions, and then linearly project them. One might wonder, how do these +projections look like? Below, we take the ViT B-16 model and visualize its +learned projections. +""" + + +def extract_weights(model, name): + for variable in model.weights: + if variable.name.startswith(name): + return variable.numpy() + + +# Extract the projections. +projections = extract_weights(vit_base_i21k_patch16_224, "conv_projection/kernel") +projection_dim = projections.shape[-1] +patch_h, patch_w, patch_channels = projections.shape[:-1] + +# Scale the projections. +scaled_projections = MinMaxScaler().fit_transform( + projections.reshape(-1, projection_dim) +) + +# Reshape the scaled projections so that the leading +# three dimensions resemble an image. +scaled_projections = scaled_projections.reshape(patch_h, patch_w, patch_channels, -1) + +# Visualize the first 128 filters of the learned +# projections. 
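+# Each filter is a patch-sized (16, 16, 3) kernel, so it can be displayed directly
+# as a tiny RGB image; the 8 x 16 grid below shows the first 128 of the
+# `projection_dim` filters.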
+fig, axes = plt.subplots(nrows=8, ncols=16, figsize=(13, 8)) +img_count = 0 +limit = 128 + +for i in range(8): + for j in range(16): + if img_count < limit: + axes[i, j].imshow(scaled_projections[..., img_count]) + axes[i, j].axis("off") + img_count += 1 + +fig.tight_layout() + +""" +### Inspecting the plots + +**What do the projection filters learn?** + +[When visualized](https://distill.pub/2017/feature-visualization/), +the kernels of a convolutional neural network show +the pattern that they look for in an image. This could be circles, +sometimes lines -- when combined together (in later stage of a ConvNet), the filters +transform into more complex shapes. We have found a stark similarity between such +ConvNet kernels and the projection filters of a ViT. +""" + +""" +## Method V: Visualizing the positional emebddings + +Transformers are permutation-invariant. This means that do not take into account +the spatial position of the input tokens. To overcome this +limitation, we add positional information to the input tokens. + +The positional information can be in the form of leaned positional +embeddings or handcrafted constant embeddings. In our case, all the +three variants of ViTs feature learned positional embeddings. + +In this section, we visualize the similarities between the +learned positional embeddings with itself. Below, we take the ViT B-16 +model and visualize the similarity of the positional embeddings by +taking their dot-product. +""" + +position_embeddings = extract_weights(vit_base_i21k_patch16_224, "pos_embedding") + +# Discard the batch dimension and the position embeddings of the +# cls token. +position_embeddings = position_embeddings.squeeze()[1:, ...] + +similarity = position_embeddings @ position_embeddings.T +plt.imshow(similarity, cmap="inferno") +plt.show() + +""" +### Inspecting the plots + +**What do the positional embeddings tell us?** + +The plot has a distinctive diagonal pattern. The main diagonal is the brightest +signifying that a position is the most similar to itself. An interesting +pattern to look out for is the repeating diagonals. The repeating pattern +portrays a sinusoidal function which is close in essence to what was proposed by +[Vaswani et. al.](https://arxiv.org/abs/1706.03762) as a hand-crafted feature. +""" + +""" +## Notes + +* DINO extended the attention heatmap generation process to videos. We also +[applied](https://github.com/sayakpaul/probing-vits/blob/main/notebooks/dino-attention-map +s-video.ipynb) our DINO implementation on a series of videos and obtained similar +results. Here's one such video of attention heatmaps: + + ![dino](https://i.imgur.com/kAShjbs.gif) + +* [Raghu et al.](https://arxiv.org/abs/2108.08810) use an array of techniques to +investigate the representations learned by ViTs and make comparisons with that of +ResNets. We strongly recommend reading their work. + +* To author this example, we developed +[this repository](https://github.com/sayakpaul/probing-vits) to guide our readers so that they +can easily reproduce the experiments and extend them. + +* Another repository that you may find interesting in this regard is +[`vit-explain`](https://github.com/jacobgil/vit-explain). + +* One can also plot the attention rollout and attention heat maps with +custom images using our Hugging Face spaces. 
+ +| Attention Heat Maps | Attention Rollout | +| :--: | :--: | +| [![Generic badge](https://img.shields.io/badge/๐Ÿค—%20Spaces-Attention%20Heat%20Maps-black.svg)](https://huggingface.co/spaces/probing-vits/attention-heat-maps) | [![Generic badge](https://img.shields.io/badge/๐Ÿค—%20Spaces-Attention%20Rollout-black.svg)](https://huggingface.co/spaces/probing-vits/attention-rollout) | +""" + +""" +## Acknowledgements + +- [PyImageSearch](https://pyimagesearch.com) +- [Jarvislabs.ai](https://jarvislabs.ai/) +- [GDE Program](https://developers.google.com/programs/experts/) +""" diff --git a/knowledge_base/vision/randaugment.py b/knowledge_base/vision/randaugment.py new file mode 100644 index 0000000000000000000000000000000000000000..1196c2872b308b6ffbee0bf60d62fb2774674e28 --- /dev/null +++ b/knowledge_base/vision/randaugment.py @@ -0,0 +1,296 @@ +""" +Title: RandAugment for Image Classification for Improved Robustness +Authors: [Sayak Paul](https://twitter.com/RisingSayak)[Sachin Prasad](https://github.com/sachinprasadhs) +Date created: 2021/03/13 +Last modified: 2023/12/12 +Description: RandAugment for training an image classification model with improved robustness. +Accelerator: GPU +""" + +""" +Data augmentation is a very useful technique that can help to improve the translational +invariance of convolutional neural networks (CNN). RandAugment is a stochastic data +augmentation routine for vision data and was proposed in +[RandAugment: Practical automated data augmentation with a reduced search space](https://arxiv.org/abs/1909.13719). +It is composed of strong augmentation transforms like color jitters, Gaussian blurs, +saturations, etc. along with more traditional augmentation transforms such as +random crops. + +These parameters are tuned for a given dataset and a network architecture. The authors of +RandAugment also provide pseudocode of RandAugment in the original paper (Figure 2). + +Recently, it has been a key component of works like +[Noisy Student Training](https://arxiv.org/abs/1911.04252) and +[Unsupervised Data Augmentation for Consistency Training](https://arxiv.org/abs/1904.12848). +It has been also central to the +success of [EfficientNets](https://arxiv.org/abs/1905.11946). + + +```python +pip install keras-cv +``` +""" + +""" +## Imports & setup +""" +import os + +os.environ["KERAS_BACKEND"] = "tensorflow" +import keras +import keras_cv +from keras import ops +from keras import layers +import tensorflow as tf +import numpy as np +import matplotlib.pyplot as plt +import tensorflow_datasets as tfds + +tfds.disable_progress_bar() +keras.utils.set_random_seed(42) + +""" +## Load the CIFAR10 dataset + +For this example, we will be using the +[CIFAR10 dataset](https://www.cs.toronto.edu/~kriz/cifar.html). +""" + +(x_train, y_train), (x_test, y_test) = keras.datasets.cifar10.load_data() +print(f"Total training examples: {len(x_train)}") +print(f"Total test examples: {len(x_test)}") + +""" +## Define hyperparameters +""" + +AUTO = tf.data.AUTOTUNE +BATCH_SIZE = 128 +EPOCHS = 1 +IMAGE_SIZE = 72 + +""" +## Initialize `RandAugment` object + +Now, we will initialize a `RandAugment` object from the `imgaug.augmenters` module with +the parameters suggested by the RandAugment authors. 
+""" + +rand_augment = keras_cv.layers.RandAugment( + value_range=(0, 255), augmentations_per_image=3, magnitude=0.8 +) + + +""" +## Create TensorFlow `Dataset` objects +""" + +train_ds_rand = ( + tf.data.Dataset.from_tensor_slices((x_train, y_train)) + .shuffle(BATCH_SIZE * 100) + .batch(BATCH_SIZE) + .map( + lambda x, y: (tf.image.resize(x, (IMAGE_SIZE, IMAGE_SIZE)), y), + num_parallel_calls=AUTO, + ) + .map( + lambda x, y: (rand_augment(tf.cast(x, tf.uint8)), y), + num_parallel_calls=AUTO, + ) + .prefetch(AUTO) +) + +test_ds = ( + tf.data.Dataset.from_tensor_slices((x_test, y_test)) + .batch(BATCH_SIZE) + .map( + lambda x, y: (tf.image.resize(x, (IMAGE_SIZE, IMAGE_SIZE)), y), + num_parallel_calls=AUTO, + ) + .prefetch(AUTO) +) + +""" +For comparison purposes, let's also define a simple augmentation pipeline consisting of +random flips, random rotations, and random zoomings. +""" + +simple_aug = keras.Sequential( + [ + layers.Resizing(IMAGE_SIZE, IMAGE_SIZE), + layers.RandomFlip("horizontal"), + layers.RandomRotation(factor=0.02), + layers.RandomZoom(height_factor=0.2, width_factor=0.2), + ] +) + +# Now, map the augmentation pipeline to our training dataset +train_ds_simple = ( + tf.data.Dataset.from_tensor_slices((x_train, y_train)) + .shuffle(BATCH_SIZE * 100) + .batch(BATCH_SIZE) + .map(lambda x, y: (simple_aug(x), y), num_parallel_calls=AUTO) + .prefetch(AUTO) +) + +""" +## Visualize the dataset augmented with RandAugment +""" + +sample_images, _ = next(iter(train_ds_rand)) +plt.figure(figsize=(10, 10)) +for i, image in enumerate(sample_images[:9]): + ax = plt.subplot(3, 3, i + 1) + plt.imshow(image.numpy().astype("int")) + plt.axis("off") + +""" +You are encouraged to run the above code block a couple of times to see different +variations. +""" + +""" +## Visualize the dataset augmented with `simple_aug` +""" + +sample_images, _ = next(iter(train_ds_simple)) +plt.figure(figsize=(10, 10)) +for i, image in enumerate(sample_images[:9]): + ax = plt.subplot(3, 3, i + 1) + plt.imshow(image.numpy().astype("int")) + plt.axis("off") + +""" +## Define a model building utility function + +Now, we define a CNN model that is based on the +[ResNet50V2 architecture](https://arxiv.org/abs/1603.05027). Also, +notice that the network already has a rescaling layer inside it. This eliminates the need +to do any separate preprocessing on our dataset and is specifically very useful for +deployment purposes. +""" + + +def get_training_model(): + resnet50_v2 = keras.applications.ResNet50V2( + weights=None, + include_top=True, + input_shape=(IMAGE_SIZE, IMAGE_SIZE, 3), + classes=10, + ) + model = keras.Sequential( + [ + layers.Input((IMAGE_SIZE, IMAGE_SIZE, 3)), + layers.Rescaling(scale=1.0 / 127.5, offset=-1), + resnet50_v2, + ] + ) + return model + + +get_training_model().summary() + + +""" +We will train this network on two different versions of our dataset: + +* One augmented with RandAugment. +* Another one augmented with `simple_aug`. + +Since RandAugment is known to enhance the robustness of models to common perturbations +and corruptions, we will also evaluate our models on the CIFAR-10-C dataset, proposed in +[Benchmarking Neural Network Robustness to Common Corruptions and Perturbations](https://arxiv.org/abs/1903.12261) +by Hendrycks et al. The CIFAR-10-C dataset +consists of 19 different image corruptions and perturbations (for example speckle noise, +fog, Gaussian blur, etc.) that too at varying severity levels. 
For this example we will +be using the following configuration: +[`cifar10_corrupted/saturate_5`](https://www.tensorflow.org/datasets/catalog/cifar10_corrupted#cifar10_corruptedsaturate_5). +The images from this configuration look like so: + +![](https://storage.googleapis.com/tfds-data/visualization/fig/cifar10_corrupted-saturate_5-1.0.0.png) + +In the interest of reproducibility, we serialize the initial random weights of our shallow +network. +""" + +initial_model = get_training_model() +initial_model.save_weights("initial.weights.h5") + +""" +## Train model with RandAugment +""" + +rand_aug_model = get_training_model() +rand_aug_model.load_weights("initial.weights.h5") +rand_aug_model.compile( + loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"] +) +rand_aug_model.fit(train_ds_rand, validation_data=test_ds, epochs=EPOCHS) +_, test_acc = rand_aug_model.evaluate(test_ds) +print("Test accuracy: {:.2f}%".format(test_acc * 100)) + +""" +## Train model with `simple_aug` +""" + +simple_aug_model = get_training_model() +simple_aug_model.load_weights("initial.weights.h5") +simple_aug_model.compile( + loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"] +) +simple_aug_model.fit(train_ds_simple, validation_data=test_ds, epochs=EPOCHS) +_, test_acc = simple_aug_model.evaluate(test_ds) +print("Test accuracy: {:.2f}%".format(test_acc * 100)) + +""" +## Load the CIFAR-10-C dataset and evaluate performance +""" + +# Load and prepare the CIFAR-10-C dataset +# (If it's not already downloaded, it takes ~10 minutes of time to download) +cifar_10_c = tfds.load("cifar10_corrupted/saturate_5", split="test", as_supervised=True) +cifar_10_c = cifar_10_c.batch(BATCH_SIZE).map( + lambda x, y: (tf.image.resize(x, (IMAGE_SIZE, IMAGE_SIZE)), y), + num_parallel_calls=AUTO, +) + +# Evaluate `rand_aug_model` +_, test_acc = rand_aug_model.evaluate(cifar_10_c, verbose=0) +print( + "Accuracy with RandAugment on CIFAR-10-C (saturate_5): {:.2f}%".format( + test_acc * 100 + ) +) + +# Evaluate `simple_aug_model` +_, test_acc = simple_aug_model.evaluate(cifar_10_c, verbose=0) +print( + "Accuracy with simple_aug on CIFAR-10-C (saturate_5): {:.2f}%".format( + test_acc * 100 + ) +) + +""" +For the purpose of this example, we trained the models for only a single epoch. On the +CIFAR-10-C dataset, the model with RandAugment can perform better with a higher accuracy +(for example, 76.64% in one experiment) compared with the model trained with `simple_aug` +(e.g., 64.80%). RandAugment can also help stabilize the training. + +In the notebook, you may notice that, at the expense of increased training time with RandAugment, +we are able to carve out far better performance on the CIFAR-10-C dataset. You can +experiment on the other corruption and perturbation settings that come with the +run the same CIFAR-10-C dataset and see if RandAugment helps. + +You can also experiment with the different values of `n` and `m` in the `RandAugment` +object. In the [original paper](https://arxiv.org/abs/1909.13719), the authors show +the impact of the individual augmentation transforms for a particular task and a range of +ablation studies. You are welcome to check them out. + +RandAugment has shown great progress in improving the robustness of deep models for +computer vision as shown in works like [Noisy Student Training](https://arxiv.org/abs/1911.04252) and +[FixMatch](https://arxiv.org/abs/2001.07685). This makes RandAugment quite a useful +recipe for training different vision models. 
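+
+As a concrete starting point for the `n`/`m` experiments mentioned above, trying a
+different policy only requires re-instantiating the layer (a sketch: in the `keras_cv`
+layer used here, `augmentations_per_image` and `magnitude` play those roles) and
+rebuilding `train_ds_rand` with it:
+
+```python
+lighter_rand_augment = keras_cv.layers.RandAugment(
+    value_range=(0, 255), augmentations_per_image=2, magnitude=0.5
+)
+```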
+ +You can use the trained model hosted on [Hugging Face Hub](https://huggingface.co/keras-io/randaugment) +and try the demo on [Hugging Face Spaces](https://huggingface.co/spaces/keras-io/randaugment). +""" diff --git a/knowledge_base/vision/reptile.py b/knowledge_base/vision/reptile.py new file mode 100644 index 0000000000000000000000000000000000000000..184e4b81771df47fbe1d8b8632c7f8f030c8a019 --- /dev/null +++ b/knowledge_base/vision/reptile.py @@ -0,0 +1,296 @@ +""" +Title: Few-Shot learning with Reptile +Author: [ADMoreau](https://github.com/ADMoreau) +Date created: 2020/05/21 +Last modified: 2023/07/20 +Description: Few-shot classification on the Omniglot dataset using Reptile. +Accelerator: GPU +Converted to Keras 3 By: [Muhammad Anas Raza](https://anasrz.com) +""" + +""" +## Introduction + +The [Reptile](https://arxiv.org/abs/1803.02999) algorithm was developed by OpenAI to +perform model-agnostic meta-learning. Specifically, this algorithm was designed to +quickly learn to perform new tasks with minimal training (few-shot learning). +The algorithm works by performing Stochastic Gradient Descent using the +difference between weights trained on a mini-batch of never-seen-before data and the +model weights prior to training over a fixed number of meta-iterations. +""" +import os + +os.environ["KERAS_BACKEND"] = "tensorflow" + +import keras +from keras import layers + +import matplotlib.pyplot as plt +import numpy as np +import random +import tensorflow as tf +import tensorflow_datasets as tfds + +""" +## Define the Hyperparameters +""" + +learning_rate = 0.003 +meta_step_size = 0.25 + +inner_batch_size = 25 +eval_batch_size = 25 + +meta_iters = 2000 +eval_iters = 5 +inner_iters = 4 + +eval_interval = 1 +train_shots = 20 +shots = 5 +classes = 5 + +""" +## Prepare the data + +The [Omniglot dataset](https://github.com/brendenlake/omniglot/) is a dataset of 1,623 +characters taken from 50 different alphabets, with 20 examples for each character. +The 20 samples for each character were drawn online via Amazon's Mechanical Turk. For the +few-shot learning task, `k` samples (or "shots") are drawn randomly from `n` randomly-chosen +classes. These `n` numerical values are used to create a new set of temporary labels to use +to test the model's ability to learn a new task given few examples. In other words, if you +are training on 5 classes, your new class labels will be either 0, 1, 2, 3, or 4. +Omniglot is a great dataset for this task since there are many different classes to draw +from, with a reasonable number of samples for each class. +""" + + +class Dataset: + # This class will facilitate the creation of a few-shot dataset + # from the Omniglot dataset that can be sampled from quickly while also + # allowing to create new labels at the same time. + def __init__(self, training): + # Download the tfrecord files containing the omniglot data and convert to a + # dataset. + split = "train" if training else "test" + ds = tfds.load("omniglot", split=split, as_supervised=True, shuffle_files=False) + # Iterate over the dataset to get each individual image and its class, + # and put that data into a dictionary. 
+ self.data = {} + + def extraction(image, label): + # This function will shrink the Omniglot images to the desired size, + # scale pixel values and convert the RGB image to grayscale + image = tf.image.convert_image_dtype(image, tf.float32) + image = tf.image.rgb_to_grayscale(image) + image = tf.image.resize(image, [28, 28]) + return image, label + + for image, label in ds.map(extraction): + image = image.numpy() + label = str(label.numpy()) + if label not in self.data: + self.data[label] = [] + self.data[label].append(image) + self.labels = list(self.data.keys()) + + def get_mini_dataset( + self, batch_size, repetitions, shots, num_classes, split=False + ): + temp_labels = np.zeros(shape=(num_classes * shots)) + temp_images = np.zeros(shape=(num_classes * shots, 28, 28, 1)) + if split: + test_labels = np.zeros(shape=(num_classes)) + test_images = np.zeros(shape=(num_classes, 28, 28, 1)) + + # Get a random subset of labels from the entire label set. + label_subset = random.choices(self.labels, k=num_classes) + for class_idx, class_obj in enumerate(label_subset): + # Use enumerated index value as a temporary label for mini-batch in + # few shot learning. + temp_labels[class_idx * shots : (class_idx + 1) * shots] = class_idx + # If creating a split dataset for testing, select an extra sample from each + # label to create the test dataset. + if split: + test_labels[class_idx] = class_idx + images_to_split = random.choices( + self.data[label_subset[class_idx]], k=shots + 1 + ) + test_images[class_idx] = images_to_split[-1] + temp_images[class_idx * shots : (class_idx + 1) * shots] = ( + images_to_split[:-1] + ) + else: + # For each index in the randomly selected label_subset, sample the + # necessary number of images. + temp_images[class_idx * shots : (class_idx + 1) * shots] = ( + random.choices(self.data[label_subset[class_idx]], k=shots) + ) + + dataset = tf.data.Dataset.from_tensor_slices( + (temp_images.astype(np.float32), temp_labels.astype(np.int32)) + ) + dataset = dataset.shuffle(100).batch(batch_size).repeat(repetitions) + if split: + return dataset, test_images, test_labels + return dataset + + +import urllib3 + +urllib3.disable_warnings() # Disable SSL warnings that may happen during download. 
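+
+"""
+Once the two `Dataset` objects below are built, a single few-shot episode can be
+sampled like this (a sketch; the shapes follow from the constants defined earlier,
+e.g. `inner_batch_size = 25` and 28 x 28 grayscale images):
+
+```python
+episode = train_dataset.get_mini_dataset(
+    inner_batch_size, inner_iters, train_shots, classes
+)
+images, labels = next(iter(episode))
+print(images.shape, labels.shape)  # (25, 28, 28, 1) (25,)
+```
+"""
+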
+train_dataset = Dataset(training=True) +test_dataset = Dataset(training=False) + +""" +## Visualize some examples from the dataset +""" + +_, axarr = plt.subplots(nrows=5, ncols=5, figsize=(20, 20)) + +sample_keys = list(train_dataset.data.keys()) + +for a in range(5): + for b in range(5): + temp_image = train_dataset.data[sample_keys[a]][b] + temp_image = np.stack((temp_image[:, :, 0],) * 3, axis=2) + temp_image *= 255 + temp_image = np.clip(temp_image, 0, 255).astype("uint8") + if b == 2: + axarr[a, b].set_title("Class : " + sample_keys[a]) + axarr[a, b].imshow(temp_image, cmap="gray") + axarr[a, b].xaxis.set_visible(False) + axarr[a, b].yaxis.set_visible(False) +plt.show() + +""" +## Build the model +""" + + +def conv_bn(x): + x = layers.Conv2D(filters=64, kernel_size=3, strides=2, padding="same")(x) + x = layers.BatchNormalization()(x) + return layers.ReLU()(x) + + +inputs = layers.Input(shape=(28, 28, 1)) +x = conv_bn(inputs) +x = conv_bn(x) +x = conv_bn(x) +x = conv_bn(x) +x = layers.Flatten()(x) +outputs = layers.Dense(classes, activation="softmax")(x) +model = keras.Model(inputs=inputs, outputs=outputs) +model.compile() +optimizer = keras.optimizers.SGD(learning_rate=learning_rate) + +""" +## Train the model +""" + +training = [] +testing = [] +for meta_iter in range(meta_iters): + frac_done = meta_iter / meta_iters + cur_meta_step_size = (1 - frac_done) * meta_step_size + # Temporarily save the weights from the model. + old_vars = model.get_weights() + # Get a sample from the full dataset. + mini_dataset = train_dataset.get_mini_dataset( + inner_batch_size, inner_iters, train_shots, classes + ) + for images, labels in mini_dataset: + with tf.GradientTape() as tape: + preds = model(images) + loss = keras.losses.sparse_categorical_crossentropy(labels, preds) + grads = tape.gradient(loss, model.trainable_weights) + optimizer.apply_gradients(zip(grads, model.trainable_weights)) + new_vars = model.get_weights() + # Perform SGD for the meta step. + for var in range(len(new_vars)): + new_vars[var] = old_vars[var] + ( + (new_vars[var] - old_vars[var]) * cur_meta_step_size + ) + # After the meta-learning step, reload the newly-trained weights into the model. + model.set_weights(new_vars) + # Evaluation loop + if meta_iter % eval_interval == 0: + accuracies = [] + for dataset in (train_dataset, test_dataset): + # Sample a mini dataset from the full dataset. + train_set, test_images, test_labels = dataset.get_mini_dataset( + eval_batch_size, eval_iters, shots, classes, split=True + ) + old_vars = model.get_weights() + # Train on the samples and get the resulting accuracies. + for images, labels in train_set: + with tf.GradientTape() as tape: + preds = model(images) + loss = keras.losses.sparse_categorical_crossentropy(labels, preds) + grads = tape.gradient(loss, model.trainable_weights) + optimizer.apply_gradients(zip(grads, model.trainable_weights)) + test_preds = model.predict(test_images, verbose=0) + test_preds = tf.argmax(test_preds).numpy() + num_correct = (test_preds == test_labels).sum() + # Reset the weights after getting the evaluation accuracies. + model.set_weights(old_vars) + accuracies.append(num_correct / classes) + training.append(accuracies[0]) + testing.append(accuracies[1]) + if meta_iter % 100 == 0: + print( + "batch %d: train=%f test=%f" % (meta_iter, accuracies[0], accuracies[1]) + ) + +""" +## Visualize Results +""" + +# First, some preprocessing to smooth the training and testing arrays for display. 
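+# The accuracy curves are reflected at both ends (to avoid edge artifacts) and then
+# convolved with a normalized Hamming window, i.e. a simple moving-average style
+# low-pass filter.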
+window_length = 100 +train_s = np.r_[ + training[window_length - 1 : 0 : -1], + training, + training[-1:-window_length:-1], +] +test_s = np.r_[ + testing[window_length - 1 : 0 : -1], testing, testing[-1:-window_length:-1] +] +w = np.hamming(window_length) +train_y = np.convolve(w / w.sum(), train_s, mode="valid") +test_y = np.convolve(w / w.sum(), test_s, mode="valid") + +# Display the training accuracies. +x = np.arange(0, len(test_y), 1) +plt.plot(x, test_y, x, train_y) +plt.legend(["test", "train"]) +plt.grid() + +train_set, test_images, test_labels = dataset.get_mini_dataset( + eval_batch_size, eval_iters, shots, classes, split=True +) +for images, labels in train_set: + with tf.GradientTape() as tape: + preds = model(images) + loss = keras.losses.sparse_categorical_crossentropy(labels, preds) + grads = tape.gradient(loss, model.trainable_weights) + optimizer.apply_gradients(zip(grads, model.trainable_weights)) +test_preds = model.predict(test_images) +test_preds = tf.argmax(test_preds).numpy() + +_, axarr = plt.subplots(nrows=1, ncols=5, figsize=(20, 20)) + +sample_keys = list(train_dataset.data.keys()) + +for i, ax in zip(range(5), axarr): + temp_image = np.stack((test_images[i, :, :, 0],) * 3, axis=2) + temp_image *= 255 + temp_image = np.clip(temp_image, 0, 255).astype("uint8") + ax.set_title( + "Label : {}, Prediction : {}".format(int(test_labels[i]), test_preds[i]) + ) + ax.imshow(temp_image, cmap="gray") + ax.xaxis.set_visible(False) + ax.yaxis.set_visible(False) +plt.show() diff --git a/knowledge_base/vision/retinanet.py b/knowledge_base/vision/retinanet.py new file mode 100644 index 0000000000000000000000000000000000000000..c27685616f19f070940f81f79b3feb4077eb3b97 --- /dev/null +++ b/knowledge_base/vision/retinanet.py @@ -0,0 +1,984 @@ +""" +Title: Object Detection with RetinaNet +Author: [Srihari Humbarwadi](https://twitter.com/srihari_rh) +Date created: 2020/05/17 +Last modified: 2023/07/10 +Description: Implementing RetinaNet: Focal Loss for Dense Object Detection. +Accelerator: GPU +""" + +""" +## Introduction + +Object detection a very important problem in computer +vision. Here the model is tasked with localizing the objects present in an +image, and at the same time, classifying them into different categories. +Object detection models can be broadly classified into "single-stage" and +"two-stage" detectors. Two-stage detectors are often more accurate but at the +cost of being slower. Here in this example, we will implement RetinaNet, +a popular single-stage detector, which is accurate and runs fast. +RetinaNet uses a feature pyramid network to efficiently detect objects at +multiple scales and introduces a new loss, the Focal loss function, to alleviate +the problem of the extreme foreground-background class imbalance. + +**References:** + +- [RetinaNet Paper](https://arxiv.org/abs/1708.02002) +- [Feature Pyramid Network Paper](https://arxiv.org/abs/1612.03144) +""" + + +import os +import re +import zipfile + +import numpy as np +import tensorflow as tf +from tensorflow import keras + +import matplotlib.pyplot as plt +import tensorflow_datasets as tfds + + +""" +## Downloading the COCO2017 dataset + +Training on the entire COCO2017 dataset which has around 118k images takes a +lot of time, hence we will be using a smaller subset of ~500 images for +training in this example. 
+""" + +url = "https://github.com/srihari-humbarwadi/datasets/releases/download/v0.1.0/data.zip" +filename = os.path.join(os.getcwd(), "data.zip") +keras.utils.get_file(filename, url) + + +with zipfile.ZipFile("data.zip", "r") as z_fp: + z_fp.extractall("./") + + +""" +## Implementing utility functions + +Bounding boxes can be represented in multiple ways, the most common formats are: + +- Storing the coordinates of the corners `[xmin, ymin, xmax, ymax]` +- Storing the coordinates of the center and the box dimensions +`[x, y, width, height]` + +Since we require both formats, we will be implementing functions for converting +between the formats. +""" + + +def swap_xy(boxes): + """Swaps order the of x and y coordinates of the boxes. + + Arguments: + boxes: A tensor with shape `(num_boxes, 4)` representing bounding boxes. + + Returns: + swapped boxes with shape same as that of boxes. + """ + return tf.stack([boxes[:, 1], boxes[:, 0], boxes[:, 3], boxes[:, 2]], axis=-1) + + +def convert_to_xywh(boxes): + """Changes the box format to center, width and height. + + Arguments: + boxes: A tensor of rank 2 or higher with a shape of `(..., num_boxes, 4)` + representing bounding boxes where each box is of the format + `[xmin, ymin, xmax, ymax]`. + + Returns: + converted boxes with shape same as that of boxes. + """ + return tf.concat( + [(boxes[..., :2] + boxes[..., 2:]) / 2.0, boxes[..., 2:] - boxes[..., :2]], + axis=-1, + ) + + +def convert_to_corners(boxes): + """Changes the box format to corner coordinates + + Arguments: + boxes: A tensor of rank 2 or higher with a shape of `(..., num_boxes, 4)` + representing bounding boxes where each box is of the format + `[x, y, width, height]`. + + Returns: + converted boxes with shape same as that of boxes. + """ + return tf.concat( + [boxes[..., :2] - boxes[..., 2:] / 2.0, boxes[..., :2] + boxes[..., 2:] / 2.0], + axis=-1, + ) + + +""" +## Computing pairwise Intersection Over Union (IOU) + +As we will see later in the example, we would be assigning ground truth boxes +to anchor boxes based on the extent of overlapping. This will require us to +calculate the Intersection Over Union (IOU) between all the anchor +boxes and ground truth boxes pairs. +""" + + +def compute_iou(boxes1, boxes2): + """Computes pairwise IOU matrix for given two sets of boxes + + Arguments: + boxes1: A tensor with shape `(N, 4)` representing bounding boxes + where each box is of the format `[x, y, width, height]`. + boxes2: A tensor with shape `(M, 4)` representing bounding boxes + where each box is of the format `[x, y, width, height]`. + + Returns: + pairwise IOU matrix with shape `(N, M)`, where the value at ith row + jth column holds the IOU between ith box and jth box from + boxes1 and boxes2 respectively. 
+ """ + boxes1_corners = convert_to_corners(boxes1) + boxes2_corners = convert_to_corners(boxes2) + lu = tf.maximum(boxes1_corners[:, None, :2], boxes2_corners[:, :2]) + rd = tf.minimum(boxes1_corners[:, None, 2:], boxes2_corners[:, 2:]) + intersection = tf.maximum(0.0, rd - lu) + intersection_area = intersection[:, :, 0] * intersection[:, :, 1] + boxes1_area = boxes1[:, 2] * boxes1[:, 3] + boxes2_area = boxes2[:, 2] * boxes2[:, 3] + union_area = tf.maximum( + boxes1_area[:, None] + boxes2_area - intersection_area, 1e-8 + ) + return tf.clip_by_value(intersection_area / union_area, 0.0, 1.0) + + +def visualize_detections( + image, boxes, classes, scores, figsize=(7, 7), linewidth=1, color=[0, 0, 1] +): + """Visualize Detections""" + image = np.array(image, dtype=np.uint8) + plt.figure(figsize=figsize) + plt.axis("off") + plt.imshow(image) + ax = plt.gca() + for box, _cls, score in zip(boxes, classes, scores): + text = "{}: {:.2f}".format(_cls, score) + x1, y1, x2, y2 = box + w, h = x2 - x1, y2 - y1 + patch = plt.Rectangle( + [x1, y1], w, h, fill=False, edgecolor=color, linewidth=linewidth + ) + ax.add_patch(patch) + ax.text( + x1, + y1, + text, + bbox={"facecolor": color, "alpha": 0.4}, + clip_box=ax.clipbox, + clip_on=True, + ) + plt.show() + return ax + + +""" +## Implementing Anchor generator + +Anchor boxes are fixed sized boxes that the model uses to predict the bounding +box for an object. It does this by regressing the offset between the location +of the object's center and the center of an anchor box, and then uses the width +and height of the anchor box to predict a relative scale of the object. In the +case of RetinaNet, each location on a given feature map has nine anchor boxes +(at three scales and three ratios). +""" + + +class AnchorBox: + """Generates anchor boxes. + + This class has operations to generate anchor boxes for feature maps at + strides `[8, 16, 32, 64, 128]`. Where each anchor each box is of the + format `[x, y, width, height]`. + + Attributes: + aspect_ratios: A list of float values representing the aspect ratios of + the anchor boxes at each location on the feature map + scales: A list of float values representing the scale of the anchor boxes + at each location on the feature map. + num_anchors: The number of anchor boxes at each location on feature map + areas: A list of float values representing the areas of the anchor + boxes for each feature map in the feature pyramid. + strides: A list of float value representing the strides for each feature + map in the feature pyramid. + """ + + def __init__(self): + self.aspect_ratios = [0.5, 1.0, 2.0] + self.scales = [2**x for x in [0, 1 / 3, 2 / 3]] + + self._num_anchors = len(self.aspect_ratios) * len(self.scales) + self._strides = [2**i for i in range(3, 8)] + self._areas = [x**2 for x in [32.0, 64.0, 128.0, 256.0, 512.0]] + self._anchor_dims = self._compute_dims() + + def _compute_dims(self): + """Computes anchor box dimensions for all ratios and scales at all levels + of the feature pyramid. 
+ """ + anchor_dims_all = [] + for area in self._areas: + anchor_dims = [] + for ratio in self.aspect_ratios: + anchor_height = tf.math.sqrt(area / ratio) + anchor_width = area / anchor_height + dims = tf.reshape( + tf.stack([anchor_width, anchor_height], axis=-1), [1, 1, 2] + ) + for scale in self.scales: + anchor_dims.append(scale * dims) + anchor_dims_all.append(tf.stack(anchor_dims, axis=-2)) + return anchor_dims_all + + def _get_anchors(self, feature_height, feature_width, level): + """Generates anchor boxes for a given feature map size and level + + Arguments: + feature_height: An integer representing the height of the feature map. + feature_width: An integer representing the width of the feature map. + level: An integer representing the level of the feature map in the + feature pyramid. + + Returns: + anchor boxes with the shape + `(feature_height * feature_width * num_anchors, 4)` + """ + rx = tf.range(feature_width, dtype=tf.float32) + 0.5 + ry = tf.range(feature_height, dtype=tf.float32) + 0.5 + centers = tf.stack(tf.meshgrid(rx, ry), axis=-1) * self._strides[level - 3] + centers = tf.expand_dims(centers, axis=-2) + centers = tf.tile(centers, [1, 1, self._num_anchors, 1]) + dims = tf.tile( + self._anchor_dims[level - 3], [feature_height, feature_width, 1, 1] + ) + anchors = tf.concat([centers, dims], axis=-1) + return tf.reshape( + anchors, [feature_height * feature_width * self._num_anchors, 4] + ) + + def get_anchors(self, image_height, image_width): + """Generates anchor boxes for all the feature maps of the feature pyramid. + + Arguments: + image_height: Height of the input image. + image_width: Width of the input image. + + Returns: + anchor boxes for all the feature maps, stacked as a single tensor + with shape `(total_anchors, 4)` + """ + anchors = [ + self._get_anchors( + tf.math.ceil(image_height / 2**i), + tf.math.ceil(image_width / 2**i), + i, + ) + for i in range(3, 8) + ] + return tf.concat(anchors, axis=0) + + +""" +## Preprocessing data + +Preprocessing the images involves two steps: + +- Resizing the image: Images are resized such that the shortest size is equal +to 800 px, after resizing if the longest side of the image exceeds 1333 px, +the image is resized such that the longest size is now capped at 1333 px. +- Applying augmentation: Random scale jittering and random horizontal flipping +are the only augmentations applied to the images. + +Along with the images, bounding boxes are rescaled and flipped if required. +""" + + +def random_flip_horizontal(image, boxes): + """Flips image and boxes horizontally with 50% chance + + Arguments: + image: A 3-D tensor of shape `(height, width, channels)` representing an + image. + boxes: A tensor with shape `(num_boxes, 4)` representing bounding boxes, + having normalized coordinates. + + Returns: + Randomly flipped image and boxes + """ + if tf.random.uniform(()) > 0.5: + image = tf.image.flip_left_right(image) + boxes = tf.stack( + [1 - boxes[:, 2], boxes[:, 1], 1 - boxes[:, 0], boxes[:, 3]], axis=-1 + ) + return image, boxes + + +def resize_and_pad_image( + image, min_side=800.0, max_side=1333.0, jitter=[640, 1024], stride=128.0 +): + """Resizes and pads image while preserving aspect ratio. + + 1. Resizes images so that the shorter side is equal to `min_side` + 2. If the longer side is greater than `max_side`, then resize the image + with longer side equal to `max_side` + 3. 
Pad with zeros on right and bottom to make the image shape divisible by + `stride` + + Arguments: + image: A 3-D tensor of shape `(height, width, channels)` representing an + image. + min_side: The shorter side of the image is resized to this value, if + `jitter` is set to None. + max_side: If the longer side of the image exceeds this value after + resizing, the image is resized such that the longer side now equals to + this value. + jitter: A list of floats containing minimum and maximum size for scale + jittering. If available, the shorter side of the image will be + resized to a random value in this range. + stride: The stride of the smallest feature map in the feature pyramid. + Can be calculated using `image_size / feature_map_size`. + + Returns: + image: Resized and padded image. + image_shape: Shape of the image before padding. + ratio: The scaling factor used to resize the image + """ + image_shape = tf.cast(tf.shape(image)[:2], dtype=tf.float32) + if jitter is not None: + min_side = tf.random.uniform((), jitter[0], jitter[1], dtype=tf.float32) + ratio = min_side / tf.reduce_min(image_shape) + if ratio * tf.reduce_max(image_shape) > max_side: + ratio = max_side / tf.reduce_max(image_shape) + image_shape = ratio * image_shape + image = tf.image.resize(image, tf.cast(image_shape, dtype=tf.int32)) + padded_image_shape = tf.cast( + tf.math.ceil(image_shape / stride) * stride, dtype=tf.int32 + ) + image = tf.image.pad_to_bounding_box( + image, 0, 0, padded_image_shape[0], padded_image_shape[1] + ) + return image, image_shape, ratio + + +def preprocess_data(sample): + """Applies preprocessing step to a single sample + + Arguments: + sample: A dict representing a single training sample. + + Returns: + image: Resized and padded image with random horizontal flipping applied. + bbox: Bounding boxes with the shape `(num_objects, 4)` where each box is + of the format `[x, y, width, height]`. + class_id: An tensor representing the class id of the objects, having + shape `(num_objects,)`. + """ + image = sample["image"] + bbox = swap_xy(sample["objects"]["bbox"]) + class_id = tf.cast(sample["objects"]["label"], dtype=tf.int32) + + image, bbox = random_flip_horizontal(image, bbox) + image, image_shape, _ = resize_and_pad_image(image) + + bbox = tf.stack( + [ + bbox[:, 0] * image_shape[1], + bbox[:, 1] * image_shape[0], + bbox[:, 2] * image_shape[1], + bbox[:, 3] * image_shape[0], + ], + axis=-1, + ) + bbox = convert_to_xywh(bbox) + return image, bbox, class_id + + +""" +## Encoding labels + +The raw labels, consisting of bounding boxes and class ids need to be +transformed into targets for training. This transformation consists of +the following steps: + +- Generating anchor boxes for the given image dimensions +- Assigning ground truth boxes to the anchor boxes +- The anchor boxes that are not assigned any objects, are either assigned the +background class or ignored depending on the IOU +- Generating the classification and regression targets using anchor boxes +""" + + +class LabelEncoder: + """Transforms the raw labels into targets for training. + + This class has operations to generate targets for a batch of samples which + is made up of the input images, bounding boxes for the objects present and + their class ids. + + Attributes: + anchor_box: Anchor box generator to encode the bounding boxes. + box_variance: The scaling factors used to scale the bounding box targets. 
+ """ + + def __init__(self): + self._anchor_box = AnchorBox() + self._box_variance = tf.convert_to_tensor( + [0.1, 0.1, 0.2, 0.2], dtype=tf.float32 + ) + + def _match_anchor_boxes( + self, anchor_boxes, gt_boxes, match_iou=0.5, ignore_iou=0.4 + ): + """Matches ground truth boxes to anchor boxes based on IOU. + + 1. Calculates the pairwise IOU for the M `anchor_boxes` and N `gt_boxes` + to get a `(M, N)` shaped matrix. + 2. The ground truth box with the maximum IOU in each row is assigned to + the anchor box provided the IOU is greater than `match_iou`. + 3. If the maximum IOU in a row is less than `ignore_iou`, the anchor + box is assigned with the background class. + 4. The remaining anchor boxes that do not have any class assigned are + ignored during training. + + Arguments: + anchor_boxes: A float tensor with the shape `(total_anchors, 4)` + representing all the anchor boxes for a given input image shape, + where each anchor box is of the format `[x, y, width, height]`. + gt_boxes: A float tensor with shape `(num_objects, 4)` representing + the ground truth boxes, where each box is of the format + `[x, y, width, height]`. + match_iou: A float value representing the minimum IOU threshold for + determining if a ground truth box can be assigned to an anchor box. + ignore_iou: A float value representing the IOU threshold under which + an anchor box is assigned to the background class. + + Returns: + matched_gt_idx: Index of the matched object + positive_mask: A mask for anchor boxes that have been assigned ground + truth boxes. + ignore_mask: A mask for anchor boxes that need to by ignored during + training + """ + iou_matrix = compute_iou(anchor_boxes, gt_boxes) + max_iou = tf.reduce_max(iou_matrix, axis=1) + matched_gt_idx = tf.argmax(iou_matrix, axis=1) + positive_mask = tf.greater_equal(max_iou, match_iou) + negative_mask = tf.less(max_iou, ignore_iou) + ignore_mask = tf.logical_not(tf.logical_or(positive_mask, negative_mask)) + return ( + matched_gt_idx, + tf.cast(positive_mask, dtype=tf.float32), + tf.cast(ignore_mask, dtype=tf.float32), + ) + + def _compute_box_target(self, anchor_boxes, matched_gt_boxes): + """Transforms the ground truth boxes into targets for training""" + box_target = tf.concat( + [ + (matched_gt_boxes[:, :2] - anchor_boxes[:, :2]) / anchor_boxes[:, 2:], + tf.math.log(matched_gt_boxes[:, 2:] / anchor_boxes[:, 2:]), + ], + axis=-1, + ) + box_target = box_target / self._box_variance + return box_target + + def _encode_sample(self, image_shape, gt_boxes, cls_ids): + """Creates box and classification targets for a single sample""" + anchor_boxes = self._anchor_box.get_anchors(image_shape[1], image_shape[2]) + cls_ids = tf.cast(cls_ids, dtype=tf.float32) + matched_gt_idx, positive_mask, ignore_mask = self._match_anchor_boxes( + anchor_boxes, gt_boxes + ) + matched_gt_boxes = tf.gather(gt_boxes, matched_gt_idx) + box_target = self._compute_box_target(anchor_boxes, matched_gt_boxes) + matched_gt_cls_ids = tf.gather(cls_ids, matched_gt_idx) + cls_target = tf.where( + tf.not_equal(positive_mask, 1.0), -1.0, matched_gt_cls_ids + ) + cls_target = tf.where(tf.equal(ignore_mask, 1.0), -2.0, cls_target) + cls_target = tf.expand_dims(cls_target, axis=-1) + label = tf.concat([box_target, cls_target], axis=-1) + return label + + def encode_batch(self, batch_images, gt_boxes, cls_ids): + """Creates box and classification targets for a batch""" + images_shape = tf.shape(batch_images) + batch_size = images_shape[0] + + labels = tf.TensorArray(dtype=tf.float32, size=batch_size, 
dynamic_size=True) + for i in range(batch_size): + label = self._encode_sample(images_shape, gt_boxes[i], cls_ids[i]) + labels = labels.write(i, label) + batch_images = tf.keras.applications.resnet.preprocess_input(batch_images) + return batch_images, labels.stack() + + +""" +## Building the ResNet50 backbone + +RetinaNet uses a ResNet based backbone, using which a feature pyramid network +is constructed. In the example we use ResNet50 as the backbone, and return the +feature maps at strides 8, 16 and 32. +""" + + +def get_backbone(): + """Builds ResNet50 with pre-trained imagenet weights""" + backbone = keras.applications.ResNet50( + include_top=False, input_shape=[None, None, 3] + ) + c3_output, c4_output, c5_output = [ + backbone.get_layer(layer_name).output + for layer_name in ["conv3_block4_out", "conv4_block6_out", "conv5_block3_out"] + ] + return keras.Model( + inputs=[backbone.inputs], outputs=[c3_output, c4_output, c5_output] + ) + + +""" +## Building Feature Pyramid Network as a custom layer +""" + + +class FeaturePyramid(keras.layers.Layer): + """Builds the Feature Pyramid with the feature maps from the backbone. + + Attributes: + num_classes: Number of classes in the dataset. + backbone: The backbone to build the feature pyramid from. + Currently supports ResNet50 only. + """ + + def __init__(self, backbone=None, **kwargs): + super().__init__(name="FeaturePyramid", **kwargs) + self.backbone = backbone if backbone else get_backbone() + self.conv_c3_1x1 = keras.layers.Conv2D(256, 1, 1, "same") + self.conv_c4_1x1 = keras.layers.Conv2D(256, 1, 1, "same") + self.conv_c5_1x1 = keras.layers.Conv2D(256, 1, 1, "same") + self.conv_c3_3x3 = keras.layers.Conv2D(256, 3, 1, "same") + self.conv_c4_3x3 = keras.layers.Conv2D(256, 3, 1, "same") + self.conv_c5_3x3 = keras.layers.Conv2D(256, 3, 1, "same") + self.conv_c6_3x3 = keras.layers.Conv2D(256, 3, 2, "same") + self.conv_c7_3x3 = keras.layers.Conv2D(256, 3, 2, "same") + self.upsample_2x = keras.layers.UpSampling2D(2) + + def call(self, images, training=False): + c3_output, c4_output, c5_output = self.backbone(images, training=training) + p3_output = self.conv_c3_1x1(c3_output) + p4_output = self.conv_c4_1x1(c4_output) + p5_output = self.conv_c5_1x1(c5_output) + p4_output = p4_output + self.upsample_2x(p5_output) + p3_output = p3_output + self.upsample_2x(p4_output) + p3_output = self.conv_c3_3x3(p3_output) + p4_output = self.conv_c4_3x3(p4_output) + p5_output = self.conv_c5_3x3(p5_output) + p6_output = self.conv_c6_3x3(c5_output) + p7_output = self.conv_c7_3x3(tf.nn.relu(p6_output)) + return p3_output, p4_output, p5_output, p6_output, p7_output + + +""" +## Building the classification and box regression heads. +The RetinaNet model has separate heads for bounding box regression and +for predicting class probabilities for the objects. These heads are shared +between all the feature maps of the feature pyramid. +""" + + +def build_head(output_filters, bias_init): + """Builds the class/box predictions head. + + Arguments: + output_filters: Number of convolution filters in the final layer. + bias_init: Bias Initializer for the final convolution layer. + + Returns: + A keras sequential model representing either the classification + or the box regression head depending on `output_filters`. 
+ """ + head = keras.Sequential([keras.Input(shape=[None, None, 256])]) + kernel_init = tf.initializers.RandomNormal(0.0, 0.01) + for _ in range(4): + head.add( + keras.layers.Conv2D(256, 3, padding="same", kernel_initializer=kernel_init) + ) + head.add(keras.layers.ReLU()) + head.add( + keras.layers.Conv2D( + output_filters, + 3, + 1, + padding="same", + kernel_initializer=kernel_init, + bias_initializer=bias_init, + ) + ) + return head + + +""" +## Building RetinaNet using a subclassed model +""" + + +class RetinaNet(keras.Model): + """A subclassed Keras model implementing the RetinaNet architecture. + + Attributes: + num_classes: Number of classes in the dataset. + backbone: The backbone to build the feature pyramid from. + Currently supports ResNet50 only. + """ + + def __init__(self, num_classes, backbone=None, **kwargs): + super().__init__(name="RetinaNet", **kwargs) + self.fpn = FeaturePyramid(backbone) + self.num_classes = num_classes + + prior_probability = tf.constant_initializer(-np.log((1 - 0.01) / 0.01)) + self.cls_head = build_head(9 * num_classes, prior_probability) + self.box_head = build_head(9 * 4, "zeros") + + def call(self, image, training=False): + features = self.fpn(image, training=training) + N = tf.shape(image)[0] + cls_outputs = [] + box_outputs = [] + for feature in features: + box_outputs.append(tf.reshape(self.box_head(feature), [N, -1, 4])) + cls_outputs.append( + tf.reshape(self.cls_head(feature), [N, -1, self.num_classes]) + ) + cls_outputs = tf.concat(cls_outputs, axis=1) + box_outputs = tf.concat(box_outputs, axis=1) + return tf.concat([box_outputs, cls_outputs], axis=-1) + + +""" +## Implementing a custom layer to decode predictions +""" + + +class DecodePredictions(tf.keras.layers.Layer): + """A Keras layer that decodes predictions of the RetinaNet model. + + Attributes: + num_classes: Number of classes in the dataset + confidence_threshold: Minimum class probability, below which detections + are pruned. + nms_iou_threshold: IOU threshold for the NMS operation + max_detections_per_class: Maximum number of detections to retain per + class. + max_detections: Maximum number of detections to retain across all + classes. + box_variance: The scaling factors used to scale the bounding box + predictions. 
+ """ + + def __init__( + self, + num_classes=80, + confidence_threshold=0.05, + nms_iou_threshold=0.5, + max_detections_per_class=100, + max_detections=100, + box_variance=[0.1, 0.1, 0.2, 0.2], + **kwargs + ): + super().__init__(**kwargs) + self.num_classes = num_classes + self.confidence_threshold = confidence_threshold + self.nms_iou_threshold = nms_iou_threshold + self.max_detections_per_class = max_detections_per_class + self.max_detections = max_detections + + self._anchor_box = AnchorBox() + self._box_variance = tf.convert_to_tensor( + [0.1, 0.1, 0.2, 0.2], dtype=tf.float32 + ) + + def _decode_box_predictions(self, anchor_boxes, box_predictions): + boxes = box_predictions * self._box_variance + boxes = tf.concat( + [ + boxes[:, :, :2] * anchor_boxes[:, :, 2:] + anchor_boxes[:, :, :2], + tf.math.exp(boxes[:, :, 2:]) * anchor_boxes[:, :, 2:], + ], + axis=-1, + ) + boxes_transformed = convert_to_corners(boxes) + return boxes_transformed + + def call(self, images, predictions): + image_shape = tf.cast(tf.shape(images), dtype=tf.float32) + anchor_boxes = self._anchor_box.get_anchors(image_shape[1], image_shape[2]) + box_predictions = predictions[:, :, :4] + cls_predictions = tf.nn.sigmoid(predictions[:, :, 4:]) + boxes = self._decode_box_predictions(anchor_boxes[None, ...], box_predictions) + + return tf.image.combined_non_max_suppression( + tf.expand_dims(boxes, axis=2), + cls_predictions, + self.max_detections_per_class, + self.max_detections, + self.nms_iou_threshold, + self.confidence_threshold, + clip_boxes=False, + ) + + +""" +## Implementing Smooth L1 loss and Focal Loss as keras custom losses +""" + + +class RetinaNetBoxLoss(tf.losses.Loss): + """Implements Smooth L1 loss""" + + def __init__(self, delta): + super().__init__(reduction="none", name="RetinaNetBoxLoss") + self._delta = delta + + def call(self, y_true, y_pred): + difference = y_true - y_pred + absolute_difference = tf.abs(difference) + squared_difference = difference**2 + loss = tf.where( + tf.less(absolute_difference, self._delta), + 0.5 * squared_difference, + absolute_difference - 0.5, + ) + return tf.reduce_sum(loss, axis=-1) + + +class RetinaNetClassificationLoss(tf.losses.Loss): + """Implements Focal loss""" + + def __init__(self, alpha, gamma): + super().__init__(reduction="none", name="RetinaNetClassificationLoss") + self._alpha = alpha + self._gamma = gamma + + def call(self, y_true, y_pred): + cross_entropy = tf.nn.sigmoid_cross_entropy_with_logits( + labels=y_true, logits=y_pred + ) + probs = tf.nn.sigmoid(y_pred) + alpha = tf.where(tf.equal(y_true, 1.0), self._alpha, (1.0 - self._alpha)) + pt = tf.where(tf.equal(y_true, 1.0), probs, 1 - probs) + loss = alpha * tf.pow(1.0 - pt, self._gamma) * cross_entropy + return tf.reduce_sum(loss, axis=-1) + + +class RetinaNetLoss(tf.losses.Loss): + """Wrapper to combine both the losses""" + + def __init__(self, num_classes=80, alpha=0.25, gamma=2.0, delta=1.0): + super().__init__(reduction="auto", name="RetinaNetLoss") + self._clf_loss = RetinaNetClassificationLoss(alpha, gamma) + self._box_loss = RetinaNetBoxLoss(delta) + self._num_classes = num_classes + + def call(self, y_true, y_pred): + y_pred = tf.cast(y_pred, dtype=tf.float32) + box_labels = y_true[:, :, :4] + box_predictions = y_pred[:, :, :4] + cls_labels = tf.one_hot( + tf.cast(y_true[:, :, 4], dtype=tf.int32), + depth=self._num_classes, + dtype=tf.float32, + ) + cls_predictions = y_pred[:, :, 4:] + positive_mask = tf.cast(tf.greater(y_true[:, :, 4], -1.0), dtype=tf.float32) + ignore_mask = 
tf.cast(tf.equal(y_true[:, :, 4], -2.0), dtype=tf.float32) + clf_loss = self._clf_loss(cls_labels, cls_predictions) + box_loss = self._box_loss(box_labels, box_predictions) + clf_loss = tf.where(tf.equal(ignore_mask, 1.0), 0.0, clf_loss) + box_loss = tf.where(tf.equal(positive_mask, 1.0), box_loss, 0.0) + normalizer = tf.reduce_sum(positive_mask, axis=-1) + clf_loss = tf.math.divide_no_nan(tf.reduce_sum(clf_loss, axis=-1), normalizer) + box_loss = tf.math.divide_no_nan(tf.reduce_sum(box_loss, axis=-1), normalizer) + loss = clf_loss + box_loss + return loss + + +""" +## Setting up training parameters +""" + +model_dir = "retinanet/" +label_encoder = LabelEncoder() + +num_classes = 80 +batch_size = 2 + +learning_rates = [2.5e-06, 0.000625, 0.00125, 0.0025, 0.00025, 2.5e-05] +learning_rate_boundaries = [125, 250, 500, 240000, 360000] +learning_rate_fn = tf.optimizers.schedules.PiecewiseConstantDecay( + boundaries=learning_rate_boundaries, values=learning_rates +) + +""" +## Initializing and compiling model +""" + +resnet50_backbone = get_backbone() +loss_fn = RetinaNetLoss(num_classes) +model = RetinaNet(num_classes, resnet50_backbone) + +optimizer = tf.keras.optimizers.legacy.SGD(learning_rate=learning_rate_fn, momentum=0.9) +model.compile(loss=loss_fn, optimizer=optimizer) + +""" +## Setting up callbacks +""" + +callbacks_list = [ + tf.keras.callbacks.ModelCheckpoint( + filepath=os.path.join(model_dir, "weights" + "_epoch_{epoch}"), + monitor="loss", + save_best_only=False, + save_weights_only=True, + verbose=1, + ) +] + +""" +## Load the COCO2017 dataset using TensorFlow Datasets +""" + +# set `data_dir=None` to load the complete dataset + +(train_dataset, val_dataset), dataset_info = tfds.load( + "coco/2017", split=["train", "validation"], with_info=True, data_dir="data" +) + +""" +## Setting up a `tf.data` pipeline + +To ensure that the model is fed with data efficiently we will be using +`tf.data` API to create our input pipeline. The input pipeline +consists for the following major processing steps: + +- Apply the preprocessing function to the samples +- Create batches with fixed batch size. 
Since images in the batch can +have different dimensions, and can also have different number of +objects, we use `padded_batch` to the add the necessary padding to create +rectangular tensors +- Create targets for each sample in the batch using `LabelEncoder` +""" + +autotune = tf.data.AUTOTUNE +train_dataset = train_dataset.map(preprocess_data, num_parallel_calls=autotune) +train_dataset = train_dataset.shuffle(8 * batch_size) +train_dataset = train_dataset.padded_batch( + batch_size=batch_size, padding_values=(0.0, 1e-8, -1), drop_remainder=True +) +train_dataset = train_dataset.map( + label_encoder.encode_batch, num_parallel_calls=autotune +) +train_dataset = train_dataset.apply(tf.data.experimental.ignore_errors()) +train_dataset = train_dataset.prefetch(autotune) + +val_dataset = val_dataset.map(preprocess_data, num_parallel_calls=autotune) +val_dataset = val_dataset.padded_batch( + batch_size=1, padding_values=(0.0, 1e-8, -1), drop_remainder=True +) +val_dataset = val_dataset.map(label_encoder.encode_batch, num_parallel_calls=autotune) +val_dataset = val_dataset.apply(tf.data.experimental.ignore_errors()) +val_dataset = val_dataset.prefetch(autotune) + +""" +## Training the model +""" + +# Uncomment the following lines, when training on full dataset +# train_steps_per_epoch = dataset_info.splits["train"].num_examples // batch_size +# val_steps_per_epoch = \ +# dataset_info.splits["validation"].num_examples // batch_size + +# train_steps = 4 * 100000 +# epochs = train_steps // train_steps_per_epoch + +epochs = 1 + +# Running 100 training and 50 validation steps, +# remove `.take` when training on the full dataset + +model.fit( + train_dataset.take(100), + validation_data=val_dataset.take(50), + epochs=epochs, + callbacks=callbacks_list, + verbose=1, +) + +""" +## Loading weights +""" + +# Change this to `model_dir` when not using the downloaded weights +weights_dir = "data" + +latest_checkpoint = tf.train.latest_checkpoint(weights_dir) +model.load_weights(latest_checkpoint) + +""" +## Building inference model +""" + +image = tf.keras.Input(shape=[None, None, 3], name="image") +predictions = model(image, training=False) +detections = DecodePredictions(confidence_threshold=0.5)(image, predictions) +inference_model = tf.keras.Model(inputs=image, outputs=detections) + +""" +## Generating detections +""" + + +def prepare_image(image): + image, _, ratio = resize_and_pad_image(image, jitter=None) + image = tf.keras.applications.resnet.preprocess_input(image) + return tf.expand_dims(image, axis=0), ratio + + +val_dataset = tfds.load("coco/2017", split="validation", data_dir="data") +int2str = dataset_info.features["objects"]["label"].int2str + +for sample in val_dataset.take(2): + image = tf.cast(sample["image"], dtype=tf.float32) + input_image, ratio = prepare_image(image) + detections = inference_model.predict(input_image) + num_detections = detections.valid_detections[0] + class_names = [ + int2str(int(x)) for x in detections.nmsed_classes[0][:num_detections] + ] + visualize_detections( + image, + detections.nmsed_boxes[0][:num_detections] / ratio, + class_names, + detections.nmsed_scores[0][:num_detections], + ) + +""" +Example available on HuggingFace. 
+ +| Trained Model | Demo | +| :--: | :--: | +| [![Generic badge](https://img.shields.io/badge/%F0%9F%A4%97%20Model-Object%20Detection%20With%20Retinanet-black.svg)](https://huggingface.co/keras-io/Object-Detection-RetinaNet) | [![Generic badge](https://img.shields.io/badge/%F0%9F%A4%97%20Spaces-Object%20Detection%20With%20Retinanet-black.svg)](https://huggingface.co/spaces/keras-io/Object-Detection-Using-RetinaNet) | +""" diff --git a/knowledge_base/vision/semantic_image_clustering.py b/knowledge_base/vision/semantic_image_clustering.py new file mode 100644 index 0000000000000000000000000000000000000000..d70dfb4d5a04ce7f3f19c9cdfc440b082538a66f --- /dev/null +++ b/knowledge_base/vision/semantic_image_clustering.py @@ -0,0 +1,600 @@ +""" +Title: Semantic Image Clustering +Author: [Khalid Salama](https://www.linkedin.com/in/khalid-salama-24403144/) +Date created: 2021/02/28 +Last modified: 2021/02/28 +Description: Semantic Clustering by Adopting Nearest neighbors (SCAN) algorithm. +Accelerator: GPU +""" + +""" +## Introduction + +This example demonstrates how to apply the [Semantic Clustering by Adopting Nearest neighbors +(SCAN)](https://arxiv.org/abs/2005.12320) algorithm (Van Gansbeke et al., 2020) on the +[CIFAR-10](https://www.cs.toronto.edu/~kriz/cifar.html) dataset. The algorithm consists of +two phases: + +1. Self-supervised visual representation learning of images, in which we use the +[simCLR](https://arxiv.org/abs/2002.05709) technique. +2. Clustering of the learned visual representation vectors to maximize the agreement +between the cluster assignments of neighboring vectors. +""" +""" +## Setup +""" + +import os + +os.environ["KERAS_BACKEND"] = "tensorflow" + +from collections import defaultdict +import numpy as np +import tensorflow as tf +import keras +from keras import layers +import matplotlib.pyplot as plt +from tqdm import tqdm + +""" +## Prepare the data +""" + +num_classes = 10 +input_shape = (32, 32, 3) + +(x_train, y_train), (x_test, y_test) = keras.datasets.cifar10.load_data() +x_data = np.concatenate([x_train, x_test]) +y_data = np.concatenate([y_train, y_test]) + +print("x_data shape:", x_data.shape, "- y_data shape:", y_data.shape) + +classes = [ + "airplane", + "automobile", + "bird", + "cat", + "deer", + "dog", + "frog", + "horse", + "ship", + "truck", +] + +""" +## Define hyperparameters +""" + +target_size = 32 # Resize the input images. +representation_dim = 512 # The dimensions of the features vector. +projection_units = 128 # The projection head of the representation learner. +num_clusters = 20 # Number of clusters. +k_neighbours = 5 # Number of neighbours to consider during cluster learning. +tune_encoder_during_clustering = False # Freeze the encoder in the cluster learning. + +""" +## Implement data preprocessing + +The data preprocessing step resizes the input images to the desired `target_size` and applies +feature-wise normalization. Note that, when using `keras.applications.ResNet50V2` as the +visual encoder, resizing the images into 255 x 255 inputs would lead to more accurate results +but require a longer time to train. +""" + +data_preprocessing = keras.Sequential( + [ + layers.Resizing(target_size, target_size), + layers.Normalization(), + ] +) +# Compute the mean and the variance from the data for normalization. 
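+# `adapt()` makes a single pass over `x_data` so the `Normalization` layer can
+# learn the feature-wise mean and variance it applies at call time.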
+data_preprocessing.layers[-1].adapt(x_data) + +""" +## Data augmentation + +Unlike simCLR, which randomly picks a single data augmentation function to apply to an input +image, we apply a set of data augmentation functions randomly to the input image. +(You can experiment with other image augmentation techniques by following +the [data augmentation tutorial](https://www.tensorflow.org/tutorials/images/data_augmentation).) +""" + +data_augmentation = keras.Sequential( + [ + layers.RandomTranslation( + height_factor=(-0.2, 0.2), width_factor=(-0.2, 0.2), fill_mode="nearest" + ), + layers.RandomFlip(mode="horizontal"), + layers.RandomRotation(factor=0.15, fill_mode="nearest"), + layers.RandomZoom( + height_factor=(-0.3, 0.1), width_factor=(-0.3, 0.1), fill_mode="nearest" + ), + ] +) + +""" +Display a random image +""" + +image_idx = np.random.choice(range(x_data.shape[0])) +image = x_data[image_idx] +image_class = classes[y_data[image_idx][0]] +plt.figure(figsize=(3, 3)) +plt.imshow(x_data[image_idx].astype("uint8")) +plt.title(image_class) +_ = plt.axis("off") + +""" +Display a sample of augmented versions of the image +""" + +plt.figure(figsize=(10, 10)) +for i in range(9): + augmented_images = data_augmentation(np.array([image])) + ax = plt.subplot(3, 3, i + 1) + plt.imshow(augmented_images[0].numpy().astype("uint8")) + plt.axis("off") + +""" +## Self-supervised representation learning +""" + +""" +### Implement the vision encoder +""" + + +def create_encoder(representation_dim): + encoder = keras.Sequential( + [ + keras.applications.ResNet50V2( + include_top=False, weights=None, pooling="avg" + ), + layers.Dense(representation_dim), + ] + ) + return encoder + + +""" +### Implement the unsupervised contrastive loss +""" + + +class RepresentationLearner(keras.Model): + def __init__( + self, + encoder, + projection_units, + num_augmentations, + temperature=1.0, + dropout_rate=0.1, + l2_normalize=False, + **kwargs + ): + super().__init__(**kwargs) + self.encoder = encoder + # Create projection head. + self.projector = keras.Sequential( + [ + layers.Dropout(dropout_rate), + layers.Dense(units=projection_units, use_bias=False), + layers.BatchNormalization(), + layers.ReLU(), + ] + ) + self.num_augmentations = num_augmentations + self.temperature = temperature + self.l2_normalize = l2_normalize + self.loss_tracker = keras.metrics.Mean(name="loss") + + @property + def metrics(self): + return [self.loss_tracker] + + def compute_contrastive_loss(self, feature_vectors, batch_size): + num_augmentations = keras.ops.shape(feature_vectors)[0] // batch_size + if self.l2_normalize: + feature_vectors = keras.utils.normalize(feature_vectors) + # The logits shape is [num_augmentations * batch_size, num_augmentations * batch_size]. + logits = ( + tf.linalg.matmul(feature_vectors, feature_vectors, transpose_b=True) + / self.temperature + ) + # Apply log-max trick for numerical stability. + logits_max = keras.ops.max(logits, axis=1) + logits = logits - logits_max + # The shape of targets is [num_augmentations * batch_size, num_augmentations * batch_size]. + # targets is a matrix consits of num_augmentations submatrices of shape [batch_size * batch_size]. + # Each [batch_size * batch_size] submatrix is an identity matrix (diagonal entries are ones). 
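+        # For example, with batch_size=2 and num_augmentations=2 the tiled target is
+        #   [[1, 0, 1, 0],
+        #    [0, 1, 0, 1],
+        #    [1, 0, 1, 0],
+        #    [0, 1, 0, 1]]
+        # i.e. each row marks every column that is a view of the same image.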
+ targets = keras.ops.tile( + tf.eye(batch_size), [num_augmentations, num_augmentations] + ) + # Compute cross entropy loss + return keras.losses.categorical_crossentropy( + y_true=targets, y_pred=logits, from_logits=True + ) + + def call(self, inputs): + # Preprocess the input images. + preprocessed = data_preprocessing(inputs) + # Create augmented versions of the images. + augmented = [] + for _ in range(self.num_augmentations): + augmented.append(data_augmentation(preprocessed)) + augmented = layers.Concatenate(axis=0)(augmented) + # Generate embedding representations of the images. + features = self.encoder(augmented) + # Apply projection head. + return self.projector(features) + + def train_step(self, inputs): + batch_size = keras.ops.shape(inputs)[0] + # Run the forward pass and compute the contrastive loss + with tf.GradientTape() as tape: + feature_vectors = self(inputs, training=True) + loss = self.compute_contrastive_loss(feature_vectors, batch_size) + # Compute gradients + trainable_vars = self.trainable_variables + gradients = tape.gradient(loss, trainable_vars) + # Update weights + self.optimizer.apply_gradients(zip(gradients, trainable_vars)) + # Update loss tracker metric + self.loss_tracker.update_state(loss) + # Return a dict mapping metric names to current value + return {m.name: m.result() for m in self.metrics} + + def test_step(self, inputs): + batch_size = keras.ops.shape(inputs)[0] + feature_vectors = self(inputs, training=False) + loss = self.compute_contrastive_loss(feature_vectors, batch_size) + self.loss_tracker.update_state(loss) + return {"loss": self.loss_tracker.result()} + + +""" +### Train the model +""" +# Create vision encoder. +encoder = create_encoder(representation_dim) +# Create representation learner. +representation_learner = RepresentationLearner( + encoder, projection_units, num_augmentations=2, temperature=0.1 +) +# Create a a Cosine decay learning rate scheduler. +lr_scheduler = keras.optimizers.schedules.CosineDecay( + initial_learning_rate=0.001, decay_steps=500, alpha=0.1 +) +# Compile the model. +representation_learner.compile( + optimizer=keras.optimizers.AdamW(learning_rate=lr_scheduler, weight_decay=0.0001), + jit_compile=False, +) +# Fit the model. +history = representation_learner.fit( + x=x_data, + batch_size=512, + epochs=50, # for better results, increase the number of epochs to 500. +) + + +""" +Plot training loss +""" + +plt.plot(history.history["loss"]) +plt.ylabel("loss") +plt.xlabel("epoch") +plt.show() + +""" +## Compute the nearest neighbors +""" + +""" +### Generate the embeddings for the images +""" + +batch_size = 500 +# Get the feature vector representations of the images. +feature_vectors = encoder.predict(x_data, batch_size=batch_size, verbose=1) +# Normalize the feature vectores. +feature_vectors = keras.utils.normalize(feature_vectors) + +""" +### Find the *k* nearest neighbours for each embedding +""" + +neighbours = [] +num_batches = feature_vectors.shape[0] // batch_size +for batch_idx in tqdm(range(num_batches)): + start_idx = batch_idx * batch_size + end_idx = start_idx + batch_size + current_batch = feature_vectors[start_idx:end_idx] + # Compute the dot similarity. + similarities = tf.linalg.matmul(current_batch, feature_vectors, transpose_b=True) + # Get the indices of most similar vectors. + _, indices = keras.ops.top_k(similarities, k=k_neighbours + 1, sorted=True) + # Add the indices to the neighbours. 
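+    # `top_k` was queried with `k_neighbours + 1` because the closest match is
+    # the vector itself; the `[..., 1:]` slice below drops that self-match.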
+ neighbours.append(indices[..., 1:]) + +neighbours = np.reshape(np.array(neighbours), (-1, k_neighbours)) + +""" +Let's display some neighbors on each row +""" + +nrows = 4 +ncols = k_neighbours + 1 + +plt.figure(figsize=(12, 12)) +position = 1 +for _ in range(nrows): + anchor_idx = np.random.choice(range(x_data.shape[0])) + neighbour_indicies = neighbours[anchor_idx] + indices = [anchor_idx] + neighbour_indicies.tolist() + for j in range(ncols): + plt.subplot(nrows, ncols, position) + plt.imshow(x_data[indices[j]].astype("uint8")) + plt.title(classes[y_data[indices[j]][0]]) + plt.axis("off") + position += 1 + +""" +You notice that images on each row are visually similar, and belong to similar classes. +""" + +""" +## Semantic clustering with nearest neighbours +""" + +""" +### Implement clustering consistency loss + +This loss tries to make sure that neighbours have the same clustering assignments. +""" + + +class ClustersConsistencyLoss(keras.losses.Loss): + def __init__(self): + super().__init__() + + def __call__(self, target, similarity, sample_weight=None): + # Set targets to be ones. + target = keras.ops.ones_like(similarity) + # Compute cross entropy loss. + loss = keras.losses.binary_crossentropy( + y_true=target, y_pred=similarity, from_logits=True + ) + return keras.ops.mean(loss) + + +""" +### Implement the clusters entropy loss + +This loss tries to make sure that cluster distribution is roughly uniformed, to avoid +assigning most of the instances to one cluster. +""" + + +class ClustersEntropyLoss(keras.losses.Loss): + def __init__(self, entropy_loss_weight=1.0): + super().__init__() + self.entropy_loss_weight = entropy_loss_weight + + def __call__(self, target, cluster_probabilities, sample_weight=None): + # Ideal entropy = log(num_clusters). + num_clusters = keras.ops.cast( + keras.ops.shape(cluster_probabilities)[-1], "float32" + ) + target = keras.ops.log(num_clusters) + # Compute the overall clusters distribution. + cluster_probabilities = keras.ops.mean(cluster_probabilities, axis=0) + # Replacing zero probabilities - if any - with a very small value. + cluster_probabilities = keras.ops.clip(cluster_probabilities, 1e-8, 1.0) + # Compute the entropy over the clusters. + entropy = -keras.ops.sum( + cluster_probabilities * keras.ops.log(cluster_probabilities) + ) + # Compute the difference between the target and the actual. + loss = target - entropy + return loss + + +""" +### Implement clustering model + +This model takes a raw image as an input, generated its feature vector using the trained +encoder, and produces a probability distribution of the clusters given the feature vector +as the cluster assignments. +""" + + +def create_clustering_model(encoder, num_clusters, name=None): + inputs = keras.Input(shape=input_shape) + # Preprocess the input images. + preprocessed = data_preprocessing(inputs) + # Apply data augmentation to the images. + augmented = data_augmentation(preprocessed) + # Generate embedding representations of the images. + features = encoder(augmented) + # Assign the images to clusters. + outputs = layers.Dense(units=num_clusters, activation="softmax")(features) + # Create the model. + model = keras.Model(inputs=inputs, outputs=outputs, name=name) + return model + + +""" +### Implement clustering learner + +This model receives the input `anchor` image and its `neighbours`, produces the clusters +assignments for them using the `clustering_model`, and produces two outputs: +1. 
`similarity`: the similarity between the cluster assignments of the `anchor` image and +its `neighbours`. This output is fed to the `ClustersConsistencyLoss`. +2. `anchor_clustering`: cluster assignments of the `anchor` images. This is fed to the `ClustersEntropyLoss`. +""" + + +def create_clustering_learner(clustering_model): + anchor = keras.Input(shape=input_shape, name="anchors") + neighbours = keras.Input( + shape=tuple([k_neighbours]) + input_shape, name="neighbours" + ) + # Changes neighbours shape to [batch_size * k_neighbours, width, height, channels] + neighbours_reshaped = keras.ops.reshape(neighbours, tuple([-1]) + input_shape) + # anchor_clustering shape: [batch_size, num_clusters] + anchor_clustering = clustering_model(anchor) + # neighbours_clustering shape: [batch_size * k_neighbours, num_clusters] + neighbours_clustering = clustering_model(neighbours_reshaped) + # Convert neighbours_clustering shape to [batch_size, k_neighbours, num_clusters] + neighbours_clustering = keras.ops.reshape( + neighbours_clustering, + (-1, k_neighbours, keras.ops.shape(neighbours_clustering)[-1]), + ) + # similarity shape: [batch_size, 1, k_neighbours] + similarity = keras.ops.einsum( + "bij,bkj->bik", + keras.ops.expand_dims(anchor_clustering, axis=1), + neighbours_clustering, + ) + # similarity shape: [batch_size, k_neighbours] + similarity = layers.Lambda( + lambda x: keras.ops.squeeze(x, axis=1), name="similarity" + )(similarity) + # Create the model. + model = keras.Model( + inputs=[anchor, neighbours], + outputs=[similarity, anchor_clustering], + name="clustering_learner", + ) + return model + + +""" +### Train model +""" + +# If tune_encoder_during_clustering is set to False, +# then freeze the encoder weights. +for layer in encoder.layers: + layer.trainable = tune_encoder_during_clustering +# Create the clustering model and learner. +clustering_model = create_clustering_model(encoder, num_clusters, name="clustering") +clustering_learner = create_clustering_learner(clustering_model) +# Instantiate the model losses. +losses = [ClustersConsistencyLoss(), ClustersEntropyLoss(entropy_loss_weight=5)] +# Create the model inputs and labels. +inputs = {"anchors": x_data, "neighbours": tf.gather(x_data, neighbours)} +labels = np.ones(shape=(x_data.shape[0])) +# Compile the model. +clustering_learner.compile( + optimizer=keras.optimizers.AdamW(learning_rate=0.0005, weight_decay=0.0001), + loss=losses, + jit_compile=False, +) + +# Begin training the model. +clustering_learner.fit(x=inputs, y=labels, batch_size=512, epochs=50) + +""" +Plot training loss +""" + +plt.plot(history.history["loss"]) +plt.ylabel("loss") +plt.xlabel("epoch") +plt.show() + +""" +## Cluster analysis +""" + +""" +### Assign images to clusters +""" + +# Get the cluster probability distribution of the input images. +clustering_probs = clustering_model.predict(x_data, batch_size=batch_size, verbose=1) +# Get the cluster of the highest probability. +cluster_assignments = keras.ops.argmax(clustering_probs, axis=-1).numpy() +# Store the clustering confidence. +# Images with the highest clustering confidence are considered the 'prototypes' +# of the clusters. 
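+# The confidence is the probability assigned to the winning (argmax) cluster.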
+cluster_confidence = keras.ops.max(clustering_probs, axis=-1).numpy() + +""" +Let's compute the cluster sizes +""" + +clusters = defaultdict(list) +for idx, c in enumerate(cluster_assignments): + clusters[c].append((idx, cluster_confidence[idx])) + +non_empty_clusters = defaultdict(list) +for c in clusters.keys(): + if clusters[c]: + non_empty_clusters[c] = clusters[c] + +for c in range(num_clusters): + print("cluster", c, ":", len(clusters[c])) + +""" +### Visualize cluster images + +Display the *prototypes*โ€”instances with the highest clustering confidenceโ€”of each cluster: +""" + +num_images = 8 +plt.figure(figsize=(15, 15)) +position = 1 +for c in non_empty_clusters.keys(): + cluster_instances = sorted( + non_empty_clusters[c], key=lambda kv: kv[1], reverse=True + ) + + for j in range(num_images): + image_idx = cluster_instances[j][0] + plt.subplot(len(non_empty_clusters), num_images, position) + plt.imshow(x_data[image_idx].astype("uint8")) + plt.title(classes[y_data[image_idx][0]]) + plt.axis("off") + position += 1 + +""" +### Compute clustering accuracy + +First, we assign a label for each cluster based on the majority label of its images. +Then, we compute the accuracy of each cluster by dividing the number of image with the +majority label by the size of the cluster. +""" + +cluster_label_counts = dict() + +for c in range(num_clusters): + cluster_label_counts[c] = [0] * num_classes + instances = clusters[c] + for i, _ in instances: + cluster_label_counts[c][y_data[i][0]] += 1 + + cluster_label_idx = np.argmax(cluster_label_counts[c]) + correct_count = np.max(cluster_label_counts[c]) + cluster_size = len(clusters[c]) + accuracy = ( + np.round((correct_count / cluster_size) * 100, 2) if cluster_size > 0 else 0 + ) + cluster_label = classes[cluster_label_idx] + print("cluster", c, "label is:", cluster_label, " - accuracy:", accuracy, "%") + +""" +## Conclusion + +To improve the accuracy results, you can: 1) increase the number +of epochs in the representation learning and the clustering phases; 2) +allow the encoder weights to be tuned during the clustering phase; and 3) perform a final +fine-tuning step through self-labeling, as described in the [original SCAN paper](https://arxiv.org/abs/2005.12320). +Note that unsupervised image clustering techniques are not expected to outperform the accuracy +of supervised image classification techniques, rather showing that they can learn the semantics +of the images and group them into clusters that are similar to their original classes. +""" diff --git a/knowledge_base/vision/semisupervised_simclr.py b/knowledge_base/vision/semisupervised_simclr.py new file mode 100644 index 0000000000000000000000000000000000000000..c398210962cfac708e763c6d88e2a0670a63fbbc --- /dev/null +++ b/knowledge_base/vision/semisupervised_simclr.py @@ -0,0 +1,691 @@ +""" +Title: Semi-supervised image classification using contrastive pretraining with SimCLR +Author: [Andrรกs Bรฉres](https://www.linkedin.com/in/andras-beres-789190210) +Date created: 2021/04/24 +Last modified: 2024/03/04 +Description: Contrastive pretraining with SimCLR for semi-supervised image classification on the STL-10 dataset. +Accelerator: GPU +Converted to Keras 3 by: [Sitam Meur](https://github.com/sitamgithub-MSIT) +""" + +""" +## Introduction + +### Semi-supervised learning + +Semi-supervised learning is a machine learning paradigm that deals with +**partially labeled datasets**. When applying deep learning in the real world, +one usually has to gather a large dataset to make it work well. 
However, while +the cost of labeling scales linearly with the dataset size (labeling each +example takes a constant time), model performance only scales +[sublinearly](https://arxiv.org/abs/2001.08361) with it. This means that +labeling more and more samples becomes less and less cost-efficient, while +gathering unlabeled data is generally cheap, as it is usually readily available +in large quantities. + +Semi-supervised learning offers to solve this problem by only requiring a +partially labeled dataset, and by being label-efficient by utilizing the +unlabeled examples for learning as well. + +In this example, we will pretrain an encoder with contrastive learning on the +[STL-10](https://ai.stanford.edu/~acoates/stl10/) semi-supervised dataset using +no labels at all, and then fine-tune it using only its labeled subset. + +### Contrastive learning + +On the highest level, the main idea behind contrastive learning is to **learn +representations that are invariant to image augmentations** in a self-supervised +manner. One problem with this objective is that it has a trivial degenerate +solution: the case where the representations are constant, and do not depend at all on the +input images. + +Contrastive learning avoids this trap by modifying the objective in the +following way: it pulls representations of augmented versions/views of the same +image closer to each other (contracting positives), while simultaneously pushing +different images away from each other (contrasting negatives) in representation +space. + +One such contrastive approach is [SimCLR](https://arxiv.org/abs/2002.05709), +which essentially identifies the core components needed to optimize this +objective, and can achieve high performance by scaling this simple approach. + +Another approach is [SimSiam](https://arxiv.org/abs/2011.10566) +([Keras example](https://keras.io/examples/vision/simsiam/)), +whose main difference from +SimCLR is that the former does not use any negatives in its loss. Therefore, it does not +explicitly prevent the trivial solution, and, instead, avoids it implicitly by +architecture design (asymmetric encoding paths using a predictor network and +batch normalization (BatchNorm) are applied in the final layers). + +For further reading about SimCLR, check out +[the official Google AI blog post](https://ai.googleblog.com/2020/04/advancing-self-supervised-and-semi.html), +and for an overview of self-supervised learning across both vision and language +check out +[this blog post](https://ai.facebook.com/blog/self-supervised-learning-the-dark-matter-of-intelligence/). 
+""" + +""" +## Setup +""" + +import os + +os.environ["KERAS_BACKEND"] = "tensorflow" + + +# Make sure we are able to handle large datasets +import resource + +low, high = resource.getrlimit(resource.RLIMIT_NOFILE) +resource.setrlimit(resource.RLIMIT_NOFILE, (high, high)) + +import math +import matplotlib.pyplot as plt +import tensorflow as tf +import tensorflow_datasets as tfds + +import keras +from keras import ops +from keras import layers + +""" +## Hyperparameter setup +""" +# Dataset hyperparameters +unlabeled_dataset_size = 100000 +labeled_dataset_size = 5000 +image_channels = 3 + +# Algorithm hyperparameters +num_epochs = 20 +batch_size = 525 # Corresponds to 200 steps per epoch +width = 128 +temperature = 0.1 +# Stronger augmentations for contrastive, weaker ones for supervised training +contrastive_augmentation = {"min_area": 0.25, "brightness": 0.6, "jitter": 0.2} +classification_augmentation = { + "min_area": 0.75, + "brightness": 0.3, + "jitter": 0.1, +} + +""" +## Dataset + +During training we will simultaneously load a large batch of unlabeled images along with a +smaller batch of labeled images. +""" + + +def prepare_dataset(): + # Labeled and unlabeled samples are loaded synchronously + # with batch sizes selected accordingly + steps_per_epoch = (unlabeled_dataset_size + labeled_dataset_size) // batch_size + unlabeled_batch_size = unlabeled_dataset_size // steps_per_epoch + labeled_batch_size = labeled_dataset_size // steps_per_epoch + print( + f"batch size is {unlabeled_batch_size} (unlabeled) + {labeled_batch_size} (labeled)" + ) + + # Turning off shuffle to lower resource usage + unlabeled_train_dataset = ( + tfds.load("stl10", split="unlabelled", as_supervised=True, shuffle_files=False) + .shuffle(buffer_size=10 * unlabeled_batch_size) + .batch(unlabeled_batch_size) + ) + labeled_train_dataset = ( + tfds.load("stl10", split="train", as_supervised=True, shuffle_files=False) + .shuffle(buffer_size=10 * labeled_batch_size) + .batch(labeled_batch_size) + ) + test_dataset = ( + tfds.load("stl10", split="test", as_supervised=True) + .batch(batch_size) + .prefetch(buffer_size=tf.data.AUTOTUNE) + ) + + # Labeled and unlabeled datasets are zipped together + train_dataset = tf.data.Dataset.zip( + (unlabeled_train_dataset, labeled_train_dataset) + ).prefetch(buffer_size=tf.data.AUTOTUNE) + + return train_dataset, labeled_train_dataset, test_dataset + + +# Load STL10 dataset +train_dataset, labeled_train_dataset, test_dataset = prepare_dataset() + +""" +## Image augmentations + +The two most important image augmentations for contrastive learning are the +following: + +- Cropping: forces the model to encode different parts of the same image +similarly, we implement it with the +[RandomTranslation](https://keras.io/api/layers/preprocessing_layers/image_augmentation/random_translation/) +and +[RandomZoom](https://keras.io/api/layers/preprocessing_layers/image_augmentation/random_zoom/) +layers +- Color jitter: prevents a trivial color histogram-based solution to the task by +distorting color histograms. A principled way to implement that is by affine +transformations in color space. + +In this example we use random horizontal flips as well. Stronger augmentations +are applied for contrastive learning, along with weaker ones for supervised +classification to avoid overfitting on the few labeled examples. + +We implement random color jitter as a custom preprocessing layer. 
Using +preprocessing layers for data augmentation has the following two advantages: + +- The data augmentation will run on GPU in batches, so the training will not be +bottlenecked by the data pipeline in environments with constrained CPU +resources (such as a Colab Notebook, or a personal machine) +- Deployment is easier as the data preprocessing pipeline is encapsulated in the +model, and does not have to be reimplemented when deploying it +""" + + +# Distorts the color distibutions of images +class RandomColorAffine(layers.Layer): + def __init__(self, brightness=0, jitter=0, **kwargs): + super().__init__(**kwargs) + + self.seed_generator = keras.random.SeedGenerator(1337) + self.brightness = brightness + self.jitter = jitter + + def get_config(self): + config = super().get_config() + config.update({"brightness": self.brightness, "jitter": self.jitter}) + return config + + def call(self, images, training=True): + if training: + batch_size = ops.shape(images)[0] + + # Same for all colors + brightness_scales = 1 + keras.random.uniform( + (batch_size, 1, 1, 1), + minval=-self.brightness, + maxval=self.brightness, + seed=self.seed_generator, + ) + # Different for all colors + jitter_matrices = keras.random.uniform( + (batch_size, 1, 3, 3), + minval=-self.jitter, + maxval=self.jitter, + seed=self.seed_generator, + ) + + color_transforms = ( + ops.tile(ops.expand_dims(ops.eye(3), axis=0), (batch_size, 1, 1, 1)) + * brightness_scales + + jitter_matrices + ) + images = ops.clip(ops.matmul(images, color_transforms), 0, 1) + return images + + +# Image augmentation module +def get_augmenter(min_area, brightness, jitter): + zoom_factor = 1.0 - math.sqrt(min_area) + return keras.Sequential( + [ + layers.Rescaling(1 / 255), + layers.RandomFlip("horizontal"), + layers.RandomTranslation(zoom_factor / 2, zoom_factor / 2), + layers.RandomZoom((-zoom_factor, 0.0), (-zoom_factor, 0.0)), + RandomColorAffine(brightness, jitter), + ] + ) + + +def visualize_augmentations(num_images): + # Sample a batch from a dataset + images = next(iter(train_dataset))[0][0][:num_images] + + # Apply augmentations + augmented_images = zip( + images, + get_augmenter(**classification_augmentation)(images), + get_augmenter(**contrastive_augmentation)(images), + get_augmenter(**contrastive_augmentation)(images), + ) + row_titles = [ + "Original:", + "Weakly augmented:", + "Strongly augmented:", + "Strongly augmented:", + ] + plt.figure(figsize=(num_images * 2.2, 4 * 2.2), dpi=100) + for column, image_row in enumerate(augmented_images): + for row, image in enumerate(image_row): + plt.subplot(4, num_images, row * num_images + column + 1) + plt.imshow(image) + if column == 0: + plt.title(row_titles[row], loc="left") + plt.axis("off") + plt.tight_layout() + + +visualize_augmentations(num_images=8) + +""" +## Encoder architecture +""" + + +# Define the encoder architecture +def get_encoder(): + return keras.Sequential( + [ + layers.Conv2D(width, kernel_size=3, strides=2, activation="relu"), + layers.Conv2D(width, kernel_size=3, strides=2, activation="relu"), + layers.Conv2D(width, kernel_size=3, strides=2, activation="relu"), + layers.Conv2D(width, kernel_size=3, strides=2, activation="relu"), + layers.Flatten(), + layers.Dense(width, activation="relu"), + ], + name="encoder", + ) + + +""" +## Supervised baseline model + +A baseline supervised model is trained using random initialization. 
+""" + +# Baseline supervised training with random initialization +baseline_model = keras.Sequential( + [ + get_augmenter(**classification_augmentation), + get_encoder(), + layers.Dense(10), + ], + name="baseline_model", +) +baseline_model.compile( + optimizer=keras.optimizers.Adam(), + loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), + metrics=[keras.metrics.SparseCategoricalAccuracy(name="acc")], +) + +baseline_history = baseline_model.fit( + labeled_train_dataset, epochs=num_epochs, validation_data=test_dataset +) + +print( + "Maximal validation accuracy: {:.2f}%".format( + max(baseline_history.history["val_acc"]) * 100 + ) +) + +""" +## Self-supervised model for contrastive pretraining + +We pretrain an encoder on unlabeled images with a contrastive loss. +A nonlinear projection head is attached to the top of the encoder, as it +improves the quality of representations of the encoder. + +We use the InfoNCE/NT-Xent/N-pairs loss, which can be interpreted in the +following way: + +1. We treat each image in the batch as if it had its own class. +2. Then, we have two examples (a pair of augmented views) for each "class". +3. Each view's representation is compared to every possible pair's one (for both + augmented versions). +4. We use the temperature-scaled cosine similarity of compared representations as + logits. +5. Finally, we use categorical cross-entropy as the "classification" loss + +The following two metrics are used for monitoring the pretraining performance: + +- [Contrastive accuracy (SimCLR Table 5)](https://arxiv.org/abs/2002.05709): +Self-supervised metric, the ratio of cases in which the representation of an +image is more similar to its differently augmented version's one, than to the +representation of any other image in the current batch. Self-supervised +metrics can be used for hyperparameter tuning even in the case when there are +no labeled examples. +- [Linear probing accuracy](https://arxiv.org/abs/1603.08511): Linear probing is +a popular metric to evaluate self-supervised classifiers. It is computed as +the accuracy of a logistic regression classifier trained on top of the +encoder's features. In our case, this is done by training a single dense layer +on top of the frozen encoder. Note that contrary to traditional approach where +the classifier is trained after the pretraining phase, in this example we +train it during pretraining. This might slightly decrease its accuracy, but +that way we can monitor its value during training, which helps with +experimentation and debugging. + +Another widely used supervised metric is the +[KNN accuracy](https://arxiv.org/abs/1805.01978), which is the accuracy of a KNN +classifier trained on top of the encoder's features, which is not implemented in +this example. 
+""" + + +# Define the contrastive model with model-subclassing +class ContrastiveModel(keras.Model): + def __init__(self): + super().__init__() + + self.temperature = temperature + self.contrastive_augmenter = get_augmenter(**contrastive_augmentation) + self.classification_augmenter = get_augmenter(**classification_augmentation) + self.encoder = get_encoder() + # Non-linear MLP as projection head + self.projection_head = keras.Sequential( + [ + keras.Input(shape=(width,)), + layers.Dense(width, activation="relu"), + layers.Dense(width), + ], + name="projection_head", + ) + # Single dense layer for linear probing + self.linear_probe = keras.Sequential( + [layers.Input(shape=(width,)), layers.Dense(10)], + name="linear_probe", + ) + + self.encoder.summary() + self.projection_head.summary() + self.linear_probe.summary() + + def compile(self, contrastive_optimizer, probe_optimizer, **kwargs): + super().compile(**kwargs) + + self.contrastive_optimizer = contrastive_optimizer + self.probe_optimizer = probe_optimizer + + # self.contrastive_loss will be defined as a method + self.probe_loss = keras.losses.SparseCategoricalCrossentropy(from_logits=True) + + self.contrastive_loss_tracker = keras.metrics.Mean(name="c_loss") + self.contrastive_accuracy = keras.metrics.SparseCategoricalAccuracy( + name="c_acc" + ) + self.probe_loss_tracker = keras.metrics.Mean(name="p_loss") + self.probe_accuracy = keras.metrics.SparseCategoricalAccuracy(name="p_acc") + + @property + def metrics(self): + return [ + self.contrastive_loss_tracker, + self.contrastive_accuracy, + self.probe_loss_tracker, + self.probe_accuracy, + ] + + def contrastive_loss(self, projections_1, projections_2): + # InfoNCE loss (information noise-contrastive estimation) + # NT-Xent loss (normalized temperature-scaled cross entropy) + + # Cosine similarity: the dot product of the l2-normalized feature vectors + projections_1 = ops.normalize(projections_1, axis=1) + projections_2 = ops.normalize(projections_2, axis=1) + similarities = ( + ops.matmul(projections_1, ops.transpose(projections_2)) / self.temperature + ) + + # The similarity between the representations of two augmented views of the + # same image should be higher than their similarity with other views + batch_size = ops.shape(projections_1)[0] + contrastive_labels = ops.arange(batch_size) + self.contrastive_accuracy.update_state(contrastive_labels, similarities) + self.contrastive_accuracy.update_state( + contrastive_labels, ops.transpose(similarities) + ) + + # The temperature-scaled similarities are used as logits for cross-entropy + # a symmetrized version of the loss is used here + loss_1_2 = keras.losses.sparse_categorical_crossentropy( + contrastive_labels, similarities, from_logits=True + ) + loss_2_1 = keras.losses.sparse_categorical_crossentropy( + contrastive_labels, ops.transpose(similarities), from_logits=True + ) + return (loss_1_2 + loss_2_1) / 2 + + def train_step(self, data): + (unlabeled_images, _), (labeled_images, labels) = data + + # Both labeled and unlabeled images are used, without labels + images = ops.concatenate((unlabeled_images, labeled_images), axis=0) + # Each image is augmented twice, differently + augmented_images_1 = self.contrastive_augmenter(images, training=True) + augmented_images_2 = self.contrastive_augmenter(images, training=True) + with tf.GradientTape() as tape: + features_1 = self.encoder(augmented_images_1, training=True) + features_2 = self.encoder(augmented_images_2, training=True) + # The representations are passed through a projection 
MLP
+            projections_1 = self.projection_head(features_1, training=True)
+            projections_2 = self.projection_head(features_2, training=True)
+            contrastive_loss = self.contrastive_loss(projections_1, projections_2)
+        gradients = tape.gradient(
+            contrastive_loss,
+            self.encoder.trainable_weights + self.projection_head.trainable_weights,
+        )
+        self.contrastive_optimizer.apply_gradients(
+            zip(
+                gradients,
+                self.encoder.trainable_weights + self.projection_head.trainable_weights,
+            )
+        )
+        self.contrastive_loss_tracker.update_state(contrastive_loss)
+
+        # Labels are only used in evaluation for an on-the-fly logistic regression
+        preprocessed_images = self.classification_augmenter(
+            labeled_images, training=True
+        )
+        with tf.GradientTape() as tape:
+            # The encoder is used in inference mode here to avoid regularization
+            # and to avoid updating the batch normalization parameters if they are used
+            features = self.encoder(preprocessed_images, training=False)
+            class_logits = self.linear_probe(features, training=True)
+            probe_loss = self.probe_loss(labels, class_logits)
+        gradients = tape.gradient(probe_loss, self.linear_probe.trainable_weights)
+        self.probe_optimizer.apply_gradients(
+            zip(gradients, self.linear_probe.trainable_weights)
+        )
+        self.probe_loss_tracker.update_state(probe_loss)
+        self.probe_accuracy.update_state(labels, class_logits)
+
+        return {m.name: m.result() for m in self.metrics}
+
+    def test_step(self, data):
+        labeled_images, labels = data
+
+        # For testing, the components are used with a training=False flag
+        preprocessed_images = self.classification_augmenter(
+            labeled_images, training=False
+        )
+        features = self.encoder(preprocessed_images, training=False)
+        class_logits = self.linear_probe(features, training=False)
+        probe_loss = self.probe_loss(labels, class_logits)
+        self.probe_loss_tracker.update_state(probe_loss)
+        self.probe_accuracy.update_state(labels, class_logits)
+
+        # Only the probe metrics are logged at test time
+        return {m.name: m.result() for m in self.metrics[2:]}
+
+
+# Contrastive pretraining
+pretraining_model = ContrastiveModel()
+pretraining_model.compile(
+    contrastive_optimizer=keras.optimizers.Adam(),
+    probe_optimizer=keras.optimizers.Adam(),
+)
+
+pretraining_history = pretraining_model.fit(
+    train_dataset, epochs=num_epochs, validation_data=test_dataset
+)
+print(
+    "Maximal validation accuracy: {:.2f}%".format(
+        max(pretraining_history.history["val_p_acc"]) * 100
+    )
+)
+
+"""
+## Supervised finetuning of the pretrained encoder
+
+We then finetune the encoder on the labeled examples by attaching
+a single randomly initialized fully-connected classification layer on top of it.
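+
+As a side note, if you want to measure representation quality without updating the
+encoder at all, you can instead run a pure "linear evaluation": freeze the pretrained
+encoder and train only a classification head on top. This is not part of the example,
+but a minimal sketch of that variant (reusing `get_augmenter`, `classification_augmentation`
+and `labeled_train_dataset` from above; the `linear_eval_model` name is just for
+illustration) could look like this:
+
+```python
+# Minimal sketch (illustration only, not run in this example): linear evaluation
+# with a frozen encoder.
+frozen_encoder = pretraining_model.encoder
+frozen_encoder.trainable = False  # keep the pretrained weights fixed
+
+linear_eval_model = keras.Sequential(
+    [
+        get_augmenter(**classification_augmentation),
+        frozen_encoder,
+        layers.Dense(10),
+    ],
+    name="linear_eval_model",
+)
+linear_eval_model.compile(
+    optimizer=keras.optimizers.Adam(),
+    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
+    metrics=[keras.metrics.SparseCategoricalAccuracy(name="acc")],
+)
+linear_eval_model.fit(
+    labeled_train_dataset, epochs=num_epochs, validation_data=test_dataset
+)
+# Remember to set `frozen_encoder.trainable = True` again before running the
+# finetuning step below.
+```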
+"""
+
+# Supervised finetuning of the pretrained encoder
+finetuning_model = keras.Sequential(
+    [
+        get_augmenter(**classification_augmentation),
+        pretraining_model.encoder,
+        layers.Dense(10),
+    ],
+    name="finetuning_model",
+)
+finetuning_model.compile(
+    optimizer=keras.optimizers.Adam(),
+    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
+    metrics=[keras.metrics.SparseCategoricalAccuracy(name="acc")],
+)
+
+finetuning_history = finetuning_model.fit(
+    labeled_train_dataset, epochs=num_epochs, validation_data=test_dataset
+)
+print(
+    "Maximal validation accuracy: {:.2f}%".format(
+        max(finetuning_history.history["val_acc"]) * 100
+    )
+)
+
+"""
+## Comparison against the baseline
+"""
+
+
+# The classification accuracies of the baseline and the pretraining + finetuning process:
+def plot_training_curves(pretraining_history, finetuning_history, baseline_history):
+    for metric_key, metric_name in zip(["acc", "loss"], ["accuracy", "loss"]):
+        plt.figure(figsize=(8, 5), dpi=100)
+        plt.plot(
+            baseline_history.history[f"val_{metric_key}"],
+            label="supervised baseline",
+        )
+        plt.plot(
+            pretraining_history.history[f"val_p_{metric_key}"],
+            label="self-supervised pretraining",
+        )
+        plt.plot(
+            finetuning_history.history[f"val_{metric_key}"],
+            label="supervised finetuning",
+        )
+        plt.legend()
+        plt.title(f"Classification {metric_name} during training")
+        plt.xlabel("epochs")
+        plt.ylabel(f"validation {metric_name}")
+
+
+plot_training_curves(pretraining_history, finetuning_history, baseline_history)
+
+"""
+By comparing the training curves, we can see that when using contrastive
+pretraining, a higher validation accuracy can be reached, paired with a lower
+validation loss, which means that the pretrained network was able to generalize
+better when seeing only a small number of labeled examples.
+"""
+
+"""
+## Improving further
+
+### Architecture
+
+The experiment in the original paper demonstrated that increasing the width and depth of the
+models improves performance at a higher rate than for supervised learning. Also,
+using a [ResNet-50](https://keras.io/api/applications/resnet/#resnet50-function)
+encoder is quite standard in the literature. However, keep in mind that more
+powerful models will not only increase training time but will also require more
+memory and limit the maximal batch size you can use.
+
+It has [been](https://arxiv.org/abs/1905.09272)
+[reported](https://arxiv.org/abs/1911.05722) that the usage of BatchNorm layers
+could sometimes degrade performance, as it introduces an intra-batch dependency
+between samples, which is why I did not use them in this example. In my
+experiments, however, using BatchNorm, especially in the projection head,
+improves performance.
+
+### Hyperparameters
+
+The hyperparameters used in this example have been tuned manually for this task and
+architecture. Therefore, without changing them, only marginal gains can be expected
+from further hyperparameter tuning.
+
+However, for a different task or model architecture, these would need tuning, so
+here are my notes on the most important ones:
+
+- **Batch size**: since the objective can be interpreted as a classification
+over a batch of images (loosely speaking), the batch size is actually a more
+important hyperparameter than usual. The higher, the better.
+- **Temperature**: the temperature defines the "softness" of the softmax
+distribution that is used in the cross-entropy loss, and is an important
+hyperparameter.
Lower values generally lead to a higher contrastive accuracy. +A recent trick (in [ALIGN](https://arxiv.org/abs/2102.05918)) is to learn +the temperature's value as well (which can be done by defining it as a +tf.Variable, and applying gradients on it). Even though this provides a good baseline +value, in my experiments the learned temperature was somewhat lower +than optimal, as it is optimized with respect to the contrastive loss, which is not a +perfect proxy for representation quality. +- **Image augmentation strength**: during pretraining stronger augmentations +increase the difficulty of the task, however after a point too strong +augmentations will degrade performance. During finetuning stronger +augmentations reduce overfitting while in my experience too strong +augmentations decrease the performance gains from pretraining. The whole data +augmentation pipeline can be seen as an important hyperparameter of the +algorithm, implementations of other custom image augmentation layers in Keras +can be found in +[this repository](https://github.com/beresandras/image-augmentation-layers-keras). +- **Learning rate schedule**: a constant schedule is used here, but it is +quite common in the literature to use a +[cosine decay schedule](https://www.tensorflow.org/api_docs/python/tf/keras/experimental/CosineDecay), +which can further improve performance. +- **Optimizer**: Adam is used in this example, as it provides good performance +with default parameters. SGD with momentum requires more tuning, however it +could slightly increase performance. +""" + +""" +## Related works + +Other instance-level (image-level) contrastive learning methods: + +- [MoCo](https://arxiv.org/abs/1911.05722) +([v2](https://arxiv.org/abs/2003.04297), +[v3](https://arxiv.org/abs/2104.02057)): uses a momentum-encoder as well, +whose weights are an exponential moving average of the target encoder +- [SwAV](https://arxiv.org/abs/2006.09882): uses clustering instead of pairwise +comparison +- [BarlowTwins](https://arxiv.org/abs/2103.03230): uses a cross +correlation-based objective instead of pairwise comparison + +Keras implementations of **MoCo** and **BarlowTwins** can be found in +[this repository](https://github.com/beresandras/contrastive-classification-keras), +which includes a Colab notebook. + +There is also a new line of works, which optimize a similar objective, but +without the use of any negatives: + +- [BYOL](https://arxiv.org/abs/2006.07733): momentum-encoder + no negatives +- [SimSiam](https://arxiv.org/abs/2011.10566) +([Keras example](https://keras.io/examples/vision/simsiam/)): +no momentum-encoder + no negatives + +In my experience, these methods are more brittle (they can collapse to a constant +representation, I could not get them to work using this encoder architecture). +Even though they are generally more dependent on the +[model](https://generallyintelligent.ai/understanding-self-supervised-contrastive-learning.html) +[architecture](https://arxiv.org/abs/2010.10241), they can improve +performance at smaller batch sizes. + +You can use the trained model hosted on [Hugging Face Hub](https://huggingface.co/keras-io/semi-supervised-classification-simclr) +and try the demo on [Hugging Face Spaces](https://huggingface.co/spaces/keras-io/semi-supervised-classification). 
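+
+As a final note on the temperature hyperparameter discussed above: the snippet below is
+a minimal, self-contained sketch (not used in this example) of how the temperature could
+be made learnable, parameterized on the log scale so that it always stays positive. The
+variable would also have to be appended to the list of weights that the contrastive
+optimizer updates in `train_step`.
+
+```python
+import numpy as np
+import tensorflow as tf
+
+# Learnable temperature, stored as a log value so exp() keeps it positive.
+initial_temperature = 0.1  # illustrative starting value
+log_temperature = tf.Variable(np.log(initial_temperature), dtype="float32")
+
+
+def scaled_similarities(projections_1, projections_2):
+    # Same cosine-similarity logits as in `contrastive_loss`, but divided by
+    # the learnable temperature instead of a fixed constant.
+    projections_1 = tf.math.l2_normalize(projections_1, axis=1)
+    projections_2 = tf.math.l2_normalize(projections_2, axis=1)
+    return tf.matmul(projections_1, projections_2, transpose_b=True) / tf.exp(
+        log_temperature
+    )
+```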
+""" diff --git a/knowledge_base/vision/shiftvit.py b/knowledge_base/vision/shiftvit.py new file mode 100644 index 0000000000000000000000000000000000000000..c47ddc0785b47d085e3d7252ae089822abf0cf2d --- /dev/null +++ b/knowledge_base/vision/shiftvit.py @@ -0,0 +1,1053 @@ +""" +Title: A Vision Transformer without Attention +Author: [Aritra Roy Gosthipaty](https://twitter.com/ariG23498), [Ritwik Raha](https://twitter.com/ritwik_raha), [Shivalika Singh](https://www.linkedin.com/in/shivalika-singh/) +Date created: 2022/02/24 +Last modified: 2024/12/06 +Description: A minimal implementation of ShiftViT. +Accelerator: GPU +Converted to Keras 3 by: [Sitam Meur](https://github.com/sitamgithub-MSIT) +""" + +""" +## Introduction + +[Vision Transformers](https://arxiv.org/abs/2010.11929) (ViTs) have sparked a wave of +research at the intersection of Transformers and Computer Vision (CV). + +ViTs can simultaneously model long- and short-range dependencies, thanks to +the Multi-Head Self-Attention mechanism in the Transformer block. Many researchers believe +that the success of ViTs are purely due to the attention layer, and they seldom +think about other parts of the ViT model. + +In the academic paper +[When Shift Operation Meets Vision Transformer: An Extremely Simple Alternative to Attention Mechanism](https://arxiv.org/abs/2201.10801) +the authors propose to demystify the success of ViTs with the introduction of a **NO +PARAMETER** operation in place of the attention operation. They swap the attention +operation with a shifting operation. + +In this example, we minimally implement the paper with close alignement to the author's +[official implementation](https://github.com/microsoft/SPACH/blob/main/models/shiftvit.py). + +This example requires TensorFlow 2.9 or higher. +""" + +""" +## Setup and imports +""" + +import numpy as np +import matplotlib.pyplot as plt + +import keras +from keras import ops +from keras import layers +import tensorflow as tf + +import pathlib +import glob + +# Setting seed for reproducibiltiy +SEED = 42 +keras.utils.set_random_seed(SEED) + +""" +## Hyperparameters + +These are the hyperparameters that we have chosen for the experiment. +Please feel free to tune them. +""" + + +class Config(object): + # DATA + batch_size = 256 + buffer_size = batch_size * 2 + input_shape = (32, 32, 3) + num_classes = 10 + + # AUGMENTATION + image_size = 48 + + # ARCHITECTURE + patch_size = 4 + projected_dim = 96 + num_shift_blocks_per_stages = [2, 4, 8, 2] + epsilon = 1e-5 + stochastic_depth_rate = 0.2 + mlp_dropout_rate = 0.2 + num_div = 12 + shift_pixel = 1 + mlp_expand_ratio = 2 + + # OPTIMIZER + lr_start = 1e-5 + lr_max = 1e-3 + weight_decay = 1e-4 + + # TRAINING + epochs = 100 + + # INFERENCE + label_map = { + 0: "airplane", + 1: "automobile", + 2: "bird", + 3: "cat", + 4: "deer", + 5: "dog", + 6: "frog", + 7: "horse", + 8: "ship", + 9: "truck", + } + tf_ds_batch_size = 20 + + +config = Config() + +""" +## Load the CIFAR-10 dataset + +We use the CIFAR-10 dataset for our experiments. 
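+
+Before loading the data, it can help to sanity-check the token grid implied by the
+hyperparameters above (a small illustrative snippet, not required for training):
+
+```python
+patches_per_side = config.image_size // config.patch_size  # 48 // 4 = 12
+num_patches = patches_per_side**2  # 144 tokens per image
+print(
+    f"Each image yields a {patches_per_side}x{patches_per_side} grid of "
+    f"{num_patches} tokens, each projected to {config.projected_dim} dimensions."
+)
+```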
+""" + +(x_train, y_train), (x_test, y_test) = keras.datasets.cifar10.load_data() +(x_train, y_train), (x_val, y_val) = ( + (x_train[:40000], y_train[:40000]), + (x_train[40000:], y_train[40000:]), +) +print(f"Training samples: {len(x_train)}") +print(f"Validation samples: {len(x_val)}") +print(f"Testing samples: {len(x_test)}") + +AUTO = tf.data.AUTOTUNE +train_ds = tf.data.Dataset.from_tensor_slices((x_train, y_train)) +train_ds = train_ds.shuffle(config.buffer_size).batch(config.batch_size).prefetch(AUTO) + +val_ds = tf.data.Dataset.from_tensor_slices((x_val, y_val)) +val_ds = val_ds.batch(config.batch_size).prefetch(AUTO) + +test_ds = tf.data.Dataset.from_tensor_slices((x_test, y_test)) +test_ds = test_ds.batch(config.batch_size).prefetch(AUTO) + +""" +## Data Augmentation + +The augmentation pipeline consists of: + +- Rescaling +- Resizing +- Random cropping +- Random horizontal flipping + +_Note_: The image data augmentation layers do not apply +data transformations at inference time. This means that +when these layers are called with `training=False` they +behave differently. Refer to the +[documentation](https://keras.io/api/layers/preprocessing_layers/image_augmentation/) +for more details. +""" + + +def get_augmentation_model(): + """Build the data augmentation model.""" + data_augmentation = keras.Sequential( + [ + layers.Resizing(config.input_shape[0] + 20, config.input_shape[0] + 20), + layers.RandomCrop(config.image_size, config.image_size), + layers.RandomFlip("horizontal"), + layers.Rescaling(1 / 255.0), + ] + ) + return data_augmentation + + +""" +## The ShiftViT architecture + +In this section, we build the architecture proposed in +[the ShiftViT paper](https://arxiv.org/abs/2201.10801). + +| ![ShiftViT Architecture](https://i.imgur.com/CHU40HX.png) | +| :--: | +| Figure 1: The entire architecutre of ShiftViT. +[Source](https://arxiv.org/abs/2201.10801) | + +The architecture as shown in Fig. 1, is inspired by +[Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030). +Here the authors propose a modular architecture with 4 stages. Each stage works on its +own spatial size, creating a hierarchical architecture. + +An input image of size `HxWx3` is split into non-overlapping patches of size `4x4`. +This is done via the patchify layer which results in individual tokens of feature size `48` +(`4x4x3`). Each stage comprises two parts: + +1. Embedding Generation +2. Stacked Shift Blocks + +We discuss the stages and the modules in detail in what follows. + +_Note_: Compared to the [official implementation](https://github.com/microsoft/SPACH/blob/main/models/shiftvit.py) +we restructure some key components to better fit the Keras API. +""" + +""" +### The ShiftViT Block + +| ![ShiftViT block](https://i.imgur.com/IDe35vo.gif) | +| :--: | +| Figure 2: From the Model to a Shift Block. | + +Each stage in the ShiftViT architecture comprises of a Shift Block as shown in Fig 2. + +| ![Shift Vit Block](https://i.imgur.com/0q13pLu.png) | +| :--: | +| Figure 3: The Shift ViT Block. [Source](https://arxiv.org/abs/2201.10801) | + +The Shift Block as shown in Fig. 3, comprises of the following: + +1. Shift Operation +2. Linear Normalization +3. MLP Layer +""" + +""" +#### The MLP block + +The MLP block is intended to be a stack of densely-connected layers +""" + + +class MLP(layers.Layer): + """Get the MLP layer for each shift block. + + Args: + mlp_expand_ratio (int): The ratio with which the first feature map is expanded. 
+ mlp_dropout_rate (float): The rate for dropout. + """ + + def __init__(self, mlp_expand_ratio, mlp_dropout_rate, **kwargs): + super().__init__(**kwargs) + self.mlp_expand_ratio = mlp_expand_ratio + self.mlp_dropout_rate = mlp_dropout_rate + + def build(self, input_shape): + input_channels = input_shape[-1] + initial_filters = int(self.mlp_expand_ratio * input_channels) + + self.mlp = keras.Sequential( + [ + layers.Dense( + units=initial_filters, + activation="gelu", + ), + layers.Dropout(rate=self.mlp_dropout_rate), + layers.Dense(units=input_channels), + layers.Dropout(rate=self.mlp_dropout_rate), + ] + ) + + def call(self, x): + x = self.mlp(x) + return x + + +""" +#### The DropPath layer + +Stochastic depth is a regularization technique that randomly drops a set of +layers. During inference, the layers are kept as they are. It is very +similar to Dropout, but it operates on a block of layers rather +than on individual nodes present inside a layer. +""" + + +class DropPath(layers.Layer): + """Drop Path also known as the Stochastic Depth layer. + + Refernece: + - https://keras.io/examples/vision/cct/#stochastic-depth-for-regularization + - github.com:rwightman/pytorch-image-models + """ + + def __init__(self, drop_path_prob, **kwargs): + super().__init__(**kwargs) + self.drop_path_prob = drop_path_prob + self.seed_generator = keras.random.SeedGenerator(1337) + + def call(self, x, training=False): + if training: + keep_prob = 1 - self.drop_path_prob + shape = (ops.shape(x)[0],) + (1,) * (len(ops.shape(x)) - 1) + random_tensor = keep_prob + keras.random.uniform( + shape, 0, 1, seed=self.seed_generator + ) + random_tensor = ops.floor(random_tensor) + return (x / keep_prob) * random_tensor + return x + + +""" +#### Block + +The most important operation in this paper is the **shift operation**. In this section, +we describe the shift operation and compare it with its original implementation provided +by the authors. + +A generic feature map is assumed to have the shape `[N, H, W, C]`. Here we choose a +`num_div` parameter that decides the division size of the channels. The first 4 divisions +are shifted (1 pixel) in the left, right, up, and down direction. The remaining splits +are kept as is. After partial shifting the shifted channels are padded and the overflown +pixels are chopped off. This completes the partial shifting operation. + +In the original implementation, the code is approximately: + +```python +out[:, g * 0:g * 1, :, :-1] = x[:, g * 0:g * 1, :, 1:] # shift left +out[:, g * 1:g * 2, :, 1:] = x[:, g * 1:g * 2, :, :-1] # shift right +out[:, g * 2:g * 3, :-1, :] = x[:, g * 2:g * 3, 1:, :] # shift up +out[:, g * 3:g * 4, 1:, :] = x[:, g * 3:g * 4, :-1, :] # shift down + +out[:, g * 4:, :, :] = x[:, g * 4:, :, :] # no shift +``` + +In TensorFlow it would be infeasible for us to assign shifted channels to a tensor in the +middle of the training process. This is why we have resorted to the following procedure: + +1. Split the channels with the `num_div` parameter. +2. Select each of the first four spilts and shift and pad them in the respective +directions. +3. After shifting and padding, we concatenate the channel back. + +| ![Manim rendered animation for shift operation](https://i.imgur.com/PReeULP.gif) | +| :--: | +| Figure 4: The TensorFlow style shifting | + +The entire procedure is explained in the Fig. 4. +""" + + +class ShiftViTBlock(layers.Layer): + """A unit ShiftViT Block + + Args: + shift_pixel (int): The number of pixels to shift. Default to 1. 
+ mlp_expand_ratio (int): The ratio with which MLP features are + expanded. Default to 2. + mlp_dropout_rate (float): The dropout rate used in MLP. + num_div (int): The number of divisions of the feature map's channel. + Totally, 4/num_div of channels will be shifted. Defaults to 12. + epsilon (float): Epsilon constant. + drop_path_prob (float): The drop probability for drop path. + """ + + def __init__( + self, + epsilon, + drop_path_prob, + mlp_dropout_rate, + num_div=12, + shift_pixel=1, + mlp_expand_ratio=2, + **kwargs, + ): + super().__init__(**kwargs) + self.shift_pixel = shift_pixel + self.mlp_expand_ratio = mlp_expand_ratio + self.mlp_dropout_rate = mlp_dropout_rate + self.num_div = num_div + self.epsilon = epsilon + self.drop_path_prob = drop_path_prob + + def build(self, input_shape): + self.H = input_shape[1] + self.W = input_shape[2] + self.C = input_shape[3] + self.layer_norm = layers.LayerNormalization(epsilon=self.epsilon) + self.drop_path = ( + DropPath(drop_path_prob=self.drop_path_prob) + if self.drop_path_prob > 0.0 + else layers.Activation("linear") + ) + self.mlp = MLP( + mlp_expand_ratio=self.mlp_expand_ratio, + mlp_dropout_rate=self.mlp_dropout_rate, + ) + + def get_shift_pad(self, x, mode): + """Shifts the channels according to the mode chosen.""" + if mode == "left": + offset_height = 0 + offset_width = 0 + target_height = 0 + target_width = self.shift_pixel + elif mode == "right": + offset_height = 0 + offset_width = self.shift_pixel + target_height = 0 + target_width = self.shift_pixel + elif mode == "up": + offset_height = 0 + offset_width = 0 + target_height = self.shift_pixel + target_width = 0 + else: + offset_height = self.shift_pixel + offset_width = 0 + target_height = self.shift_pixel + target_width = 0 + crop = ops.image.crop_images( + x, + top_cropping=offset_height, + left_cropping=offset_width, + target_height=self.H - target_height, + target_width=self.W - target_width, + ) + shift_pad = ops.image.pad_images( + crop, + top_padding=offset_height, + left_padding=offset_width, + target_height=self.H, + target_width=self.W, + ) + return shift_pad + + def call(self, x, training=False): + # Split the feature maps + x_splits = ops.split(x, indices_or_sections=self.C // self.num_div, axis=-1) + + # Shift the feature maps + x_splits[0] = self.get_shift_pad(x_splits[0], mode="left") + x_splits[1] = self.get_shift_pad(x_splits[1], mode="right") + x_splits[2] = self.get_shift_pad(x_splits[2], mode="up") + x_splits[3] = self.get_shift_pad(x_splits[3], mode="down") + + # Concatenate the shifted and unshifted feature maps + x = ops.concatenate(x_splits, axis=-1) + + # Add the residual connection + shortcut = x + x = shortcut + self.drop_path(self.mlp(self.layer_norm(x)), training=training) + return x + + +""" +### The ShiftViT blocks + +| ![Shift Blokcs](https://i.imgur.com/FKy5NnD.png) | +| :--: | +| Figure 5: Shift Blocks in the architecture. [Source](https://arxiv.org/abs/2201.10801) | + +Each stage of the architecture has shift blocks as shown in Fig.5. Each of these blocks +contain a variable number of stacked ShiftViT block (as built in the earlier section). + +Shift blocks are followed by a PatchMerging layer that scales down feature inputs. The +PatchMerging layer helps in the pyramidal structure of the model. +""" + +""" +#### The PatchMerging layer + +This layer merges the two adjacent tokens. This layer helps in scaling the features down +spatially and increasing the features up channel wise. We use a Conv2D layer to merge the +patches. 
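+
+As a concrete example of the shape change, once the layer below is defined, a
+`(1, 12, 12, 96)` batch of tokens is merged into a `(1, 6, 6, 192)` one: the spatial
+resolution halves and the channel count doubles. A quick illustrative check:
+
+```python
+dummy_tokens = ops.ones((1, 12, 12, 96))
+merged = PatchMerging(epsilon=config.epsilon)(dummy_tokens)
+print(merged.shape)  # (1, 6, 6, 192)
+```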
+""" + + +class PatchMerging(layers.Layer): + """The Patch Merging layer. + + Args: + epsilon (float): The epsilon constant. + """ + + def __init__(self, epsilon, **kwargs): + super().__init__(**kwargs) + self.epsilon = epsilon + + def build(self, input_shape): + filters = 2 * input_shape[-1] + self.reduction = layers.Conv2D( + filters=filters, kernel_size=2, strides=2, padding="same", use_bias=False + ) + self.layer_norm = layers.LayerNormalization(epsilon=self.epsilon) + + def call(self, x): + # Apply the patch merging algorithm on the feature maps + x = self.layer_norm(x) + x = self.reduction(x) + return x + + +""" +#### Stacked Shift Blocks + +Each stage will have a variable number of stacked ShiftViT Blocks, as suggested in +the paper. This is a generic layer that will contain the stacked shift vit blocks +with the patch merging layer as well. Combining the two operations (shift ViT +block and patch merging) is a design choice we picked for better code reusability. +""" + + +# Note: This layer will have a different depth of stacking +# for different stages on the model. +class StackedShiftBlocks(layers.Layer): + """The layer containing stacked ShiftViTBlocks. + + Args: + epsilon (float): The epsilon constant. + mlp_dropout_rate (float): The dropout rate used in the MLP block. + num_shift_blocks (int): The number of shift vit blocks for this stage. + stochastic_depth_rate (float): The maximum drop path rate chosen. + is_merge (boolean): A flag that determines the use of the Patch Merge + layer after the shift vit blocks. + num_div (int): The division of channels of the feature map. Defaults to 12. + shift_pixel (int): The number of pixels to shift. Defaults to 1. + mlp_expand_ratio (int): The ratio with which the initial dense layer of + the MLP is expanded Defaults to 2. + """ + + def __init__( + self, + epsilon, + mlp_dropout_rate, + num_shift_blocks, + stochastic_depth_rate, + is_merge, + num_div=12, + shift_pixel=1, + mlp_expand_ratio=2, + **kwargs, + ): + super().__init__(**kwargs) + self.epsilon = epsilon + self.mlp_dropout_rate = mlp_dropout_rate + self.num_shift_blocks = num_shift_blocks + self.stochastic_depth_rate = stochastic_depth_rate + self.is_merge = is_merge + self.num_div = num_div + self.shift_pixel = shift_pixel + self.mlp_expand_ratio = mlp_expand_ratio + + def build(self, input_shapes): + # Calculate stochastic depth probabilities. 
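+        # The rates grow linearly from 0 for the first block of the stage to
+        # `stochastic_depth_rate` for the last one, so deeper blocks are
+        # dropped more often during training.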
+ # Reference: https://keras.io/examples/vision/cct/#the-final-cct-model + dpr = [ + x + for x in np.linspace( + start=0, stop=self.stochastic_depth_rate, num=self.num_shift_blocks + ) + ] + + # Build the shift blocks as a list of ShiftViT Blocks + self.shift_blocks = list() + for num in range(self.num_shift_blocks): + self.shift_blocks.append( + ShiftViTBlock( + num_div=self.num_div, + epsilon=self.epsilon, + drop_path_prob=dpr[num], + mlp_dropout_rate=self.mlp_dropout_rate, + shift_pixel=self.shift_pixel, + mlp_expand_ratio=self.mlp_expand_ratio, + ) + ) + if self.is_merge: + self.patch_merge = PatchMerging(epsilon=self.epsilon) + + def call(self, x, training=False): + for shift_block in self.shift_blocks: + x = shift_block(x, training=training) + if self.is_merge: + x = self.patch_merge(x) + return x + + # Since this is a custom layer, we need to overwrite get_config() + # so that model can be easily saved & loaded after training + def get_config(self): + config = super().get_config() + config.update( + { + "epsilon": self.epsilon, + "mlp_dropout_rate": self.mlp_dropout_rate, + "num_shift_blocks": self.num_shift_blocks, + "stochastic_depth_rate": self.stochastic_depth_rate, + "is_merge": self.is_merge, + "num_div": self.num_div, + "shift_pixel": self.shift_pixel, + "mlp_expand_ratio": self.mlp_expand_ratio, + } + ) + return config + + +""" +## The ShiftViT model + +Build the ShiftViT custom model. +""" + + +class ShiftViTModel(keras.Model): + """The ShiftViT Model. + + Args: + data_augmentation (keras.Model): A data augmentation model. + projected_dim (int): The dimension to which the patches of the image are + projected. + patch_size (int): The patch size of the images. + num_shift_blocks_per_stages (list[int]): A list of all the number of shit + blocks per stage. + epsilon (float): The epsilon constant. + mlp_dropout_rate (float): The dropout rate used in the MLP block. + stochastic_depth_rate (float): The maximum drop rate probability. + num_div (int): The number of divisions of the channesl of the feature + map. Defaults to 12. + shift_pixel (int): The number of pixel to shift. Default to 1. + mlp_expand_ratio (int): The ratio with which the initial mlp dense layer + is expanded to. Defaults to 2. + """ + + def __init__( + self, + data_augmentation, + projected_dim, + patch_size, + num_shift_blocks_per_stages, + epsilon, + mlp_dropout_rate, + stochastic_depth_rate, + num_div=12, + shift_pixel=1, + mlp_expand_ratio=2, + **kwargs, + ): + super().__init__(**kwargs) + self.data_augmentation = data_augmentation + self.patch_projection = layers.Conv2D( + filters=projected_dim, + kernel_size=patch_size, + strides=patch_size, + padding="same", + ) + self.stages = list() + for index, num_shift_blocks in enumerate(num_shift_blocks_per_stages): + if index == len(num_shift_blocks_per_stages) - 1: + # This is the last stage, do not use the patch merge here. + is_merge = False + else: + is_merge = True + # Build the stages. 
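+            # All stages except the last end with a PatchMerging layer, so the
+            # spatial resolution halves and the channel count doubles from one
+            # stage to the next.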
+ self.stages.append( + StackedShiftBlocks( + epsilon=epsilon, + mlp_dropout_rate=mlp_dropout_rate, + num_shift_blocks=num_shift_blocks, + stochastic_depth_rate=stochastic_depth_rate, + is_merge=is_merge, + num_div=num_div, + shift_pixel=shift_pixel, + mlp_expand_ratio=mlp_expand_ratio, + ) + ) + self.global_avg_pool = layers.GlobalAveragePooling2D() + + self.classifier = layers.Dense(config.num_classes) + + def get_config(self): + config = super().get_config() + config.update( + { + "data_augmentation": self.data_augmentation, + "patch_projection": self.patch_projection, + "stages": self.stages, + "global_avg_pool": self.global_avg_pool, + "classifier": self.classifier, + } + ) + return config + + def _calculate_loss(self, data, training=False): + (images, labels) = data + + # Augment the images + augmented_images = self.data_augmentation(images, training=training) + + # Create patches and project the pathces. + projected_patches = self.patch_projection(augmented_images) + + # Pass through the stages + x = projected_patches + for stage in self.stages: + x = stage(x, training=training) + + # Get the logits. + x = self.global_avg_pool(x) + logits = self.classifier(x) + + # Calculate the loss and return it. + total_loss = self.compiled_loss(labels, logits) + return total_loss, labels, logits + + def train_step(self, inputs): + with tf.GradientTape() as tape: + total_loss, labels, logits = self._calculate_loss( + data=inputs, training=True + ) + + # Apply gradients. + train_vars = [ + self.data_augmentation.trainable_variables, + self.patch_projection.trainable_variables, + self.global_avg_pool.trainable_variables, + self.classifier.trainable_variables, + ] + train_vars = train_vars + [stage.trainable_variables for stage in self.stages] + + # Optimize the gradients. + grads = tape.gradient(total_loss, train_vars) + trainable_variable_list = [] + for grad, var in zip(grads, train_vars): + for g, v in zip(grad, var): + trainable_variable_list.append((g, v)) + self.optimizer.apply_gradients(trainable_variable_list) + + # Update the metrics + self.compiled_metrics.update_state(labels, logits) + return {m.name: m.result() for m in self.metrics} + + def test_step(self, data): + _, labels, logits = self._calculate_loss(data=data, training=False) + + # Update the metrics + self.compiled_metrics.update_state(labels, logits) + return {m.name: m.result() for m in self.metrics} + + def call(self, images): + augmented_images = self.data_augmentation(images) + x = self.patch_projection(augmented_images) + for stage in self.stages: + x = stage(x, training=False) + x = self.global_avg_pool(x) + logits = self.classifier(x) + return logits + + +""" +## Instantiate the model +""" + +model = ShiftViTModel( + data_augmentation=get_augmentation_model(), + projected_dim=config.projected_dim, + patch_size=config.patch_size, + num_shift_blocks_per_stages=config.num_shift_blocks_per_stages, + epsilon=config.epsilon, + mlp_dropout_rate=config.mlp_dropout_rate, + stochastic_depth_rate=config.stochastic_depth_rate, + num_div=config.num_div, + shift_pixel=config.shift_pixel, + mlp_expand_ratio=config.mlp_expand_ratio, +) + +""" +## Learning rate schedule + +In many experiments, we want to warm up the model with a slowly increasing learning rate +and then cool down the model with a slowly decaying learning rate. In the warmup cosine +decay, the learning rate linearly increases for the warmup steps and then decays with a +cosine decay. 
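+
+Concretely, for a training step `s` the schedule implemented below computes:
+
+- warmup (`s < warmup_steps`): `lr(s) = lr_start + (lr_max - lr_start) * s / warmup_steps`
+- cosine decay (`warmup_steps <= s <= total_steps`):
+`lr(s) = 0.5 * lr_max * (1 + cos(pi * (s - warmup_steps) / (total_steps - warmup_steps)))`
+- after `total_steps`: `lr(s) = 0`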
+""" + + +# Some code is taken from: +# https://www.kaggle.com/ashusma/training-rfcx-tensorflow-tpu-effnet-b2. +class WarmUpCosine(keras.optimizers.schedules.LearningRateSchedule): + """A LearningRateSchedule that uses a warmup cosine decay schedule.""" + + def __init__(self, lr_start, lr_max, warmup_steps, total_steps): + """ + Args: + lr_start: The initial learning rate + lr_max: The maximum learning rate to which lr should increase to in + the warmup steps + warmup_steps: The number of steps for which the model warms up + total_steps: The total number of steps for the model training + """ + super().__init__() + self.lr_start = lr_start + self.lr_max = lr_max + self.warmup_steps = warmup_steps + self.total_steps = total_steps + self.pi = ops.array(np.pi) + + def __call__(self, step): + # Check whether the total number of steps is larger than the warmup + # steps. If not, then throw a value error. + if self.total_steps < self.warmup_steps: + raise ValueError( + f"Total number of steps {self.total_steps} must be" + + f"larger or equal to warmup steps {self.warmup_steps}." + ) + + # `cos_annealed_lr` is a graph that increases to 1 from the initial + # step to the warmup step. After that this graph decays to -1 at the + # final step mark. + cos_annealed_lr = ops.cos( + self.pi + * (ops.cast(step, dtype="float32") - self.warmup_steps) + / ops.cast(self.total_steps - self.warmup_steps, dtype="float32") + ) + + # Shift the mean of the `cos_annealed_lr` graph to 1. Now the grpah goes + # from 0 to 2. Normalize the graph with 0.5 so that now it goes from 0 + # to 1. With the normalized graph we scale it with `lr_max` such that + # it goes from 0 to `lr_max` + learning_rate = 0.5 * self.lr_max * (1 + cos_annealed_lr) + + # Check whether warmup_steps is more than 0. + if self.warmup_steps > 0: + # Check whether lr_max is larger that lr_start. If not, throw a value + # error. + if self.lr_max < self.lr_start: + raise ValueError( + f"lr_start {self.lr_start} must be smaller or" + + f"equal to lr_max {self.lr_max}." + ) + + # Calculate the slope with which the learning rate should increase + # in the warumup schedule. The formula for slope is m = ((b-a)/steps) + slope = (self.lr_max - self.lr_start) / self.warmup_steps + + # With the formula for a straight line (y = mx+c) build the warmup + # schedule + warmup_rate = slope * ops.cast(step, dtype="float32") + self.lr_start + + # When the current step is lesser that warmup steps, get the line + # graph. When the current step is greater than the warmup steps, get + # the scaled cos graph. + learning_rate = ops.where( + step < self.warmup_steps, warmup_rate, learning_rate + ) + + # When the current step is more that the total steps, return 0 else return + # the calculated graph. + return ops.where(step > self.total_steps, 0.0, learning_rate) + + def get_config(self): + config = { + "lr_start": self.lr_start, + "lr_max": self.lr_max, + "total_steps": self.total_steps, + "warmup_steps": self.warmup_steps, + } + return config + + +""" +## Compile and train the model +""" + +# pass sample data to the model so that input shape is available at the time of +# saving the model +sample_ds, _ = next(iter(train_ds)) +model(sample_ds, training=False) + +# Get the total number of steps for training. +total_steps = int((len(x_train) / config.batch_size) * config.epochs) + +# Calculate the number of steps for warmup. +warmup_epoch_percentage = 0.15 +warmup_steps = int(total_steps * warmup_epoch_percentage) + +# Initialize the warmupcosine schedule. 
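+# The start and peak learning rates below match `config.lr_start` and
+# `config.lr_max` from the hyperparameter section.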
+scheduled_lrs = WarmUpCosine( + lr_start=1e-5, + lr_max=1e-3, + warmup_steps=warmup_steps, + total_steps=total_steps, +) + +# Get the optimizer. +optimizer = keras.optimizers.AdamW( + learning_rate=scheduled_lrs, weight_decay=config.weight_decay +) + +# Compile and pretrain the model. +model.compile( + optimizer=optimizer, + loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), + metrics=[ + keras.metrics.SparseCategoricalAccuracy(name="accuracy"), + keras.metrics.SparseTopKCategoricalAccuracy(5, name="top-5-accuracy"), + ], +) + +# Train the model +history = model.fit( + train_ds, + epochs=config.epochs, + validation_data=val_ds, + callbacks=[ + keras.callbacks.EarlyStopping( + monitor="val_accuracy", + patience=5, + mode="auto", + ) + ], +) + +# Evaluate the model with the test dataset. +print("TESTING") +loss, acc_top1, acc_top5 = model.evaluate(test_ds) +print(f"Loss: {loss:0.2f}") +print(f"Top 1 test accuracy: {acc_top1*100:0.2f}%") +print(f"Top 5 test accuracy: {acc_top5*100:0.2f}%") + +""" +## Save trained model + +Since we created the model by Subclassing, we can't save the model in HDF5 format. + +It can be saved in TF SavedModel format only. In general, this is the recommended format for saving models as well. +""" +model.export("ShiftViT") + +""" +## Model inference +""" + +""" +**Download sample data for inference** +""" + +"""shell +wget -q 'https://tinyurl.com/2p9483sw' -O inference_set.zip +unzip -q inference_set.zip +""" + + +""" +**Load saved model** +""" +# Using TFSMLayer to reload the TF SavedModel as a Keras layer. +# This is not limited to SavedModels that originate from Keras โ€“ it will work with any SavedModel, e.g. TF-Hub models. +saved_model = keras.layers.TFSMLayer("ShiftViT", call_endpoint="serving_default") + +""" +**Utility functions for inference** +""" + + +def process_image(img_path): + # read image file from string path + img = tf.io.read_file(img_path) + + # decode jpeg to uint8 tensor + img = tf.io.decode_jpeg(img, channels=3) + + # resize image to match input size accepted by model + # use `interpolation` as `nearest` to preserve dtype of input passed to `resize()` + img = ops.image.resize( + img, [config.input_shape[0], config.input_shape[1]], interpolation="nearest" + ) + return img + + +def create_tf_dataset(image_dir): + data_dir = pathlib.Path(image_dir) + + # create tf.data dataset using directory of images + predict_ds = tf.data.Dataset.list_files(str(data_dir / "*.jpg"), shuffle=False) + + # use map to convert string paths to uint8 image tensors + # setting `num_parallel_calls' helps in processing multiple images parallely + predict_ds = predict_ds.map(process_image, num_parallel_calls=AUTO) + + # create a Prefetch Dataset for better latency & throughput + predict_ds = predict_ds.batch(config.tf_ds_batch_size).prefetch(AUTO) + return predict_ds + + +def predict(predict_ds): + # ShiftViT model returns logits (non-normalized predictions) + model = keras.Sequential([saved_model]) + output_dict = model.predict(predict_ds) + logits = list(output_dict.values())[0] + + # normalize predictions by calling softmax() + probabilities = ops.softmax(logits) + return probabilities + + +def get_predicted_class(probabilities): + pred_label = np.argmax(probabilities) + predicted_class = config.label_map[pred_label] + return predicted_class + + +def get_confidence_scores(probabilities): + # get the indices of the probability scores sorted in descending order + labels = np.argsort(probabilities)[::-1] + confidences = { + config.label_map[label]: 
np.round((probabilities[label]) * 100, 2) + for label in labels + } + return confidences + + +""" +**Get predictions** +""" + +img_dir = "inference_set" +predict_ds = create_tf_dataset(img_dir) +probabilities = predict(predict_ds) +print(f"probabilities: {probabilities[0]}") +confidences = get_confidence_scores(probabilities[0]) +print(confidences) + +""" +**View predictions** +""" + +plt.figure(figsize=(10, 10)) +for images in predict_ds: + for i in range(min(6, probabilities.shape[0])): + ax = plt.subplot(3, 3, i + 1) + plt.imshow(images[i].numpy().astype("uint8")) + predicted_class = get_predicted_class(probabilities[i]) + plt.title(predicted_class) + plt.axis("off") + +""" +## Conclusion + +The most impactful contribution of the paper is not the novel architecture, but +the idea that hierarchical ViTs trained with no attention can perform quite well. This +opens up the question of how essential attention is to the performance of ViTs. + +For curious minds, we would suggest reading the +[ConvNexT](https://arxiv.org/abs/2201.03545) paper which attends more to the training +paradigms and architectural details of ViTs rather than providing a novel architecture +based on attention. + +Acknowledgements: + +- We would like to thank [PyImageSearch](https://pyimagesearch.com) for providing us with +resources that helped in the completion of this project. +- We would like to thank [JarvisLabs.ai](https://jarvislabs.ai/) for providing with the +GPU credits. +- We would like to thank [Manim Community](https://www.manim.community/) for the manim +library. +- A personal note of thanks to [Puja Roychowdhury](https://twitter.com/pleb_talks) for +helping us with the Learning Rate Schedule. +""" + +""" +**Example available on HuggingFace** + +| Trained Model | Demo | +| :--: | :--: | +| [![Generic badge](https://img.shields.io/badge/%F0%9F%A4%97%20Model-ShiftViT-brightgreen)](https://huggingface.co/keras-io/shiftvit) | [![Generic badge](https://img.shields.io/badge/%F0%9F%A4%97%20Space-ShiftViT-brightgreen)](https://huggingface.co/spaces/keras-io/shiftvit) | +""" diff --git a/knowledge_base/vision/siamese_contrastive.py b/knowledge_base/vision/siamese_contrastive.py new file mode 100644 index 0000000000000000000000000000000000000000..84ec2c8d2c3334e5f8b65d490605bd13a229158b --- /dev/null +++ b/knowledge_base/vision/siamese_contrastive.py @@ -0,0 +1,406 @@ +""" +Title: Image similarity estimation using a Siamese Network with a contrastive loss +Author: Mehdi +Date created: 2021/05/06 +Last modified: 2022/09/10 +Description: Similarity learning using a siamese network trained with a contrastive loss. +Accelerator: GPU +""" + +""" +## Introduction + +[Siamese Networks](https://en.wikipedia.org/wiki/Siamese_neural_network) +are neural networks which share weights between two or more sister networks, +each producing embedding vectors of its respective inputs. + +In supervised similarity learning, the networks are then trained to maximize the +contrast (distance) between embeddings of inputs of different classes, while minimizing the distance between +embeddings of similar classes, resulting in embedding spaces that reflect +the class segmentation of the training inputs. +""" + +""" +## Setup +""" + +import random +import numpy as np +import keras +from keras import ops +import matplotlib.pyplot as plt + +""" +## Hyperparameters +""" + +epochs = 10 +batch_size = 16 +margin = 1 # Margin for contrastive loss. 
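+# The margin sets how far apart (in terms of the model's output) dissimilar pairs
+# need to be pushed before they stop contributing to the loss; see the contrastive
+# loss definition further below.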
+ +""" +## Load the MNIST dataset +""" +(x_train_val, y_train_val), (x_test, y_test) = keras.datasets.mnist.load_data() + +# Change the data type to a floating point format +x_train_val = x_train_val.astype("float32") +x_test = x_test.astype("float32") + + +""" +## Define training and validation sets +""" + +# Keep 50% of train_val in validation set +x_train, x_val = x_train_val[:30000], x_train_val[30000:] +y_train, y_val = y_train_val[:30000], y_train_val[30000:] +del x_train_val, y_train_val + + +""" +## Create pairs of images + +We will train the model to differentiate between digits of different classes. For +example, digit `0` needs to be differentiated from the rest of the +digits (`1` through `9`), digit `1` - from `0` and `2` through `9`, and so on. +To carry this out, we will select N random images from class A (for example, +for digit `0`) and pair them with N random images from another class B +(for example, for digit `1`). Then, we can repeat this process for all classes +of digits (until digit `9`). Once we have paired digit `0` with other digits, +we can repeat this process for the remaining classes for the rest of the digits +(from `1` until `9`). +""" + + +def make_pairs(x, y): + """Creates a tuple containing image pairs with corresponding label. + + Arguments: + x: List containing images, each index in this list corresponds to one image. + y: List containing labels, each label with datatype of `int`. + + Returns: + Tuple containing two numpy arrays as (pairs_of_samples, labels), + where pairs_of_samples' shape is (2len(x), 2,n_features_dims) and + labels are a binary array of shape (2len(x)). + """ + + num_classes = max(y) + 1 + digit_indices = [np.where(y == i)[0] for i in range(num_classes)] + + pairs = [] + labels = [] + + for idx1 in range(len(x)): + # add a matching example + x1 = x[idx1] + label1 = y[idx1] + idx2 = random.choice(digit_indices[label1]) + x2 = x[idx2] + + pairs += [[x1, x2]] + labels += [0] + + # add a non-matching example + label2 = random.randint(0, num_classes - 1) + while label2 == label1: + label2 = random.randint(0, num_classes - 1) + + idx2 = random.choice(digit_indices[label2]) + x2 = x[idx2] + + pairs += [[x1, x2]] + labels += [1] + + return np.array(pairs), np.array(labels).astype("float32") + + +# make train pairs +pairs_train, labels_train = make_pairs(x_train, y_train) + +# make validation pairs +pairs_val, labels_val = make_pairs(x_val, y_val) + +# make test pairs +pairs_test, labels_test = make_pairs(x_test, y_test) + +""" +We get: + +**pairs_train.shape = (60000, 2, 28, 28)** + +- We have 60,000 pairs +- Each pair contains 2 images +- Each image has shape `(28, 28)` +""" + +""" +Split the training pairs +""" + +x_train_1 = pairs_train[:, 0] # x_train_1.shape is (60000, 28, 28) +x_train_2 = pairs_train[:, 1] + +""" +Split the validation pairs +""" + +x_val_1 = pairs_val[:, 0] # x_val_1.shape = (60000, 28, 28) +x_val_2 = pairs_val[:, 1] + +""" +Split the test pairs +""" + +x_test_1 = pairs_test[:, 0] # x_test_1.shape = (20000, 28, 28) +x_test_2 = pairs_test[:, 1] + + +""" +## Visualize pairs and their labels +""" + + +def visualize(pairs, labels, to_show=6, num_col=3, predictions=None, test=False): + """Creates a plot of pairs and labels, and prediction if it's test dataset. + + Arguments: + pairs: Numpy Array, of pairs to visualize, having shape + (Number of pairs, 2, 28, 28). + to_show: Int, number of examples to visualize (default is 6) + `to_show` must be an integral multiple of `num_col`. 
+ Otherwise it will be trimmed if it is greater than num_col, + and incremented if if it is less then num_col. + num_col: Int, number of images in one row - (default is 3) + For test and train respectively, it should not exceed 3 and 7. + predictions: Numpy Array of predictions with shape (to_show, 1) - + (default is None) + Must be passed when test=True. + test: Boolean telling whether the dataset being visualized is + train dataset or test dataset - (default False). + + Returns: + None. + """ + + # Define num_row + # If to_show % num_col != 0 + # trim to_show, + # to trim to_show limit num_row to the point where + # to_show % num_col == 0 + # + # If to_show//num_col == 0 + # then it means num_col is greater then to_show + # increment to_show + # to increment to_show set num_row to 1 + num_row = to_show // num_col if to_show // num_col != 0 else 1 + + # `to_show` must be an integral multiple of `num_col` + # we found num_row and we have num_col + # to increment or decrement to_show + # to make it integral multiple of `num_col` + # simply set it equal to num_row * num_col + to_show = num_row * num_col + + # Plot the images + fig, axes = plt.subplots(num_row, num_col, figsize=(5, 5)) + for i in range(to_show): + # If the number of rows is 1, the axes array is one-dimensional + if num_row == 1: + ax = axes[i % num_col] + else: + ax = axes[i // num_col, i % num_col] + + ax.imshow(ops.concatenate([pairs[i][0], pairs[i][1]], axis=1), cmap="gray") + ax.set_axis_off() + if test: + ax.set_title("True: {} | Pred: {:.5f}".format(labels[i], predictions[i][0])) + else: + ax.set_title("Label: {}".format(labels[i])) + if test: + plt.tight_layout(rect=(0, 0, 1.9, 1.9), w_pad=0.0) + else: + plt.tight_layout(rect=(0, 0, 1.5, 1.5)) + plt.show() + + +""" +Inspect training pairs +""" + +visualize(pairs_train[:-1], labels_train[:-1], to_show=4, num_col=4) + +""" +Inspect validation pairs +""" + +visualize(pairs_val[:-1], labels_val[:-1], to_show=4, num_col=4) + +""" +Inspect test pairs +""" + +visualize(pairs_test[:-1], labels_test[:-1], to_show=4, num_col=4) + +""" +## Define the model + +There are two input layers, each leading to its own network, which +produces embeddings. A `Lambda` layer then merges them using an +[Euclidean distance](https://en.wikipedia.org/wiki/Euclidean_distance) and the +merged output is fed to the final network. +""" + + +# Provided two tensors t1 and t2 +# Euclidean distance = sqrt(sum(square(t1-t2))) +def euclidean_distance(vects): + """Find the Euclidean distance between two vectors. + + Arguments: + vects: List containing two tensors of same length. + + Returns: + Tensor containing euclidean distance + (as floating point value) between vectors. + """ + + x, y = vects + sum_square = ops.sum(ops.square(x - y), axis=1, keepdims=True) + return ops.sqrt(ops.maximum(sum_square, keras.backend.epsilon())) + + +input = keras.layers.Input((28, 28, 1)) +x = keras.layers.BatchNormalization()(input) +x = keras.layers.Conv2D(4, (5, 5), activation="tanh")(x) +x = keras.layers.AveragePooling2D(pool_size=(2, 2))(x) +x = keras.layers.Conv2D(16, (5, 5), activation="tanh")(x) +x = keras.layers.AveragePooling2D(pool_size=(2, 2))(x) +x = keras.layers.Flatten()(x) + +x = keras.layers.BatchNormalization()(x) +x = keras.layers.Dense(10, activation="tanh")(x) +embedding_network = keras.Model(input, x) + + +input_1 = keras.layers.Input((28, 28, 1)) +input_2 = keras.layers.Input((28, 28, 1)) + +# As mentioned above, Siamese Network share weights between +# tower networks (sister networks). 
To allow this, we will use +# same embedding network for both tower networks. +tower_1 = embedding_network(input_1) +tower_2 = embedding_network(input_2) + +merge_layer = keras.layers.Lambda(euclidean_distance, output_shape=(1,))( + [tower_1, tower_2] +) +normal_layer = keras.layers.BatchNormalization()(merge_layer) +output_layer = keras.layers.Dense(1, activation="sigmoid")(normal_layer) +siamese = keras.Model(inputs=[input_1, input_2], outputs=output_layer) + + +""" +## Define the contrastive Loss +""" + + +def loss(margin=1): + """Provides 'contrastive_loss' an enclosing scope with variable 'margin'. + + Arguments: + margin: Integer, defines the baseline for distance for which pairs + should be classified as dissimilar. - (default is 1). + + Returns: + 'contrastive_loss' function with data ('margin') attached. + """ + + # Contrastive loss = mean( (1-true_value) * square(prediction) + + # true_value * square( max(margin-prediction, 0) )) + def contrastive_loss(y_true, y_pred): + """Calculates the contrastive loss. + + Arguments: + y_true: List of labels, each label is of type float32. + y_pred: List of predictions of same length as of y_true, + each label is of type float32. + + Returns: + A tensor containing contrastive loss as floating point value. + """ + + square_pred = ops.square(y_pred) + margin_square = ops.square(ops.maximum(margin - (y_pred), 0)) + return ops.mean((1 - y_true) * square_pred + (y_true) * margin_square) + + return contrastive_loss + + +""" +## Compile the model with the contrastive loss +""" + +siamese.compile(loss=loss(margin=margin), optimizer="RMSprop", metrics=["accuracy"]) +siamese.summary() + + +""" +## Train the model +""" + +history = siamese.fit( + [x_train_1, x_train_2], + labels_train, + validation_data=([x_val_1, x_val_2], labels_val), + batch_size=batch_size, + epochs=epochs, +) + +""" +## Visualize results +""" + + +def plt_metric(history, metric, title, has_valid=True): + """Plots the given 'metric' from 'history'. + + Arguments: + history: history attribute of History object returned from Model.fit. + metric: Metric to plot, a string value present as key in 'history'. + title: A string to be used as title of plot. + has_valid: Boolean, true if valid data was passed to Model.fit else false. + + Returns: + None. + """ + plt.plot(history[metric]) + if has_valid: + plt.plot(history["val_" + metric]) + plt.legend(["train", "validation"], loc="upper left") + plt.title(title) + plt.ylabel(metric) + plt.xlabel("epoch") + plt.show() + + +# Plot the accuracy +plt_metric(history=history.history, metric="accuracy", title="Model accuracy") + +# Plot the contrastive loss +plt_metric(history=history.history, metric="loss", title="Contrastive Loss") + +""" +## Evaluate the model +""" + +results = siamese.evaluate([x_test_1, x_test_2], labels_test) +print("test loss, test acc:", results) + +""" +## Visualize the predictions +""" + +predictions = siamese.predict([x_test_1, x_test_2]) +visualize(pairs_test, labels_test, to_show=3, predictions=predictions, test=True) diff --git a/knowledge_base/vision/siamese_network.py b/knowledge_base/vision/siamese_network.py new file mode 100644 index 0000000000000000000000000000000000000000..bbcf776a3e724d8648a90ddd4191baa2ecf89943 --- /dev/null +++ b/knowledge_base/vision/siamese_network.py @@ -0,0 +1,417 @@ +""" +Title: Image similarity estimation using a Siamese Network with a triplet loss +Authors: [Hazem Essam](https://twitter.com/hazemessamm) and [Santiago L. 
Valdarrama](https://twitter.com/svpino) +Date created: 2021/03/25 +Last modified: 2021/03/25 +Description: Training a Siamese Network to compare the similarity of images using a triplet loss function. +Accelerator: GPU +""" + +""" +## Introduction + +A [Siamese Network](https://en.wikipedia.org/wiki/Siamese_neural_network) is a type of network architecture that +contains two or more identical subnetworks used to generate feature vectors for each input and compare them. + +Siamese Networks can be applied to different use cases, like detecting duplicates, finding anomalies, and face recognition. + +This example uses a Siamese Network with three identical subnetworks. We will provide three images to the model, where +two of them will be similar (_anchor_ and _positive_ samples), and the third will be unrelated (a _negative_ example.) +Our goal is for the model to learn to estimate the similarity between images. + +For the network to learn, we use a triplet loss function. You can find an introduction to triplet loss in the +[FaceNet paper](https://arxiv.org/abs/1503.03832) by Schroff et al,. 2015. In this example, we define the triplet +loss function as follows: + +`L(A, P, N) = max(โ€–f(A) - f(P)โ€–ยฒ - โ€–f(A) - f(N)โ€–ยฒ + margin, 0)` + +This example uses the [Totally Looks Like dataset](https://sites.google.com/view/totally-looks-like-dataset) +by [Rosenfeld et al., 2018](https://arxiv.org/abs/1803.01485v3). +""" + +""" +## Setup +""" + +import matplotlib.pyplot as plt +import numpy as np +import os +import random +import tensorflow as tf +from pathlib import Path +from keras import applications +from keras import layers +from keras import losses +from keras import ops +from keras import optimizers +from keras import metrics +from keras import Model +from keras.applications import resnet + + +target_shape = (200, 200) + + +""" +## Load the dataset + +We are going to load the *Totally Looks Like* dataset and unzip it inside the `~/.keras` directory +in the local environment. + +The dataset consists of two separate files: + +* `left.zip` contains the images that we will use as the anchor. +* `right.zip` contains the images that we will use as the positive sample (an image that looks like the anchor). +""" + +cache_dir = Path(Path.home()) / ".keras" +anchor_images_path = cache_dir / "left" +positive_images_path = cache_dir / "right" + +"""shell +gdown --id 1jvkbTr_giSP3Ru8OwGNCg6B4PvVbcO34 +gdown --id 1EzBZUb_mh_Dp_FKD0P4XiYYSd0QBH5zW +unzip -oq left.zip -d $cache_dir +unzip -oq right.zip -d $cache_dir +""" + +""" +## Preparing the data + +We are going to use a `tf.data` pipeline to load the data and generate the triplets that we +need to train the Siamese network. + +We'll set up the pipeline using a zipped list with anchor, positive, and negative filenames as +the source. The pipeline will load and preprocess the corresponding images. +""" + + +def preprocess_image(filename): + """ + Load the specified file as a JPEG image, preprocess it and + resize it to the target shape. + """ + + image_string = tf.io.read_file(filename) + image = tf.image.decode_jpeg(image_string, channels=3) + image = tf.image.convert_image_dtype(image, tf.float32) + image = tf.image.resize(image, target_shape) + return image + + +def preprocess_triplets(anchor, positive, negative): + """ + Given the filenames corresponding to the three images, load and + preprocess them. 
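+
+    Returns:
+        A tuple of three image tensors (anchor, positive, negative), each
+        preprocessed and resized to `target_shape`.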
+ """ + + return ( + preprocess_image(anchor), + preprocess_image(positive), + preprocess_image(negative), + ) + + +""" +Let's setup our data pipeline using a zipped list with an anchor, positive, +and negative image filename as the source. The output of the pipeline +contains the same triplet with every image loaded and preprocessed. +""" + +# We need to make sure both the anchor and positive images are loaded in +# sorted order so we can match them together. +anchor_images = sorted( + [str(anchor_images_path / f) for f in os.listdir(anchor_images_path)] +) + +positive_images = sorted( + [str(positive_images_path / f) for f in os.listdir(positive_images_path)] +) + +image_count = len(anchor_images) + +anchor_dataset = tf.data.Dataset.from_tensor_slices(anchor_images) +positive_dataset = tf.data.Dataset.from_tensor_slices(positive_images) + +# To generate the list of negative images, let's randomize the list of +# available images and concatenate them together. +rng = np.random.RandomState(seed=42) +rng.shuffle(anchor_images) +rng.shuffle(positive_images) + +negative_images = anchor_images + positive_images +np.random.RandomState(seed=32).shuffle(negative_images) + +negative_dataset = tf.data.Dataset.from_tensor_slices(negative_images) +negative_dataset = negative_dataset.shuffle(buffer_size=4096) + +dataset = tf.data.Dataset.zip((anchor_dataset, positive_dataset, negative_dataset)) +dataset = dataset.shuffle(buffer_size=1024) +dataset = dataset.map(preprocess_triplets) + +# Let's now split our dataset in train and validation. +train_dataset = dataset.take(round(image_count * 0.8)) +val_dataset = dataset.skip(round(image_count * 0.8)) + +train_dataset = train_dataset.batch(32, drop_remainder=False) +train_dataset = train_dataset.prefetch(tf.data.AUTOTUNE) + +val_dataset = val_dataset.batch(32, drop_remainder=False) +val_dataset = val_dataset.prefetch(tf.data.AUTOTUNE) + + +""" +Let's take a look at a few examples of triplets. Notice how the first two images +look alike while the third one is always different. +""" + + +def visualize(anchor, positive, negative): + """Visualize a few triplets from the supplied batches.""" + + def show(ax, image): + ax.imshow(image) + ax.get_xaxis().set_visible(False) + ax.get_yaxis().set_visible(False) + + fig = plt.figure(figsize=(9, 9)) + + axs = fig.subplots(3, 3) + for i in range(3): + show(axs[i, 0], anchor[i]) + show(axs[i, 1], positive[i]) + show(axs[i, 2], negative[i]) + + +visualize(*list(train_dataset.take(1).as_numpy_iterator())[0]) + +""" +## Setting up the embedding generator model + +Our Siamese Network will generate embeddings for each of the images of the +triplet. To do this, we will use a ResNet50 model pretrained on ImageNet and +connect a few `Dense` layers to it so we can learn to separate these +embeddings. + +We will freeze the weights of all the layers of the model up until the layer `conv5_block1_out`. +This is important to avoid affecting the weights that the model has already learned. +We are going to leave the bottom few layers trainable, so that we can fine-tune their weights +during training. 
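+
+Once the code below has run, you can double-check where the freeze boundary landed
+with a quick count (an illustrative sketch, not required for training):
+
+```python
+n_trainable = sum(layer.trainable for layer in base_cnn.layers)
+print(f"{n_trainable} of {len(base_cnn.layers)} backbone layers are trainable")
+```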
+""" + +base_cnn = resnet.ResNet50( + weights="imagenet", input_shape=target_shape + (3,), include_top=False +) + +flatten = layers.Flatten()(base_cnn.output) +dense1 = layers.Dense(512, activation="relu")(flatten) +dense1 = layers.BatchNormalization()(dense1) +dense2 = layers.Dense(256, activation="relu")(dense1) +dense2 = layers.BatchNormalization()(dense2) +output = layers.Dense(256)(dense2) + +embedding = Model(base_cnn.input, output, name="Embedding") + +trainable = False +for layer in base_cnn.layers: + if layer.name == "conv5_block1_out": + trainable = True + layer.trainable = trainable + +""" +## Setting up the Siamese Network model + +The Siamese network will receive each of the triplet images as an input, +generate the embeddings, and output the distance between the anchor and the +positive embedding, as well as the distance between the anchor and the negative +embedding. + +To compute the distance, we can use a custom layer `DistanceLayer` that +returns both values as a tuple. +""" + + +class DistanceLayer(layers.Layer): + """ + This layer is responsible for computing the distance between the anchor + embedding and the positive embedding, and the anchor embedding and the + negative embedding. + """ + + def __init__(self, **kwargs): + super().__init__(**kwargs) + + def call(self, anchor, positive, negative): + ap_distance = ops.sum(tf.square(anchor - positive), -1) + an_distance = ops.sum(tf.square(anchor - negative), -1) + return (ap_distance, an_distance) + + +anchor_input = layers.Input(name="anchor", shape=target_shape + (3,)) +positive_input = layers.Input(name="positive", shape=target_shape + (3,)) +negative_input = layers.Input(name="negative", shape=target_shape + (3,)) + +distances = DistanceLayer()( + embedding(resnet.preprocess_input(anchor_input)), + embedding(resnet.preprocess_input(positive_input)), + embedding(resnet.preprocess_input(negative_input)), +) + +siamese_network = Model( + inputs=[anchor_input, positive_input, negative_input], outputs=distances +) + +""" +## Putting everything together + +We now need to implement a model with custom training loop so we can compute +the triplet loss using the three embeddings produced by the Siamese network. + +Let's create a `Mean` metric instance to track the loss of the training process. +""" + + +class SiameseModel(Model): + """The Siamese Network model with a custom training and testing loops. + + Computes the triplet loss using the three embeddings produced by the + Siamese Network. + + The triplet loss is defined as: + L(A, P, N) = max(โ€–f(A) - f(P)โ€–ยฒ - โ€–f(A) - f(N)โ€–ยฒ + margin, 0) + """ + + def __init__(self, siamese_network, margin=0.5): + super().__init__() + self.siamese_network = siamese_network + self.margin = margin + self.loss_tracker = metrics.Mean(name="loss") + + def call(self, inputs): + return self.siamese_network(inputs) + + def train_step(self, data): + # GradientTape is a context manager that records every operation that + # you do inside. We are using it here to compute the loss so we can get + # the gradients and apply them using the optimizer specified in + # `compile()`. + with tf.GradientTape() as tape: + loss = self._compute_loss(data) + + # Storing the gradients of the loss function with respect to the + # weights/parameters. 
+ gradients = tape.gradient(loss, self.siamese_network.trainable_weights) + + # Applying the gradients on the model using the specified optimizer + self.optimizer.apply_gradients( + zip(gradients, self.siamese_network.trainable_weights) + ) + + # Let's update and return the training loss metric. + self.loss_tracker.update_state(loss) + return {"loss": self.loss_tracker.result()} + + def test_step(self, data): + loss = self._compute_loss(data) + + # Let's update and return the loss metric. + self.loss_tracker.update_state(loss) + return {"loss": self.loss_tracker.result()} + + def _compute_loss(self, data): + # The output of the network is a tuple containing the distances + # between the anchor and the positive example, and the anchor and + # the negative example. + ap_distance, an_distance = self.siamese_network(data) + + # Computing the Triplet Loss by subtracting both distances and + # making sure we don't get a negative value. + loss = ap_distance - an_distance + loss = tf.maximum(loss + self.margin, 0.0) + return loss + + @property + def metrics(self): + # We need to list our metrics here so the `reset_states()` can be + # called automatically. + return [self.loss_tracker] + + +""" +## Training + +We are now ready to train our model. +""" + +siamese_model = SiameseModel(siamese_network) +siamese_model.compile(optimizer=optimizers.Adam(0.0001)) +siamese_model.fit(train_dataset, epochs=10, validation_data=val_dataset) + +""" +## Inspecting what the network has learned + +At this point, we can check how the network learned to separate the embeddings +depending on whether they belong to similar images. + +We can use [cosine similarity](https://en.wikipedia.org/wiki/Cosine_similarity) to measure the +similarity between embeddings. + +Let's pick a sample from the dataset to check the similarity between the +embeddings generated for each image. +""" +sample = next(iter(train_dataset)) +visualize(*sample) + +anchor, positive, negative = sample +anchor_embedding, positive_embedding, negative_embedding = ( + embedding(resnet.preprocess_input(anchor)), + embedding(resnet.preprocess_input(positive)), + embedding(resnet.preprocess_input(negative)), +) + +""" +Finally, we can compute the cosine similarity between the anchor and positive +images and compare it with the similarity between the anchor and the negative +images. + +We should expect the similarity between the anchor and positive images to be +larger than the similarity between the anchor and the negative images. +""" + +cosine_similarity = metrics.CosineSimilarity() + +positive_similarity = cosine_similarity(anchor_embedding, positive_embedding) +print("Positive similarity:", positive_similarity.numpy()) + +negative_similarity = cosine_similarity(anchor_embedding, negative_embedding) +print("Negative similarity", negative_similarity.numpy()) + + +""" +## Summary + +1. The `tf.data` API enables you to build efficient input pipelines for your model. It is +particularly useful if you have a large dataset. You can learn more about `tf.data` +pipelines in [tf.data: Build TensorFlow input pipelines](https://www.tensorflow.org/guide/data). + +2. In this example, we use a pre-trained ResNet50 as part of the subnetwork that generates +the feature embeddings. By using [transfer learning](https://www.tensorflow.org/guide/keras/transfer_learning?hl=en), +we can significantly reduce the training time and size of the dataset. + +3. 
Notice how we are [fine-tuning](https://www.tensorflow.org/guide/keras/transfer_learning?hl=en#fine-tuning)
+the weights of the final layers of the ResNet50 network but keeping the rest of the layers untouched.
+Using the name assigned to each layer, we can freeze the weights up to a certain point and keep the last few layers trainable.
+
+4. We can create custom layers by creating a class that inherits from `tf.keras.layers.Layer`,
+as we did in the `DistanceLayer` class.
+
+5. We used a cosine similarity metric to measure how similar the two output embeddings are to each other.
+
+6. You can implement a custom training loop by overriding the `train_step()` method. `train_step()` uses
+[`tf.GradientTape`](https://www.tensorflow.org/api_docs/python/tf/GradientTape),
+which records every operation that you perform inside it. In this example, we use it to access the
+gradients passed to the optimizer to update the model weights at every step. For more details, check out the
+[Intro to Keras for researchers](https://keras.io/getting_started/intro_to_keras_for_researchers/)
+and [Writing a training loop from scratch](https://www.tensorflow.org/guide/keras/writing_a_training_loop_from_scratch?hl=en).
+
+"""
diff --git a/knowledge_base/vision/simsiam.py b/knowledge_base/vision/simsiam.py
new file mode 100644
index 0000000000000000000000000000000000000000..4a671cd12f293f864e39acb4c82eacaf39916827
--- /dev/null
+++ b/knowledge_base/vision/simsiam.py
@@ -0,0 +1,450 @@
+"""
+Title: Self-supervised contrastive learning with SimSiam
+Author: [Sayak Paul](https://twitter.com/RisingSayak)
+Date created: 2021/03/19
+Last modified: 2023/12/29
+Description: Implementation of a self-supervised learning method for computer vision.
+Accelerator: GPU
+"""
+
+"""
+Self-supervised learning (SSL) is an interesting branch of study in the field of
+representation learning. SSL systems try to formulate a supervised signal from a corpus
+of unlabeled data points. For example, we might train a deep neural network to predict the
+next word from a given set of words. In the literature, these tasks are known as *pretext
+tasks* or *auxiliary tasks*. If we [train such a network](https://arxiv.org/abs/1801.06146) on a huge dataset (such as
+the [Wikipedia text corpus](https://www.corpusdata.org/wikipedia.asp)), it learns very effective
+representations that transfer well to downstream tasks. Language models like
+[BERT](https://arxiv.org/abs/1810.04805), [GPT-3](https://arxiv.org/abs/2005.14165), and
+[ELMo](https://allennlp.org/elmo) all benefit from this.
+
+Much like these language models, we can train computer vision models using similar
+approaches. To make things work in computer vision, we need to formulate the learning
+tasks such that the underlying model (a deep neural network) is able to make sense of the
+semantic information present in vision data. One such task is to train a model to _contrast_
+between two different versions of the same image. The hope is that in this way the model
+will learn representations in which similar images are grouped as close together as possible,
+while dissimilar images end up further apart.
+
+In this example, we will be implementing one such system called **SimSiam** proposed in
+[Exploring Simple Siamese Representation Learning](https://arxiv.org/abs/2011.10566). It
+works as follows:
+
+1. We create two different versions of the same dataset with a stochastic data
+augmentation pipeline. Note that the random initialization seed needs to be the same
+when creating these versions.
+2.
We take a ResNet without any classification head (**backbone**) and we add a shallow +fully-connected network (**projection head**) on top of it. Collectively, this is known +as the **encoder**. +3. We pass the output of the encoder through a **predictor** which is again a shallow +fully-connected network having an +[AutoEncoder](https://en.wikipedia.org/wiki/Autoencoder) like structure. +4. We then train our encoder to maximize the cosine similarity between the two different +versions of our dataset. + +""" + +""" +## Setup +""" +import os + +os.environ["KERAS_BACKEND"] = "tensorflow" +import keras +import keras_cv +from keras import ops +from keras import layers +from keras import regularizers +import tensorflow as tf + +import matplotlib.pyplot as plt +import numpy as np + +""" +## Define hyperparameters +""" + +AUTO = tf.data.AUTOTUNE +BATCH_SIZE = 128 +EPOCHS = 5 +CROP_TO = 32 +SEED = 26 + +PROJECT_DIM = 2048 +LATENT_DIM = 512 +WEIGHT_DECAY = 0.0005 + +""" +## Load the CIFAR-10 dataset +""" + +(x_train, y_train), (x_test, y_test) = keras.datasets.cifar10.load_data() +print(f"Total training examples: {len(x_train)}") +print(f"Total test examples: {len(x_test)}") + +""" +## Defining our data augmentation pipeline + +As studied in [SimCLR](https://arxiv.org/abs/2002.05709) having the right data +augmentation pipeline is critical for SSL systems to work effectively in computer vision. +Two particular augmentation transforms that seem to matter the most are: 1.) Random +resized crops and 2.) Color distortions. Most of the other SSL systems for computer +vision (such as [BYOL](https://arxiv.org/abs/2006.07733), +[MoCoV2](https://arxiv.org/abs/2003.04297), [SwAV](https://arxiv.org/abs/2006.09882), +etc.) include these in their training pipelines. +""" + + +strength = [0.4, 0.4, 0.3, 0.1] + +random_flip = layers.RandomFlip(mode="horizontal_and_vertical") +random_crop = layers.RandomCrop(CROP_TO, CROP_TO) +random_brightness = layers.RandomBrightness(0.8 * strength[0]) +random_contrast = layers.RandomContrast((1 - 0.8 * strength[1], 1 + 0.8 * strength[1])) +random_saturation = keras_cv.layers.RandomSaturation( + (0.5 - 0.8 * strength[2], 0.5 + 0.8 * strength[2]) +) +random_hue = keras_cv.layers.RandomHue(0.2 * strength[3], [0, 255]) +grayscale = keras_cv.layers.Grayscale() + + +def flip_random_crop(image): + # With random crops we also apply horizontal flipping. + image = random_flip(image) + image = random_crop(image) + return image + + +def color_jitter(x): + x = random_brightness(x) + x = random_contrast(x) + x = random_saturation(x) + x = random_hue(x) + # Affine transformations can disturb the natural range of + # RGB images, hence this is needed. + x = ops.clip(x, 0, 255) + return x + + +def color_drop(x): + x = grayscale(x) + x = ops.tile(x, [1, 1, 3]) + return x + + +def random_apply(func, x, p): + if keras.random.uniform([], minval=0, maxval=1) < p: + return func(x) + else: + return x + + +def custom_augment(image): + # As discussed in the SimCLR paper, the series of augmentation + # transformations (except for random crops) need to be applied + # randomly to impose translational invariance. + image = flip_random_crop(image) + image = random_apply(color_jitter, image, p=0.8) + image = random_apply(color_drop, image, p=0.2) + return image + + +""" +It should be noted that an augmentation pipeline is generally dependent on various +properties of the dataset we are dealing with. 
For example, if images in the dataset are +heavily object-centric then taking random crops with a very high probability may hurt the +training performance. + +Let's now apply our augmentation pipeline to our dataset and visualize a few outputs. +""" + +""" +## Convert the data into TensorFlow `Dataset` objects + +Here we create two different versions of our dataset *without* any ground-truth labels. +""" + +ssl_ds_one = tf.data.Dataset.from_tensor_slices(x_train) +ssl_ds_one = ( + ssl_ds_one.shuffle(1024, seed=SEED) + .map(custom_augment, num_parallel_calls=AUTO) + .batch(BATCH_SIZE) + .prefetch(AUTO) +) + +ssl_ds_two = tf.data.Dataset.from_tensor_slices(x_train) +ssl_ds_two = ( + ssl_ds_two.shuffle(1024, seed=SEED) + .map(custom_augment, num_parallel_calls=AUTO) + .batch(BATCH_SIZE) + .prefetch(AUTO) +) + +# We then zip both of these datasets. +ssl_ds = tf.data.Dataset.zip((ssl_ds_one, ssl_ds_two)) + +# Visualize a few augmented images. +sample_images_one = next(iter(ssl_ds_one)) +plt.figure(figsize=(10, 10)) +for n in range(25): + ax = plt.subplot(5, 5, n + 1) + plt.imshow(sample_images_one[n].numpy().astype("int")) + plt.axis("off") +plt.show() + +# Ensure that the different versions of the dataset actually contain +# identical images. +sample_images_two = next(iter(ssl_ds_two)) +plt.figure(figsize=(10, 10)) +for n in range(25): + ax = plt.subplot(5, 5, n + 1) + plt.imshow(sample_images_two[n].numpy().astype("int")) + plt.axis("off") +plt.show() + +""" +Notice that the images in `samples_images_one` and `sample_images_two` are essentially +the same but are augmented differently. +""" + +""" +## Defining the encoder and the predictor + +We use an implementation of ResNet20 that is specifically configured for the CIFAR10 +dataset. The code is taken from the +[keras-idiomatic-programmer](https://github.com/GoogleCloudPlatform/keras-idiomatic-programmer/blob/master/zoo/resnet/resnet_cifar10_v2.py) repository. The hyperparameters of +these architectures have been referred from Section 3 and Appendix A of [the original +paper](https://arxiv.org/abs/2011.10566). +""" + +"""shell +wget -q https://git.io/JYx2x -O resnet_cifar10_v2.py +""" + +import resnet_cifar10_v2 + +N = 2 +DEPTH = N * 9 + 2 +NUM_BLOCKS = ((DEPTH - 2) // 9) - 1 + + +def get_encoder(): + # Input and backbone. + inputs = layers.Input((CROP_TO, CROP_TO, 3)) + x = layers.Rescaling(scale=1.0 / 127.5, offset=-1)(inputs) + x = resnet_cifar10_v2.stem(x) + x = resnet_cifar10_v2.learner(x, NUM_BLOCKS) + x = layers.GlobalAveragePooling2D(name="backbone_pool")(x) + + # Projection head. + x = layers.Dense( + PROJECT_DIM, use_bias=False, kernel_regularizer=regularizers.l2(WEIGHT_DECAY) + )(x) + x = layers.BatchNormalization()(x) + x = layers.ReLU()(x) + x = layers.Dense( + PROJECT_DIM, use_bias=False, kernel_regularizer=regularizers.l2(WEIGHT_DECAY) + )(x) + outputs = layers.BatchNormalization()(x) + return keras.Model(inputs, outputs, name="encoder") + + +def get_predictor(): + model = keras.Sequential( + [ + # Note the AutoEncoder-like structure. + layers.Input((PROJECT_DIM,)), + layers.Dense( + LATENT_DIM, + use_bias=False, + kernel_regularizer=regularizers.l2(WEIGHT_DECAY), + ), + layers.ReLU(), + layers.BatchNormalization(), + layers.Dense(PROJECT_DIM), + ], + name="predictor", + ) + return model + + +""" +## Defining the (pre-)training loop + +One of the main reasons behind training networks with these kinds of approaches is to +utilize the learned representations for downstream tasks like classification. 
This is why +this particular training phase is also referred to as _pre-training_. + +We start by defining the loss function. +""" + + +def compute_loss(p, z): + # The authors of SimSiam emphasize the impact of + # the `stop_gradient` operator in the paper as it + # has an important role in the overall optimization. + z = ops.stop_gradient(z) + p = keras.utils.normalize(p, axis=1, order=2) + z = keras.utils.normalize(z, axis=1, order=2) + # Negative cosine similarity (minimizing this is + # equivalent to maximizing the similarity). + return -ops.mean(ops.sum((p * z), axis=1)) + + +""" +We then define our training loop by overriding the `train_step()` function of the +`keras.Model` class. +""" + + +class SimSiam(keras.Model): + def __init__(self, encoder, predictor): + super().__init__() + self.encoder = encoder + self.predictor = predictor + self.loss_tracker = keras.metrics.Mean(name="loss") + + @property + def metrics(self): + return [self.loss_tracker] + + def train_step(self, data): + # Unpack the data. + ds_one, ds_two = data + + # Forward pass through the encoder and predictor. + with tf.GradientTape() as tape: + z1, z2 = self.encoder(ds_one), self.encoder(ds_two) + p1, p2 = self.predictor(z1), self.predictor(z2) + # Note that here we are enforcing the network to match + # the representations of two differently augmented batches + # of data. + loss = compute_loss(p1, z2) / 2 + compute_loss(p2, z1) / 2 + + # Compute gradients and update the parameters. + learnable_params = ( + self.encoder.trainable_variables + self.predictor.trainable_variables + ) + gradients = tape.gradient(loss, learnable_params) + self.optimizer.apply_gradients(zip(gradients, learnable_params)) + + # Monitor loss. + self.loss_tracker.update_state(loss) + return {"loss": self.loss_tracker.result()} + + +""" +## Pre-training our networks + +In the interest of this example, we will train the model for only 5 epochs. In reality, +this should at least be 100 epochs. +""" + +# Create a cosine decay learning scheduler. +num_training_samples = len(x_train) +steps = EPOCHS * (num_training_samples // BATCH_SIZE) +lr_decayed_fn = keras.optimizers.schedules.CosineDecay( + initial_learning_rate=0.03, decay_steps=steps +) + +# Create an early stopping callback. +early_stopping = keras.callbacks.EarlyStopping( + monitor="loss", patience=5, restore_best_weights=True +) + +# Compile model and start training. +simsiam = SimSiam(get_encoder(), get_predictor()) +simsiam.compile(optimizer=keras.optimizers.SGD(lr_decayed_fn, momentum=0.6)) +history = simsiam.fit(ssl_ds, epochs=EPOCHS, callbacks=[early_stopping]) + +# Visualize the training progress of the model. +plt.plot(history.history["loss"]) +plt.grid() +plt.title("Negative Cosine Similairty") +plt.show() + +""" +If your solution gets very close to -1 (minimum value of our loss) very quickly with a +different dataset and a different backbone architecture that is likely because of +*representation collapse*. It is a phenomenon where the encoder yields similar output for +all the images. In that case additional hyperparameter tuning is required especially in +the following areas: + +* Strength of the color distortions and their probabilities. +* Learning rate and its schedule. +* Architecture of both the backbone and their projection head. 
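+
+One simple diagnostic you can add (not part of the original example) is to monitor the
+per-channel standard deviation of the l2-normalized encoder outputs: values close to
+zero indicate collapse, while a healthy run stays roughly around `1 / sqrt(PROJECT_DIM)`.
+
+```python
+# Illustrative collapse check on a single batch of augmented images.
+z = simsiam.encoder.predict(next(iter(ssl_ds_one)))
+z = z / np.linalg.norm(z, axis=1, keepdims=True)
+print("Mean per-channel std:", np.std(z, axis=0).mean())
+```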
+ +""" + +""" +## Evaluating our SSL method + +The most popularly used method to evaluate a SSL method in computer vision (or any other +pre-training method as such) is to learn a linear classifier on the frozen features of +the trained backbone model (in this case it is ResNet20) and evaluate the classifier on +unseen images. Other methods include +[fine-tuning](https://keras.io/guides/transfer_learning/) on the source dataset or even a +target dataset with 5% or 10% labels present. Practically, we can use the backbone model +for any downstream task such as semantic segmentation, object detection, and so on where +the backbone models are usually pre-trained with *pure supervised learning*. +""" + +# We first create labeled `Dataset` objects. +train_ds = tf.data.Dataset.from_tensor_slices((x_train, y_train)) +test_ds = tf.data.Dataset.from_tensor_slices((x_test, y_test)) + +# Then we shuffle, batch, and prefetch this dataset for performance. We +# also apply random resized crops as an augmentation but only to the +# training set. +train_ds = ( + train_ds.shuffle(1024) + .map(lambda x, y: (flip_random_crop(x), y), num_parallel_calls=AUTO) + .batch(BATCH_SIZE) + .prefetch(AUTO) +) +test_ds = test_ds.batch(BATCH_SIZE).prefetch(AUTO) + +# Extract the backbone ResNet20. +backbone = keras.Model( + simsiam.encoder.input, simsiam.encoder.get_layer("backbone_pool").output +) + +# We then create our linear classifier and train it. +backbone.trainable = False +inputs = layers.Input((CROP_TO, CROP_TO, 3)) +x = backbone(inputs, training=False) +outputs = layers.Dense(10, activation="softmax")(x) +linear_model = keras.Model(inputs, outputs, name="linear_model") + +# Compile model and start training. +linear_model.compile( + loss="sparse_categorical_crossentropy", + metrics=["accuracy"], + optimizer=keras.optimizers.SGD(lr_decayed_fn, momentum=0.9), +) +history = linear_model.fit( + train_ds, validation_data=test_ds, epochs=EPOCHS, callbacks=[early_stopping] +) +_, test_acc = linear_model.evaluate(test_ds) +print("Test accuracy: {:.2f}%".format(test_acc * 100)) + +""" + +## Notes +* More data and longer pre-training schedule benefit SSL in general. +* SSL is particularly very helpful when you do not have access to very limited *labeled* +training data but you can manage to build a large corpus of unlabeled data. Recently, +using an SSL method called [SwAV](https://arxiv.org/abs/2006.09882), a group of +researchers at Facebook trained a [RegNet](https://arxiv.org/abs/2006.09882) on 2 Billion +images. They were able to achieve downstream performance very close to those achieved by +pure supervised pre-training. For some downstream tasks, their method even outperformed +the supervised counterparts. You can check out [their +paper](https://arxiv.org/pdf/2103.01988.pdf) to know the details. 
+* If you are interested to understand why contrastive SSL helps networks learn meaningful +representations, you can check out the following resources: + * [Self-supervised learning: The dark matter of +intelligence](https://ai.facebook.com/blog/self-supervised-learning-the-dark-matter-of-intelligence/) + * [Understanding self-supervised learning using controlled datasets with known +structure](https://sslneuips20.github.io/files/CameraReadys%203-77/64/CameraReady/Understanding_self_supervised_learning.pdf) + +""" diff --git a/knowledge_base/vision/super_resolution_sub_pixel.py b/knowledge_base/vision/super_resolution_sub_pixel.py new file mode 100644 index 0000000000000000000000000000000000000000..f3cccb776b0acadbc3b4b691c559b981291f09bf --- /dev/null +++ b/knowledge_base/vision/super_resolution_sub_pixel.py @@ -0,0 +1,409 @@ +""" +Title: Image Super-Resolution using an Efficient Sub-Pixel CNN +Author: [Xingyu Long](https://github.com/xingyu-long) +Date created: 2020/07/28 +Last modified: 2020/08/27 +Description: Implementing Super-Resolution using Efficient sub-pixel model on BSDS500. +Accelerator: GPU +Converted to Keras 3 by: [Md Awsfalur Rahman](https://awsaf49.github.io) +""" + +""" +## Introduction + +ESPCN (Efficient Sub-Pixel CNN), proposed by [Shi, 2016](https://arxiv.org/abs/1609.05158) +is a model that reconstructs a high-resolution version of an image given a low-resolution +version. +It leverages efficient "sub-pixel convolution" layers, which learns an array of +image upscaling filters. + +In this code example, we will implement the model from the paper and train it on a small +dataset, +[BSDS500](https://www2.eecs.berkeley.edu/Research/Projects/CS/vision/grouping/resources.html). +[BSDS500](https://www2.eecs.berkeley.edu/Research/Projects/CS/vision/grouping/resources.html). +""" + +""" +## Setup +""" + +import keras +from keras import layers +from keras import ops +from keras.utils import load_img +from keras.utils import array_to_img +from keras.utils import img_to_array +from keras.preprocessing import image_dataset_from_directory +import tensorflow as tf # only for data preprocessing + +import os +import math +import numpy as np + +from IPython.display import display + +""" +## Load data: BSDS500 dataset + +### Download dataset + +We use the built-in `keras.utils.get_file` utility to retrieve the dataset. +""" + +dataset_url = "http://www.eecs.berkeley.edu/Research/Projects/CS/vision/grouping/BSR/BSR_bsds500.tgz" +data_dir = keras.utils.get_file(origin=dataset_url, fname="BSR", untar=True) +root_dir = os.path.join(data_dir, "BSDS500/data") + +""" +We create training and validation datasets via `image_dataset_from_directory`. +""" + +crop_size = 300 +upscale_factor = 3 +input_size = crop_size // upscale_factor +batch_size = 8 + +train_ds = image_dataset_from_directory( + root_dir, + batch_size=batch_size, + image_size=(crop_size, crop_size), + validation_split=0.2, + subset="training", + seed=1337, + label_mode=None, +) + +valid_ds = image_dataset_from_directory( + root_dir, + batch_size=batch_size, + image_size=(crop_size, crop_size), + validation_split=0.2, + subset="validation", + seed=1337, + label_mode=None, +) + +""" +We rescale the images to take values in the range [0, 1]. 
+""" + + +def scaling(input_image): + input_image = input_image / 255.0 + return input_image + + +# Scale from (0, 255) to (0, 1) +train_ds = train_ds.map(scaling) +valid_ds = valid_ds.map(scaling) + +""" +Let's visualize a few sample images: +""" + +for batch in train_ds.take(1): + for img in batch: + display(array_to_img(img)) + +""" +We prepare a dataset of test image paths that we will use for +visual evaluation at the end of this example. +""" + +dataset = os.path.join(root_dir, "images") +test_path = os.path.join(dataset, "test") + +test_img_paths = sorted( + [ + os.path.join(test_path, fname) + for fname in os.listdir(test_path) + if fname.endswith(".jpg") + ] +) + +""" +## Crop and resize images + +Let's process image data. +First, we convert our images from the RGB color space to the +[YUV colour space](https://en.wikipedia.org/wiki/YUV). + +For the input data (low-resolution images), +we crop the image, retrieve the `y` channel (luninance), +and resize it with the `area` method (use `BICUBIC` if you use PIL). +We only consider the luminance channel +in the YUV color space because humans are more sensitive to +luminance change. + +For the target data (high-resolution images), we just crop the image +and retrieve the `y` channel. +""" + + +# Use TF Ops to process. +def process_input(input, input_size, upscale_factor): + input = tf.image.rgb_to_yuv(input) + last_dimension_axis = len(input.shape) - 1 + y, u, v = tf.split(input, 3, axis=last_dimension_axis) + return tf.image.resize(y, [input_size, input_size], method="area") + + +def process_target(input): + input = tf.image.rgb_to_yuv(input) + last_dimension_axis = len(input.shape) - 1 + y, u, v = tf.split(input, 3, axis=last_dimension_axis) + return y + + +train_ds = train_ds.map( + lambda x: (process_input(x, input_size, upscale_factor), process_target(x)) +) +train_ds = train_ds.prefetch(buffer_size=32) + +valid_ds = valid_ds.map( + lambda x: (process_input(x, input_size, upscale_factor), process_target(x)) +) +valid_ds = valid_ds.prefetch(buffer_size=32) + +""" +Let's take a look at the input and target data. +""" + +for batch in train_ds.take(1): + for img in batch[0]: + display(array_to_img(img)) + for img in batch[1]: + display(array_to_img(img)) + +""" +## Build a model + +Compared to the paper, we add one more layer and we use the `relu` activation function +instead of `tanh`. +It achieves better performance even though we train the model for fewer epochs. 
+""" + + +class DepthToSpace(layers.Layer): + def __init__(self, block_size): + super().__init__() + self.block_size = block_size + + def call(self, input): + batch, height, width, depth = ops.shape(input) + depth = depth // (self.block_size**2) + + x = ops.reshape( + input, [batch, height, width, self.block_size, self.block_size, depth] + ) + x = ops.transpose(x, [0, 1, 3, 2, 4, 5]) + x = ops.reshape( + x, [batch, height * self.block_size, width * self.block_size, depth] + ) + return x + + +def get_model(upscale_factor=3, channels=1): + conv_args = { + "activation": "relu", + "kernel_initializer": "orthogonal", + "padding": "same", + } + inputs = keras.Input(shape=(None, None, channels)) + x = layers.Conv2D(64, 5, **conv_args)(inputs) + x = layers.Conv2D(64, 3, **conv_args)(x) + x = layers.Conv2D(32, 3, **conv_args)(x) + x = layers.Conv2D(channels * (upscale_factor**2), 3, **conv_args)(x) + outputs = DepthToSpace(upscale_factor)(x) + + return keras.Model(inputs, outputs) + + +""" +## Define utility functions + +We need to define several utility functions to monitor our results: + +- `plot_results` to plot an save an image. +- `get_lowres_image` to convert an image to its low-resolution version. +- `upscale_image` to turn a low-resolution image to +a high-resolution version reconstructed by the model. +In this function, we use the `y` channel from the YUV color space +as input to the model and then combine the output with the +other channels to obtain an RGB image. +""" + +import matplotlib.pyplot as plt +from mpl_toolkits.axes_grid1.inset_locator import zoomed_inset_axes +from mpl_toolkits.axes_grid1.inset_locator import mark_inset +import PIL + + +def plot_results(img, prefix, title): + """Plot the result with zoom-in area.""" + img_array = img_to_array(img) + img_array = img_array.astype("float32") / 255.0 + + # Create a new figure with a default 111 subplot. + fig, ax = plt.subplots() + im = ax.imshow(img_array[::-1], origin="lower") + + plt.title(title) + # zoom-factor: 2.0, location: upper-left + axins = zoomed_inset_axes(ax, 2, loc=2) + axins.imshow(img_array[::-1], origin="lower") + + # Specify the limits. + x1, x2, y1, y2 = 200, 300, 100, 200 + # Apply the x-limits. + axins.set_xlim(x1, x2) + # Apply the y-limits. + axins.set_ylim(y1, y2) + + plt.yticks(visible=False) + plt.xticks(visible=False) + + # Make the line. + mark_inset(ax, axins, loc1=1, loc2=3, fc="none", ec="blue") + plt.savefig(str(prefix) + "-" + title + ".png") + plt.show() + + +def get_lowres_image(img, upscale_factor): + """Return low-resolution image to use as model input.""" + return img.resize( + (img.size[0] // upscale_factor, img.size[1] // upscale_factor), + PIL.Image.BICUBIC, + ) + + +def upscale_image(model, img): + """Predict the result based on input image and restore the image as RGB.""" + ycbcr = img.convert("YCbCr") + y, cb, cr = ycbcr.split() + y = img_to_array(y) + y = y.astype("float32") / 255.0 + + input = np.expand_dims(y, axis=0) + out = model.predict(input) + + out_img_y = out[0] + out_img_y *= 255.0 + + # Restore the image in RGB color space. 
+ out_img_y = out_img_y.clip(0, 255) + out_img_y = out_img_y.reshape((np.shape(out_img_y)[0], np.shape(out_img_y)[1])) + out_img_y = PIL.Image.fromarray(np.uint8(out_img_y), mode="L") + out_img_cb = cb.resize(out_img_y.size, PIL.Image.BICUBIC) + out_img_cr = cr.resize(out_img_y.size, PIL.Image.BICUBIC) + out_img = PIL.Image.merge("YCbCr", (out_img_y, out_img_cb, out_img_cr)).convert( + "RGB" + ) + return out_img + + +""" +## Define callbacks to monitor training + +The `ESPCNCallback` object will compute and display +the [PSNR](https://en.wikipedia.org/wiki/Peak_signal-to-noise_ratio) metric. +This is the main metric we use to evaluate super-resolution performance. +""" + + +class ESPCNCallback(keras.callbacks.Callback): + def __init__(self): + super().__init__() + self.test_img = get_lowres_image(load_img(test_img_paths[0]), upscale_factor) + + # Store PSNR value in each epoch. + def on_epoch_begin(self, epoch, logs=None): + self.psnr = [] + + def on_epoch_end(self, epoch, logs=None): + print("Mean PSNR for epoch: %.2f" % (np.mean(self.psnr))) + if epoch % 20 == 0: + prediction = upscale_image(self.model, self.test_img) + plot_results(prediction, "epoch-" + str(epoch), "prediction") + + def on_test_batch_end(self, batch, logs=None): + self.psnr.append(10 * math.log10(1 / logs["loss"])) + + +""" +Define `ModelCheckpoint` and `EarlyStopping` callbacks. +""" + +early_stopping_callback = keras.callbacks.EarlyStopping(monitor="loss", patience=10) + +checkpoint_filepath = "/tmp/checkpoint.keras" + +model_checkpoint_callback = keras.callbacks.ModelCheckpoint( + filepath=checkpoint_filepath, + save_weights_only=False, + monitor="loss", + mode="min", + save_best_only=True, +) + +model = get_model(upscale_factor=upscale_factor, channels=1) +model.summary() + +callbacks = [ESPCNCallback(), early_stopping_callback, model_checkpoint_callback] +loss_fn = keras.losses.MeanSquaredError() +optimizer = keras.optimizers.Adam(learning_rate=0.001) + +""" +## Train the model +""" + +epochs = 100 + +model.compile( + optimizer=optimizer, + loss=loss_fn, +) + +model.fit( + train_ds, epochs=epochs, callbacks=callbacks, validation_data=valid_ds, verbose=2 +) + +# The model weights (that are considered the best) are loaded into the model. +model.load_weights(checkpoint_filepath) + +""" +## Run model prediction and plot the results + +Let's compute the reconstructed version of a few images and save the results. +""" + +total_bicubic_psnr = 0.0 +total_test_psnr = 0.0 + +for index, test_img_path in enumerate(test_img_paths[50:60]): + img = load_img(test_img_path) + lowres_input = get_lowres_image(img, upscale_factor) + w = lowres_input.size[0] * upscale_factor + h = lowres_input.size[1] * upscale_factor + highres_img = img.resize((w, h)) + prediction = upscale_image(model, lowres_input) + lowres_img = lowres_input.resize((w, h)) + lowres_img_arr = img_to_array(lowres_img) + highres_img_arr = img_to_array(highres_img) + predict_img_arr = img_to_array(prediction) + bicubic_psnr = tf.image.psnr(lowres_img_arr, highres_img_arr, max_val=255) + test_psnr = tf.image.psnr(predict_img_arr, highres_img_arr, max_val=255) + + total_bicubic_psnr += bicubic_psnr + total_test_psnr += test_psnr + + print( + "PSNR of low resolution image and high resolution image is %.4f" % bicubic_psnr + ) + print("PSNR of predict and high resolution is %.4f" % test_psnr) + plot_results(lowres_img, index, "lowres") + plot_results(highres_img, index, "highres") + plot_results(prediction, index, "prediction") + +print("Avg. 
PSNR of lowres images is %.4f" % (total_bicubic_psnr / 10)) +print("Avg. PSNR of reconstructions is %.4f" % (total_test_psnr / 10)) diff --git a/knowledge_base/vision/supervised-contrastive-learning.py b/knowledge_base/vision/supervised-contrastive-learning.py new file mode 100644 index 0000000000000000000000000000000000000000..7f6bfb8686e748c7b7f23786b0bc6fe5977bc178 --- /dev/null +++ b/knowledge_base/vision/supervised-contrastive-learning.py @@ -0,0 +1,238 @@ +""" +Title: Supervised Contrastive Learning +Author: [Khalid Salama](https://www.linkedin.com/in/khalid-salama-24403144/) +Date created: 2020/11/30 +Last modified: 2020/11/30 +Description: Using supervised contrastive learning for image classification. +Accelerator: GPU +""" + +""" +## Introduction + +[Supervised Contrastive Learning](https://arxiv.org/abs/2004.11362) +(Prannay Khosla et al.) is a training methodology that outperforms +supervised training with crossentropy on classification tasks. + +Essentially, training an image classification model with Supervised Contrastive +Learning is performed in two phases: + +1. Training an encoder to learn to produce vector representations of input images such +that representations of images in the same class will be more similar compared to +representations of images in different classes. +2. Training a classifier on top of the frozen encoder. + +Note that this example requires [TensorFlow Addons](https://www.tensorflow.org/addons), +which you can install using the following command: + +```python +pip install tensorflow-addons +``` + +## Setup +""" + +import tensorflow as tf +import tensorflow_addons as tfa +import numpy as np +from tensorflow import keras +from tensorflow.keras import layers + +""" +## Prepare the data +""" + +num_classes = 10 +input_shape = (32, 32, 3) + +# Load the train and test data splits +(x_train, y_train), (x_test, y_test) = keras.datasets.cifar10.load_data() + +# Display shapes of train and test datasets +print(f"x_train shape: {x_train.shape} - y_train shape: {y_train.shape}") +print(f"x_test shape: {x_test.shape} - y_test shape: {y_test.shape}") + + +""" +## Using image data augmentation +""" + +data_augmentation = keras.Sequential( + [ + layers.Normalization(), + layers.RandomFlip("horizontal"), + layers.RandomRotation(0.02), + ] +) + +# Setting the state of the normalization layer. +data_augmentation.layers[0].adapt(x_train) + +""" +## Build the encoder model + +The encoder model takes the image as input and turns it into a 2048-dimensional +feature vector. +""" + + +def create_encoder(): + resnet = keras.applications.ResNet50V2( + include_top=False, weights=None, input_shape=input_shape, pooling="avg" + ) + + inputs = keras.Input(shape=input_shape) + augmented = data_augmentation(inputs) + outputs = resnet(augmented) + model = keras.Model(inputs=inputs, outputs=outputs, name="cifar10-encoder") + return model + + +encoder = create_encoder() +encoder.summary() + +learning_rate = 0.001 +batch_size = 265 +hidden_units = 512 +projection_units = 128 +num_epochs = 50 +dropout_rate = 0.5 +temperature = 0.05 + +""" +## Build the classification model + +The classification model adds a fully-connected layer on top of the encoder, +plus a softmax layer with the target classes. 
+""" + + +def create_classifier(encoder, trainable=True): + for layer in encoder.layers: + layer.trainable = trainable + + inputs = keras.Input(shape=input_shape) + features = encoder(inputs) + features = layers.Dropout(dropout_rate)(features) + features = layers.Dense(hidden_units, activation="relu")(features) + features = layers.Dropout(dropout_rate)(features) + outputs = layers.Dense(num_classes, activation="softmax")(features) + + model = keras.Model(inputs=inputs, outputs=outputs, name="cifar10-classifier") + model.compile( + optimizer=keras.optimizers.Adam(learning_rate), + loss=keras.losses.SparseCategoricalCrossentropy(), + metrics=[keras.metrics.SparseCategoricalAccuracy()], + ) + return model + + +""" +## Experiment 1: Train the baseline classification model + +In this experiment, a baseline classifier is trained as usual, i.e., the +encoder and the classifier parts are trained together as a single model +to minimize the crossentropy loss. +""" + +encoder = create_encoder() +classifier = create_classifier(encoder) +classifier.summary() + +history = classifier.fit(x=x_train, y=y_train, batch_size=batch_size, epochs=num_epochs) + +accuracy = classifier.evaluate(x_test, y_test)[1] +print(f"Test accuracy: {round(accuracy * 100, 2)}%") + + +""" +## Experiment 2: Use supervised contrastive learning + +In this experiment, the model is trained in two phases. In the first phase, +the encoder is pretrained to optimize the supervised contrastive loss, +described in [Prannay Khosla et al.](https://arxiv.org/abs/2004.11362). + +In the second phase, the classifier is trained using the trained encoder with +its weights freezed; only the weights of fully-connected layers with the +softmax are optimized. + +### 1. Supervised contrastive learning loss function +""" + + +class SupervisedContrastiveLoss(keras.losses.Loss): + def __init__(self, temperature=1, name=None): + super().__init__(name=name) + self.temperature = temperature + + def __call__(self, labels, feature_vectors, sample_weight=None): + # Normalize feature vectors + feature_vectors_normalized = tf.math.l2_normalize(feature_vectors, axis=1) + # Compute logits + logits = tf.divide( + tf.matmul( + feature_vectors_normalized, tf.transpose(feature_vectors_normalized) + ), + self.temperature, + ) + return tfa.losses.npairs_loss(tf.squeeze(labels), logits) + + +def add_projection_head(encoder): + inputs = keras.Input(shape=input_shape) + features = encoder(inputs) + outputs = layers.Dense(projection_units, activation="relu")(features) + model = keras.Model( + inputs=inputs, outputs=outputs, name="cifar-encoder_with_projection-head" + ) + return model + + +""" +### 2. Pretrain the encoder +""" + +encoder = create_encoder() + +encoder_with_projection_head = add_projection_head(encoder) +encoder_with_projection_head.compile( + optimizer=keras.optimizers.Adam(learning_rate), + loss=SupervisedContrastiveLoss(temperature), +) + +encoder_with_projection_head.summary() + +history = encoder_with_projection_head.fit( + x=x_train, y=y_train, batch_size=batch_size, epochs=num_epochs +) + +""" +### 3. Train the classifier with the frozen encoder +""" + +classifier = create_classifier(encoder, trainable=False) + +history = classifier.fit(x=x_train, y=y_train, batch_size=batch_size, epochs=num_epochs) + +accuracy = classifier.evaluate(x_test, y_test)[1] +print(f"Test accuracy: {round(accuracy * 100, 2)}%") + +""" +We get to an improved test accuracy. 
+""" + +""" +## Conclusion + +As shown in the experiments, using the supervised contrastive learning technique +outperformed the conventional technique in terms of the test accuracy. Note that +the same training budget (i.e., number of epochs) was given to each technique. +Supervised contrastive learning pays off when the encoder involves a complex +architecture, like ResNet, and multi-class problems with many labels. +In addition, large batch sizes and multi-layer projection heads +improve its effectiveness. See the [Supervised Contrastive Learning](https://arxiv.org/abs/2004.11362) +paper for more details. + +You can use the trained model hosted on [Hugging Face Hub](https://huggingface.co/keras-io/supervised-contrastive-learning-cifar10) +and try the demo on [Hugging Face Spaces](https://huggingface.co/spaces/keras-io/supervised-contrastive-learning). +""" diff --git a/knowledge_base/vision/swin_transformers.py b/knowledge_base/vision/swin_transformers.py new file mode 100644 index 0000000000000000000000000000000000000000..67172c7abf40bb3e633b2d5014f253da05ca6982 --- /dev/null +++ b/knowledge_base/vision/swin_transformers.py @@ -0,0 +1,601 @@ +""" +Title: Image classification with Swin Transformers +Author: [Rishit Dagli](https://twitter.com/rishit_dagli) +Date created: 2021/09/08 +Last modified: 2021/09/08 +Description: Image classification using Swin Transformers, a general-purpose backbone for computer vision. +Accelerator: GPU +""" + +""" +This example implements +[Swin Transformer: Hierarchical Vision Transformer using Shifted Windows](https://arxiv.org/abs/2103.14030) +by Liu et al. for image classification, and demonstrates it on the +[CIFAR-100 dataset](https://www.cs.toronto.edu/~kriz/cifar.html). + +Swin Transformer (**S**hifted **Win**dow Transformer) can serve as a +general-purpose backbone for computer vision. Swin Transformer is a hierarchical +Transformer whose representations are computed with _shifted windows_. The +shifted window scheme brings greater efficiency by limiting self-attention +computation to non-overlapping local windows while also allowing for +cross-window connections. This architecture has the flexibility to model +information at various scales and has a linear computational complexity with +respect to image size. + +This example requires TensorFlow 2.5 or higher. +""" + +""" +## Setup +""" + +import matplotlib.pyplot as plt +import numpy as np +import tensorflow as tf # For tf.data and preprocessing only. +import keras +from keras import layers +from keras import ops + +""" +## Configure the hyperparameters + +A key parameter to pick is the `patch_size`, the size of the input patches. +In order to use each pixel as an individual input, you can set `patch_size` to +`(1, 1)`. Below, we take inspiration from the original paper settings for +training on ImageNet-1K, keeping most of the original settings for this example. 
+""" + +num_classes = 100 +input_shape = (32, 32, 3) + +patch_size = (2, 2) # 2-by-2 sized patches +dropout_rate = 0.03 # Dropout rate +num_heads = 8 # Attention heads +embed_dim = 64 # Embedding dimension +num_mlp = 256 # MLP layer size +# Convert embedded patches to query, key, and values with a learnable additive +# value +qkv_bias = True +window_size = 2 # Size of attention window +shift_size = 1 # Size of shifting window +image_dimension = 32 # Initial image size + +num_patch_x = input_shape[0] // patch_size[0] +num_patch_y = input_shape[1] // patch_size[1] + +learning_rate = 1e-3 +batch_size = 128 +num_epochs = 40 +validation_split = 0.1 +weight_decay = 0.0001 +label_smoothing = 0.1 + +""" +## Prepare the data + +We load the CIFAR-100 dataset through `keras.datasets`, +normalize the images, and convert the integer labels to one-hot encoded vectors. +""" + +(x_train, y_train), (x_test, y_test) = keras.datasets.cifar100.load_data() +x_train, x_test = x_train / 255.0, x_test / 255.0 +y_train = keras.utils.to_categorical(y_train, num_classes) +y_test = keras.utils.to_categorical(y_test, num_classes) +num_train_samples = int(len(x_train) * (1 - validation_split)) +num_val_samples = len(x_train) - num_train_samples +x_train, x_val = np.split(x_train, [num_train_samples]) +y_train, y_val = np.split(y_train, [num_train_samples]) +print(f"x_train shape: {x_train.shape} - y_train shape: {y_train.shape}") +print(f"x_test shape: {x_test.shape} - y_test shape: {y_test.shape}") + +plt.figure(figsize=(10, 10)) +for i in range(25): + plt.subplot(5, 5, i + 1) + plt.xticks([]) + plt.yticks([]) + plt.grid(False) + plt.imshow(x_train[i]) +plt.show() + + +""" +## Helper functions + +We create two helper functions to help us get a sequence of +patches from the image, merge patches, and apply dropout. +""" + + +def window_partition(x, window_size): + _, height, width, channels = x.shape + patch_num_y = height // window_size + patch_num_x = width // window_size + x = ops.reshape( + x, + ( + -1, + patch_num_y, + window_size, + patch_num_x, + window_size, + channels, + ), + ) + x = ops.transpose(x, (0, 1, 3, 2, 4, 5)) + windows = ops.reshape(x, (-1, window_size, window_size, channels)) + return windows + + +def window_reverse(windows, window_size, height, width, channels): + patch_num_y = height // window_size + patch_num_x = width // window_size + x = ops.reshape( + windows, + ( + -1, + patch_num_y, + patch_num_x, + window_size, + window_size, + channels, + ), + ) + x = ops.transpose(x, (0, 1, 3, 2, 4, 5)) + x = ops.reshape(x, (-1, height, width, channels)) + return x + + +""" +## Window based multi-head self-attention + +Usually Transformers perform global self-attention, where the relationships +between a token and all other tokens are computed. The global computation leads +to quadratic complexity with respect to the number of tokens. Here, as the +[original paper](https://arxiv.org/abs/2103.14030) suggests, we compute +self-attention within local windows, in a non-overlapping manner. Global +self-attention leads to quadratic computational complexity in the number of +patches, whereas window-based self-attention leads to linear complexity and is +easily scalable. 
+""" + + +class WindowAttention(layers.Layer): + def __init__( + self, + dim, + window_size, + num_heads, + qkv_bias=True, + dropout_rate=0.0, + **kwargs, + ): + super().__init__(**kwargs) + self.dim = dim + self.window_size = window_size + self.num_heads = num_heads + self.scale = (dim // num_heads) ** -0.5 + self.qkv = layers.Dense(dim * 3, use_bias=qkv_bias) + self.dropout = layers.Dropout(dropout_rate) + self.proj = layers.Dense(dim) + + num_window_elements = (2 * self.window_size[0] - 1) * ( + 2 * self.window_size[1] - 1 + ) + self.relative_position_bias_table = self.add_weight( + shape=(num_window_elements, self.num_heads), + initializer=keras.initializers.Zeros(), + trainable=True, + ) + coords_h = np.arange(self.window_size[0]) + coords_w = np.arange(self.window_size[1]) + coords_matrix = np.meshgrid(coords_h, coords_w, indexing="ij") + coords = np.stack(coords_matrix) + coords_flatten = coords.reshape(2, -1) + relative_coords = coords_flatten[:, :, None] - coords_flatten[:, None, :] + relative_coords = relative_coords.transpose([1, 2, 0]) + relative_coords[:, :, 0] += self.window_size[0] - 1 + relative_coords[:, :, 1] += self.window_size[1] - 1 + relative_coords[:, :, 0] *= 2 * self.window_size[1] - 1 + relative_position_index = relative_coords.sum(-1) + + self.relative_position_index = keras.Variable( + initializer=relative_position_index, + shape=relative_position_index.shape, + dtype="int", + trainable=False, + ) + + def call(self, x, mask=None): + _, size, channels = x.shape + head_dim = channels // self.num_heads + x_qkv = self.qkv(x) + x_qkv = ops.reshape(x_qkv, (-1, size, 3, self.num_heads, head_dim)) + x_qkv = ops.transpose(x_qkv, (2, 0, 3, 1, 4)) + q, k, v = x_qkv[0], x_qkv[1], x_qkv[2] + q = q * self.scale + k = ops.transpose(k, (0, 1, 3, 2)) + attn = q @ k + + num_window_elements = self.window_size[0] * self.window_size[1] + relative_position_index_flat = ops.reshape(self.relative_position_index, (-1,)) + relative_position_bias = ops.take( + self.relative_position_bias_table, + relative_position_index_flat, + axis=0, + ) + relative_position_bias = ops.reshape( + relative_position_bias, + (num_window_elements, num_window_elements, -1), + ) + relative_position_bias = ops.transpose(relative_position_bias, (2, 0, 1)) + attn = attn + ops.expand_dims(relative_position_bias, axis=0) + + if mask is not None: + nW = mask.shape[0] + mask_float = ops.cast( + ops.expand_dims(ops.expand_dims(mask, axis=1), axis=0), + "float32", + ) + attn = ops.reshape(attn, (-1, nW, self.num_heads, size, size)) + mask_float + attn = ops.reshape(attn, (-1, self.num_heads, size, size)) + attn = keras.activations.softmax(attn, axis=-1) + else: + attn = keras.activations.softmax(attn, axis=-1) + attn = self.dropout(attn) + + x_qkv = attn @ v + x_qkv = ops.transpose(x_qkv, (0, 2, 1, 3)) + x_qkv = ops.reshape(x_qkv, (-1, size, channels)) + x_qkv = self.proj(x_qkv) + x_qkv = self.dropout(x_qkv) + return x_qkv + + +""" +## The complete Swin Transformer model + +Finally, we put together the complete Swin Transformer by replacing the standard +multi-head attention (MHA) with shifted windows attention. As suggested in the +original paper, we create a model comprising of a shifted window-based MHA +layer, followed by a 2-layer MLP with GELU nonlinearity in between, applying +`LayerNormalization` before each MSA layer and each MLP, and a residual +connection after each of these layers. + +Notice that we only create a simple MLP with 2 Dense and +2 Dropout layers. 
Often you will see models using ResNet-50 as the MLP which is +quite standard in the literature. However in this paper the authors use a +2-layer MLP with GELU nonlinearity in between. +""" + + +class SwinTransformer(layers.Layer): + def __init__( + self, + dim, + num_patch, + num_heads, + window_size=7, + shift_size=0, + num_mlp=1024, + qkv_bias=True, + dropout_rate=0.0, + **kwargs, + ): + super().__init__(**kwargs) + + self.dim = dim # number of input dimensions + self.num_patch = num_patch # number of embedded patches + self.num_heads = num_heads # number of attention heads + self.window_size = window_size # size of window + self.shift_size = shift_size # size of window shift + self.num_mlp = num_mlp # number of MLP nodes + + self.norm1 = layers.LayerNormalization(epsilon=1e-5) + self.attn = WindowAttention( + dim, + window_size=(self.window_size, self.window_size), + num_heads=num_heads, + qkv_bias=qkv_bias, + dropout_rate=dropout_rate, + ) + self.drop_path = layers.Dropout(dropout_rate) + self.norm2 = layers.LayerNormalization(epsilon=1e-5) + + self.mlp = keras.Sequential( + [ + layers.Dense(num_mlp), + layers.Activation(keras.activations.gelu), + layers.Dropout(dropout_rate), + layers.Dense(dim), + layers.Dropout(dropout_rate), + ] + ) + + if min(self.num_patch) < self.window_size: + self.shift_size = 0 + self.window_size = min(self.num_patch) + + def build(self, input_shape): + if self.shift_size == 0: + self.attn_mask = None + else: + height, width = self.num_patch + h_slices = ( + slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None), + ) + w_slices = ( + slice(0, -self.window_size), + slice(-self.window_size, -self.shift_size), + slice(-self.shift_size, None), + ) + mask_array = np.zeros((1, height, width, 1)) + count = 0 + for h in h_slices: + for w in w_slices: + mask_array[:, h, w, :] = count + count += 1 + mask_array = ops.convert_to_tensor(mask_array) + + # mask array to windows + mask_windows = window_partition(mask_array, self.window_size) + mask_windows = ops.reshape( + mask_windows, [-1, self.window_size * self.window_size] + ) + attn_mask = ops.expand_dims(mask_windows, axis=1) - ops.expand_dims( + mask_windows, axis=2 + ) + attn_mask = ops.where(attn_mask != 0, -100.0, attn_mask) + attn_mask = ops.where(attn_mask == 0, 0.0, attn_mask) + self.attn_mask = keras.Variable( + initializer=attn_mask, + shape=attn_mask.shape, + dtype=attn_mask.dtype, + trainable=False, + ) + + def call(self, x, training=False): + height, width = self.num_patch + _, num_patches_before, channels = x.shape + x_skip = x + x = self.norm1(x) + x = ops.reshape(x, (-1, height, width, channels)) + if self.shift_size > 0: + shifted_x = ops.roll( + x, shift=[-self.shift_size, -self.shift_size], axis=[1, 2] + ) + else: + shifted_x = x + + x_windows = window_partition(shifted_x, self.window_size) + x_windows = ops.reshape( + x_windows, (-1, self.window_size * self.window_size, channels) + ) + attn_windows = self.attn(x_windows, mask=self.attn_mask) + + attn_windows = ops.reshape( + attn_windows, + (-1, self.window_size, self.window_size, channels), + ) + shifted_x = window_reverse( + attn_windows, self.window_size, height, width, channels + ) + if self.shift_size > 0: + x = ops.roll( + shifted_x, shift=[self.shift_size, self.shift_size], axis=[1, 2] + ) + else: + x = shifted_x + + x = ops.reshape(x, (-1, height * width, channels)) + x = self.drop_path(x, training=training) + x = x_skip + x + x_skip = x + x = self.norm2(x) + x = self.mlp(x) + x = 
self.drop_path(x) + x = x_skip + x + return x + + +""" +## Model training and evaluation + +### Extract and embed patches + +We first create 3 layers to help us extract, embed and merge patches from the +images on top of which we will later use the Swin Transformer class we built. +""" + + +# Using tf ops since it is only used in tf.data. +def patch_extract(images): + batch_size = tf.shape(images)[0] + patches = tf.image.extract_patches( + images=images, + sizes=(1, patch_size[0], patch_size[1], 1), + strides=(1, patch_size[0], patch_size[1], 1), + rates=(1, 1, 1, 1), + padding="VALID", + ) + patch_dim = patches.shape[-1] + patch_num = patches.shape[1] + return tf.reshape(patches, (batch_size, patch_num * patch_num, patch_dim)) + + +class PatchEmbedding(layers.Layer): + def __init__(self, num_patch, embed_dim, **kwargs): + super().__init__(**kwargs) + self.num_patch = num_patch + self.proj = layers.Dense(embed_dim) + self.pos_embed = layers.Embedding(input_dim=num_patch, output_dim=embed_dim) + + def call(self, patch): + pos = ops.arange(start=0, stop=self.num_patch) + return self.proj(patch) + self.pos_embed(pos) + + +class PatchMerging(keras.layers.Layer): + def __init__(self, num_patch, embed_dim): + super().__init__() + self.num_patch = num_patch + self.embed_dim = embed_dim + self.linear_trans = layers.Dense(2 * embed_dim, use_bias=False) + + def call(self, x): + height, width = self.num_patch + _, _, C = x.shape + x = ops.reshape(x, (-1, height, width, C)) + x0 = x[:, 0::2, 0::2, :] + x1 = x[:, 1::2, 0::2, :] + x2 = x[:, 0::2, 1::2, :] + x3 = x[:, 1::2, 1::2, :] + x = ops.concatenate((x0, x1, x2, x3), axis=-1) + x = ops.reshape(x, (-1, (height // 2) * (width // 2), 4 * C)) + return self.linear_trans(x) + + +""" +### Prepare the tf.data.Dataset + +We do all the steps, which do not have trainable weights with tf.data. +Prepare the training, validation and testing sets. + +""" + + +def augment(x): + x = tf.image.random_crop(x, size=(image_dimension, image_dimension, 3)) + x = tf.image.random_flip_left_right(x) + return x + + +dataset = ( + tf.data.Dataset.from_tensor_slices((x_train, y_train)) + .map(lambda x, y: (augment(x), y)) + .batch(batch_size=batch_size) + .map(lambda x, y: (patch_extract(x), y)) + .prefetch(tf.data.experimental.AUTOTUNE) +) + +dataset_val = ( + tf.data.Dataset.from_tensor_slices((x_val, y_val)) + .batch(batch_size=batch_size) + .map(lambda x, y: (patch_extract(x), y)) + .prefetch(tf.data.experimental.AUTOTUNE) +) + +dataset_test = ( + tf.data.Dataset.from_tensor_slices((x_test, y_test)) + .batch(batch_size=batch_size) + .map(lambda x, y: (patch_extract(x), y)) + .prefetch(tf.data.experimental.AUTOTUNE) +) + +""" +### Build the model + +We put together the Swin Transformer model. +""" + +input = layers.Input(shape=(256, 12)) +x = PatchEmbedding(num_patch_x * num_patch_y, embed_dim)(input) +x = SwinTransformer( + dim=embed_dim, + num_patch=(num_patch_x, num_patch_y), + num_heads=num_heads, + window_size=window_size, + shift_size=0, + num_mlp=num_mlp, + qkv_bias=qkv_bias, + dropout_rate=dropout_rate, +)(x) +x = SwinTransformer( + dim=embed_dim, + num_patch=(num_patch_x, num_patch_y), + num_heads=num_heads, + window_size=window_size, + shift_size=shift_size, + num_mlp=num_mlp, + qkv_bias=qkv_bias, + dropout_rate=dropout_rate, +)(x) +x = PatchMerging((num_patch_x, num_patch_y), embed_dim=embed_dim)(x) +x = layers.GlobalAveragePooling1D()(x) +output = layers.Dense(num_classes, activation="softmax")(x) + +""" +### Train on CIFAR-100 + +We train the model on CIFAR-100. 
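+(Note that the input shape of `(256, 12)` used above comes from the patch extraction
+step: each 32 x 32 image yields `16 * 16 = 256` patches, and each `2 x 2` RGB patch is
+flattened into `2 * 2 * 3 = 12` values.)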
Here, we only train the model +for 40 epochs to keep the training time short in this example. +In practice, you should train for 150 epochs to reach convergence. +""" + +model = keras.Model(input, output) +model.compile( + loss=keras.losses.CategoricalCrossentropy(label_smoothing=label_smoothing), + optimizer=keras.optimizers.AdamW( + learning_rate=learning_rate, weight_decay=weight_decay + ), + metrics=[ + keras.metrics.CategoricalAccuracy(name="accuracy"), + keras.metrics.TopKCategoricalAccuracy(5, name="top-5-accuracy"), + ], +) + +history = model.fit( + dataset, + batch_size=batch_size, + epochs=num_epochs, + validation_data=dataset_val, +) + +""" +Let's visualize the training progress of the model. +""" + +plt.plot(history.history["loss"], label="train_loss") +plt.plot(history.history["val_loss"], label="val_loss") +plt.xlabel("Epochs") +plt.ylabel("Loss") +plt.title("Train and Validation Losses Over Epochs", fontsize=14) +plt.legend() +plt.grid() +plt.show() + +""" +Let's display the final results of the training on CIFAR-100. +""" + +loss, accuracy, top_5_accuracy = model.evaluate(dataset_test) +print(f"Test loss: {round(loss, 2)}") +print(f"Test accuracy: {round(accuracy * 100, 2)}%") +print(f"Test top 5 accuracy: {round(top_5_accuracy * 100, 2)}%") + +""" +The Swin Transformer model we just trained has just 152K parameters, and it gets +us to ~75% test top-5 accuracy within just 40 epochs without any signs of +overfitting as well as seen in above graph. This means we can train this network +for longer (perhaps with a bit more regularization) and obtain even better +performance. This performance can further be improved by additional techniques +like cosine decay learning rate schedule, other data augmentation techniques. +While experimenting, I tried training the model for 150 epochs with a slightly +higher dropout and greater embedding dimensions which pushes the performance to +~72% test accuracy on CIFAR-100 as you can see in the screenshot. + +![Results of training for longer](https://i.imgur.com/9vnQesZ.png) + +The authors present a top-1 accuracy of 87.3% on ImageNet. The authors also +present a number of experiments to study how input sizes, optimizers etc. affect +the final performance of this model. The authors further present using this +model for object detection, semantic segmentation and instance segmentation as +well and report competitive results for these. You are strongly advised to also +check out the [original paper](https://arxiv.org/abs/2103.14030). + +This example takes inspiration from the official +[PyTorch](https://github.com/microsoft/Swin-Transformer) and +[TensorFlow](https://github.com/VcampSoldiers/Swin-Transformer-Tensorflow) +implementations. +""" diff --git a/knowledge_base/vision/temporal_latent_bottleneck.py b/knowledge_base/vision/temporal_latent_bottleneck.py new file mode 100644 index 0000000000000000000000000000000000000000..21279501c5bc353ca9653a8f322121d796db2e93 --- /dev/null +++ b/knowledge_base/vision/temporal_latent_bottleneck.py @@ -0,0 +1,909 @@ +""" +Title: When Recurrence meets Transformers +Author: [Aritra Roy Gosthipaty](https://twitter.com/ariG23498), [Suvaditya Mukherjee](https://twitter.com/halcyonrayes) +Date created: 2023/03/12 +Last modified: 2024/10/29 +Description: Image Classification with Temporal Latent Bottleneck Networks. +Accelerator: GPU +""" + +""" +## Introduction + +A simple Recurrent Neural Network (RNN) displays a strong inductive bias towards learning +**temporally compressed representations**. 
**Equation 1** shows the recurrence formula, +where `h_t` is the compressed representation (a single vector) of the entire input +sequence `x`. + +| ![Equation of RNN](https://i.imgur.com/Kdyj2jr.png) | +| :--: | +| **Equation 1**: The recurrence equation. (Source: Aritra and Suvaditya)| + +On the other hand, Transformers ([Vaswani et. al](https://arxiv.org/abs/1706.03762)) have +little inductive bias towards learning temporally compressed representations. +Transformer has achieved SoTA results in Natural Language Processing (NLP) +and Vision tasks with its pairwise attention mechanism. + +While the Transformer has the ability to **attend** to different sections of the input +sequence, the computation of attention is quadratic in nature. + +[Didolkar et. al](https://arxiv.org/abs/2205.14794) argue that having a more compressed +representation of a sequence may be beneficial for *generalization*, as it can be easily +**re-used** and **re-purposed** with fewer irrelevant details. While compression is good, +they also notice that too much of it can harm expressiveness. + +The authors propose a solution that divides computation into **two streams**. A *slow +stream* that is recurrent in nature and a *fast stream* that is parameterized as a +Transformer. While this method has the novelty of introducing different processing +streams in order to preserve and process latent states, it has parallels drawn in other +works like the [Perceiver Mechanism (by Jaegle et. al.)](https://arxiv.org/abs/2103.03206) +and [Grounded Language Learning Fast and Slow (by Hill et. al.)](https://arxiv.org/abs/2009.01719). + +The following example explores how we can make use of the new Temporal Latent Bottleneck +mechanism to perform image classification on the CIFAR-10 dataset. We implement this +model by making a custom `RNNCell` implementation in order to make a **performant** and +**vectorized** design. +""" + +""" +## Setup imports +""" +import os + +import keras +from keras import layers, ops, mixed_precision +from keras.optimizers import AdamW +import numpy as np +import random +from matplotlib import pyplot as plt + +# Set seed for reproducibility. +keras.utils.set_random_seed(42) + +""" +## Setting required configuration + +We set a few configuration parameters that are needed within the pipeline we have +designed. The current parameters are for use with the +[CIFAR10 dataset](https://www.cs.toronto.edu/~kriz/cifar.html). + + +The model also supports `mixed-precision` settings, which would quantize the model to use +`16-bit` float numbers where it can, while keeping some parameters in `32-bit` as needed +for numerical stability. This brings performance benefits as the footprint of the model +decreases significantly while bringing speed boosts at inference-time. +""" + +config = { + "mixed_precision": True, + "dataset": "cifar10", + "train_slice": 40_000, + "batch_size": 2048, + "buffer_size": 2048 * 2, + "input_shape": [32, 32, 3], + "image_size": 48, + "num_classes": 10, + "learning_rate": 1e-4, + "weight_decay": 1e-4, + "epochs": 30, + "patch_size": 4, + "embed_dim": 64, + "chunk_size": 8, + "r": 2, + "num_layers": 4, + "ffn_drop": 0.2, + "attn_drop": 0.2, + "num_heads": 1, +} + +if config["mixed_precision"]: + policy = mixed_precision.Policy("mixed_float16") + mixed_precision.set_global_policy(policy) + +""" +## Loading the CIFAR-10 dataset + +We are going to use the CIFAR10 dataset for running our experiments. 
This dataset +contains a training set of `50,000` images for `10` classes with the standard image size +of `(32, 32, 3)`. + +It also has a separate set of `10,000` images with similar characteristics. More +information about the dataset may be found at the official site for the dataset as well +as [`keras.datasets.cifar10`](https://keras.io/api/datasets/cifar10/) API reference +""" + +(x_train, y_train), (x_test, y_test) = keras.datasets.cifar10.load_data() +(x_train, y_train), (x_val, y_val) = ( + (x_train[: config["train_slice"]], y_train[: config["train_slice"]]), + (x_train[config["train_slice"] :], y_train[config["train_slice"] :]), +) + +""" +## Define data augmentation for the training and validation/test pipelines + +We define separate pipelines for performing image augmentation on our data. This step is +important to make the model more robust to changes, helping it generalize better. +The preprocessing and augmentation steps we perform are as follows: + +- `Rescaling` (training, test): This step is performed to normalize all image pixel +values from the `[0,255]` range to `[0,1)`. This helps in maintaining numerical stability +later ahead during training. + +- `Resizing` (training, test): We resize the image from it's original size of (32, 32) to +(52, 52). This is done to account for the Random Crop, as well as comply with the +specifications of the data given in the paper. + +- `RandomCrop` (training): This layer randomly selects a crop/sub-region of the image +with size `(48, 48)`. + +- `RandomFlip` (training): This layer randomly flips all the images horizontally, +keeping image sizes the same. +""" + +# Build the `train` augmentation pipeline. +train_augmentation = keras.Sequential( + [ + layers.Rescaling(1 / 255.0, dtype="float32"), + layers.Resizing( + config["input_shape"][0] + 20, + config["input_shape"][0] + 20, + dtype="float32", + ), + layers.RandomCrop(config["image_size"], config["image_size"], dtype="float32"), + layers.RandomFlip("horizontal", dtype="float32"), + ], + name="train_data_augmentation", +) + +# Build the `val` and `test` data pipeline. +test_augmentation = keras.Sequential( + [ + layers.Rescaling(1 / 255.0, dtype="float32"), + layers.Resizing(config["image_size"], config["image_size"], dtype="float32"), + ], + name="test_data_augmentation", +) + +# We define functions in place of simple lambda functions to run through the +# `keras.Sequential`in order to solve this warning: +# (https://github.com/tensorflow/tensorflow/issues/56089) + + +def train_map_fn(image, label): + return train_augmentation(image), label + + +def test_map_fn(image, label): + return test_augmentation(image), label + + +""" +## Load dataset into `PyDataset` object + +- We take the `np.ndarray` instance of the datasets and wrap a class around it, +wrapping a `keras.utils.PyDataset` and apply augmentations with keras +preprocessing layers. 
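+
+A `PyDataset` subclass behaves like an indexable sequence of batches. As a rough
+sketch of how the objects built below are consumed (shapes assume the
+configuration above; this snippet is illustrative and not part of the training
+script):
+
+```python
+x_batch, y_batch = train_ds[0]  # first augmented batch
+# x_batch.shape -> (2048, 48, 48, 3), y_batch.shape -> (2048, 1)
+# len(train_ds) -> 40_000 // 2048 = 19 batches per epoch
+```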
+""" + + +class Dataset(keras.utils.PyDataset): + def __init__( + self, x_data, y_data, batch_size, preprocess_fn=None, shuffle=False, **kwargs + ): + if shuffle: + perm = np.random.permutation(len(x_data)) + x_data = x_data[perm] + y_data = y_data[perm] + self.x_data = x_data + self.y_data = y_data + self.preprocess_fn = preprocess_fn + self.batch_size = batch_size + super().__init__(*kwargs) + + def __len__(self): + return len(self.x_data) // self.batch_size + + def __getitem__(self, idx): + batch_x, batch_y = [], [] + for i in range(idx * self.batch_size, (idx + 1) * self.batch_size): + x, y = self.x_data[i], self.y_data[i] + if self.preprocess_fn: + x, y = self.preprocess_fn(x, y) + batch_x.append(x) + batch_y.append(y) + batch_x = ops.stack(batch_x, axis=0) + batch_y = ops.stack(batch_y, axis=0) + return batch_x, batch_y + + +train_ds = Dataset( + x_train, y_train, config["batch_size"], preprocess_fn=train_map_fn, shuffle=True +) +val_ds = Dataset(x_val, y_val, config["batch_size"], preprocess_fn=test_map_fn) +test_ds = Dataset(x_test, y_test, config["batch_size"], preprocess_fn=test_map_fn) + +""" +## Temporal Latent Bottleneck + +An excerpt from the paper: + +> In the brain, short-term and long-term memory have developed in a specialized way. +Short-term memory is allowed to change very quickly to react to immediate sensory inputs +and perception. By contrast, long-term memory changes slowly, is highly selective and +involves repeated consolidation. + +Inspired from the short-term and long-term memory the authors introduce the fast stream +and slow stream computation. The fast stream has a short-term memory with a high capacity +that reacts quickly to sensory input (Transformers). The slow stream has long-term memory +which updates at a slower rate and summarizes the most relevant information (Recurrence). + +To implement this idea we need to: + +- Take a sequence of data. +- Divide the sequence into fixed-size chunks. +- Fast stream operates within each chunk. It provides fine-grained local information. +- Slow stream consolidates and aggregates information across chunks. It provides +coarse-grained distant information. + +The fast and slow stream induce what is called **information asymmetry**. The two streams +interact with each other through a bottleneck of attention. **Figure 1** shows the +architecture of the model. + +| ![Architecture of the model](https://i.imgur.com/bxdLPNH.png) | +| :--: | +| Figure 1: Architecture of the model. (Source: https://arxiv.org/abs/2205.14794) | + +A PyTorch-style pseudocode is also proposed by the authors as shown in **Algorithm 1**. + +| ![Pseudocode of the model](https://i.imgur.com/s8a5Vz9.png) | +| :--: | +| Algorithm 1: PyTorch style pseudocode. (Source: https://arxiv.org/abs/2205.14794) | + +""" + +""" +### `PatchEmbedding` layer + +This custom `keras.layers.Layer` is useful for generating patches from the image and +transform them into a higher-dimensional embedding space using `keras.layers.Embedding`. +The patching operation is done using a `keras.layers.Conv2D` instance. + +Once the patching of images is complete, we reshape the image patches in order to get a +flattened representation where the number of dimensions is the embedding dimension. At +this stage, we also inject positional information to the tokens. + +After we obtain the tokens we chunk them. The chunking operation involves taking +fixed-size sequences from the embedding output to create 'chunks', which will then be +used as the final input to the model. 
+""" + + +class PatchEmbedding(layers.Layer): + """Image to Patch Embedding. + Args: + image_size (`Tuple[int]`): Size of the input image. + patch_size (`Tuple[int]`): Size of the patch. + embed_dim (`int`): Dimension of the embedding. + chunk_size (`int`): Number of patches to be chunked. + """ + + def __init__( + self, + image_size, + patch_size, + embed_dim, + chunk_size, + **kwargs, + ): + super().__init__(**kwargs) + + # Compute the patch resolution. + patch_resolution = [ + image_size[0] // patch_size[0], + image_size[1] // patch_size[1], + ] + + # Store the parameters. + self.image_size = image_size + self.patch_size = patch_size + self.embed_dim = embed_dim + self.patch_resolution = patch_resolution + self.num_patches = patch_resolution[0] * patch_resolution[1] + + # Define the positions of the patches. + self.positions = ops.arange(start=0, stop=self.num_patches, step=1) + + # Create the layers. + self.projection = layers.Conv2D( + filters=embed_dim, + kernel_size=patch_size, + strides=patch_size, + name="projection", + ) + self.flatten = layers.Reshape( + target_shape=(-1, embed_dim), + name="flatten", + ) + self.position_embedding = layers.Embedding( + input_dim=self.num_patches, + output_dim=embed_dim, + name="position_embedding", + ) + self.layernorm = keras.layers.LayerNormalization( + epsilon=1e-5, + name="layernorm", + ) + self.chunking_layer = layers.Reshape( + target_shape=(self.num_patches // chunk_size, chunk_size, embed_dim), + name="chunking_layer", + ) + + def call(self, inputs): + # Project the inputs to the embedding dimension. + x = self.projection(inputs) + + # Flatten the pathces and add position embedding. + x = self.flatten(x) + x = x + self.position_embedding(self.positions) + + # Normalize the embeddings. + x = self.layernorm(x) + + # Chunk the tokens. + x = self.chunking_layer(x) + + return x + + +""" +### `FeedForwardNetwork` Layer + +This custom `keras.layers.Layer` instance allows us to define a generic FFN along with a +dropout. +""" + + +class FeedForwardNetwork(layers.Layer): + """Feed Forward Network. + Args: + dims (`int`): Number of units in FFN. + dropout (`float`): Dropout probability for FFN. + """ + + def __init__(self, dims, dropout, **kwargs): + super().__init__(**kwargs) + + # Create the layers. + self.ffn = keras.Sequential( + [ + layers.Dense(units=4 * dims, activation="gelu"), + layers.Dense(units=dims), + layers.Dropout(rate=dropout), + ], + name="ffn", + ) + self.layernorm = layers.LayerNormalization( + epsilon=1e-5, + name="layernorm", + ) + + def call(self, inputs): + # Apply the FFN. + x = self.layernorm(inputs) + x = inputs + self.ffn(x) + return x + + +""" +### `BaseAttention` layer + +This custom `keras.layers.Layer` instance is a `super`/`base` class that wraps a +`keras.layers.MultiHeadAttention` layer along with some other components. This gives us +basic common denominator functionality for all the Attention layers/modules in our model. +""" + + +class BaseAttention(layers.Layer): + """Base Attention Module. + Args: + num_heads (`int`): Number of attention heads. + key_dim (`int`): Size of each attention head for key. + dropout (`float`): Dropout probability for attention module. 
+ """ + + def __init__(self, num_heads, key_dim, dropout, **kwargs): + super().__init__(**kwargs) + self.multi_head_attention = layers.MultiHeadAttention( + num_heads=num_heads, + key_dim=key_dim, + dropout=dropout, + name="mha", + ) + self.query_layernorm = layers.LayerNormalization( + epsilon=1e-5, + name="q_layernorm", + ) + self.key_layernorm = layers.LayerNormalization( + epsilon=1e-5, + name="k_layernorm", + ) + self.value_layernorm = layers.LayerNormalization( + epsilon=1e-5, + name="v_layernorm", + ) + + self.attention_scores = None + + def call(self, input_query, key, value): + # Apply the attention module. + query = self.query_layernorm(input_query) + key = self.key_layernorm(key) + value = self.value_layernorm(value) + (attention_outputs, attention_scores) = self.multi_head_attention( + query=query, + key=key, + value=value, + return_attention_scores=True, + ) + + # Save the attention scores for later visualization. + self.attention_scores = attention_scores + + # Add the input to the attention output. + x = input_query + attention_outputs + return x + + +""" +### `Attention` with `FeedForwardNetwork` layer + +This custom `keras.layers.Layer` implementation combines the `BaseAttention` and +`FeedForwardNetwork` components to develop one block which will be used repeatedly within +the model. This module is highly customizable and flexible, allowing for changes within +the internal layers. +""" + + +class AttentionWithFFN(layers.Layer): + """Attention with Feed Forward Network. + Args: + ffn_dims (`int`): Number of units in FFN. + ffn_dropout (`float`): Dropout probability for FFN. + num_heads (`int`): Number of attention heads. + key_dim (`int`): Size of each attention head for key. + attn_dropout (`float`): Dropout probability for attention module. + """ + + def __init__( + self, + ffn_dims, + ffn_dropout, + num_heads, + key_dim, + attn_dropout, + **kwargs, + ): + super().__init__(**kwargs) + # Create the layers. + self.fast_stream_attention = BaseAttention( + num_heads=num_heads, + key_dim=key_dim, + dropout=attn_dropout, + name="base_attn", + ) + self.slow_stream_attention = BaseAttention( + num_heads=num_heads, + key_dim=key_dim, + dropout=attn_dropout, + name="base_attn", + ) + self.ffn = FeedForwardNetwork( + dims=ffn_dims, + dropout=ffn_dropout, + name="ffn", + ) + + self.attention_scores = None + + def build(self, input_shape): + self.built = True + + def call(self, query, key, value, stream="fast"): + # Apply the attention module. + attention_layer = { + "fast": self.fast_stream_attention, + "slow": self.slow_stream_attention, + }[stream] + if len(query.shape) == 2: + query = ops.expand_dims(query, -1) + if len(key.shape) == 2: + key = ops.expand_dims(key, -1) + if len(value.shape) == 2: + value = ops.expand_dims(value, -1) + x = attention_layer(query, key, value) + + # Save the attention scores for later visualization. + self.attention_scores = attention_layer.attention_scores + + # Apply the FFN. + x = self.ffn(x) + return x + + +""" +### Custom RNN Cell for **Temporal Latent Bottleneck** and **Perceptual Module** + +**Algorithm 1** (the pseudocode) depicts recurrence with the help of for loops. Looping +does make the implementation simpler, harming the training time. In this section we wrap +the custom recurrence logic inside of the `CustomRecurrentCell`. This custom cell will +then be wrapped with the [Keras RNN API](https://keras.io/api/layers/recurrent_layers/rnn/) +that makes the entire code vectorizable. 
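+
+Conceptually, the recurrence that the RNN wrapper vectorizes looks roughly like the
+sketch below (pseudo-code with made-up helper names, not the actual implementation):
+
+```python
+# Conceptual sketch: process the chunks of one sequence left to right.
+slow_stream = initial_state                  # the Temporal Latent Bottleneck state
+for chunk in chunks:                         # chunks: (num_chunks, chunk_size, dims)
+    fast_stream = perceptual_module(chunk, slow_stream)  # Transformer over one chunk
+    slow_stream = tlb_update(slow_stream, fast_stream)   # cross-attention update
+```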
+ +This custom cell, implemented as a `keras.layers.Layer`, is the integral part of the +logic for the model. +The cell's functionality can be divided into 2 parts: +- **Slow Stream (Temporal Latent Bottleneck):** + +- This module consists of a single `AttentionWithFFN` layer that parses the output of the +previous Slow Stream, an intermediate hidden representation (which is the *latent* in +Temporal Latent Bottleneck) as the Query, and the output of the latest Fast Stream as Key +and Value. This layer can also be construed as a *CrossAttention* layer. + +- **Fast Stream (Perceptual Module):** + +- This module consists of intertwined `AttentionWithFFN` layers. This stream consists of +*n* layers of `SelfAttention` and `CrossAttention` in a sequential manner. +- Here, some layers take the chunked input as the Query, Key and Value (Also referred to +as the *SelfAttention* layer). +- The other layers take the intermediate state outputs from within the Temporal Latent +Bottleneck module as the Query while using the output of the previous Self-Attention +layers before it as the Key and Value. +""" + + +class CustomRecurrentCell(layers.Layer): + """Custom Recurrent Cell. + Args: + chunk_size (`int`): Number of tokens in a chunk. + r (`int`): One Cross Attention per **r** Self Attention. + num_layers (`int`): Number of layers. + ffn_dims (`int`): Number of units in FFN. + ffn_dropout (`float`): Dropout probability for FFN. + num_heads (`int`): Number of attention heads. + key_dim (`int`): Size of each attention head for key. + attn_dropout (`float`): Dropout probability for attention module. + """ + + def __init__( + self, + chunk_size, + r, + num_layers, + ffn_dims, + ffn_dropout, + num_heads, + key_dim, + attn_dropout, + **kwargs, + ): + super().__init__(**kwargs) + # Save the arguments. + self.chunk_size = chunk_size + self.r = r + self.num_layers = num_layers + self.ffn_dims = ffn_dims + self.ffn_droput = ffn_dropout + self.num_heads = num_heads + self.key_dim = key_dim + self.attn_dropout = attn_dropout + + # Create state_size. This is important for + # custom recurrence logic. 
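+        # The state is kept as a single flat vector of size
+        # `chunk_size * ffn_dims`; `call()` reshapes it back to
+        # (batch, chunk_size, ffn_dims) for attention and flattens it
+        # again before returning it.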
+ self.state_size = chunk_size * ffn_dims + + self.get_attention_scores = False + self.attention_scores = [] + + # Perceptual Module + perceptual_module = list() + for layer_idx in range(num_layers): + perceptual_module.append( + AttentionWithFFN( + ffn_dims=ffn_dims, + ffn_dropout=ffn_dropout, + num_heads=num_heads, + key_dim=key_dim, + attn_dropout=attn_dropout, + name=f"pm_self_attn_{layer_idx}", + ) + ) + if layer_idx % r == 0: + perceptual_module.append( + AttentionWithFFN( + ffn_dims=ffn_dims, + ffn_dropout=ffn_dropout, + num_heads=num_heads, + key_dim=key_dim, + attn_dropout=attn_dropout, + name=f"pm_cross_attn_ffn_{layer_idx}", + ) + ) + self.perceptual_module = perceptual_module + + # Temporal Latent Bottleneck Module + self.tlb_module = AttentionWithFFN( + ffn_dims=ffn_dims, + ffn_dropout=ffn_dropout, + num_heads=num_heads, + key_dim=key_dim, + attn_dropout=attn_dropout, + name=f"tlb_cross_attn_ffn", + ) + + def build(self, input_shape): + self.built = True + + def call(self, inputs, states): + # inputs => (batch, chunk_size, dims) + # states => [(batch, chunk_size, units)] + slow_stream = ops.reshape(states[0], (-1, self.chunk_size, self.ffn_dims)) + fast_stream = inputs + + for layer_idx, layer in enumerate(self.perceptual_module): + fast_stream = layer( + query=fast_stream, key=fast_stream, value=fast_stream, stream="fast" + ) + + if layer_idx % self.r == 0: + fast_stream = layer( + query=fast_stream, key=slow_stream, value=slow_stream, stream="slow" + ) + + slow_stream = self.tlb_module( + query=slow_stream, key=fast_stream, value=fast_stream + ) + + # Save the attention scores for later visualization. + if self.get_attention_scores: + self.attention_scores.append(self.tlb_module.attention_scores) + + return fast_stream, [ + ops.reshape(slow_stream, (-1, self.chunk_size * self.ffn_dims)) + ] + + +""" +### `TemporalLatentBottleneckModel` to encapsulate full model + +Here, we just wrap the full model as to expose it for training. +""" + + +class TemporalLatentBottleneckModel(keras.Model): + """Model Trainer. + Args: + patch_layer (`keras.layers.Layer`): Patching layer. + custom_cell (`keras.layers.Layer`): Custom Recurrent Cell. + """ + + def __init__(self, patch_layer, custom_cell, unroll_loops=False, **kwargs): + super().__init__(**kwargs) + self.patch_layer = patch_layer + self.rnn = layers.RNN(custom_cell, unroll=unroll_loops, name="rnn") + self.gap = layers.GlobalAveragePooling1D(name="gap") + self.head = layers.Dense(10, activation="softmax", dtype="float32", name="head") + + def call(self, inputs): + x = self.patch_layer(inputs) + x = self.rnn(x) + x = self.gap(x) + outputs = self.head(x) + return outputs + + +""" +## Build the model + +To begin training, we now define the components individually and pass them as arguments +to our wrapper class, which will prepare the final model for training. We define a +`PatchEmbed` layer, and the `CustomCell`-based RNN. +""" + +# Build the model. 
+patch_layer = PatchEmbedding( + image_size=(config["image_size"], config["image_size"]), + patch_size=(config["patch_size"], config["patch_size"]), + embed_dim=config["embed_dim"], + chunk_size=config["chunk_size"], +) +custom_rnn_cell = CustomRecurrentCell( + chunk_size=config["chunk_size"], + r=config["r"], + num_layers=config["num_layers"], + ffn_dims=config["embed_dim"], + ffn_dropout=config["ffn_drop"], + num_heads=config["num_heads"], + key_dim=config["embed_dim"], + attn_dropout=config["attn_drop"], +) +model = TemporalLatentBottleneckModel( + patch_layer=patch_layer, + custom_cell=custom_rnn_cell, +) + +""" +## Metrics and Callbacks + +We use the `AdamW` optimizer since it has been shown to perform very well on several benchmark +tasks from an optimization perspective. It is a version of the `keras.optimizers.Adam` +optimizer, along with Weight Decay in place. + +For a loss function, we make use of the `keras.losses.SparseCategoricalCrossentropy` +function that makes use of simple Cross-entropy between prediction and actual logits. We +also calculate accuracy on our data as a sanity-check. +""" + +optimizer = AdamW( + learning_rate=config["learning_rate"], weight_decay=config["weight_decay"] +) +model.compile( + optimizer=optimizer, + loss="sparse_categorical_crossentropy", + metrics=["accuracy"], +) + +""" +## Train the model with `model.fit()` + +We pass the training dataset and run training. +""" + +history = model.fit( + train_ds, + epochs=config["epochs"], + validation_data=val_ds, +) + +""" +## Visualize training metrics + +The `model.fit()` will return a `history` object, which stores the values of the metrics +generated during the training run (but it is ephemeral and needs to be saved manually). + +We now display the Loss and Accuracy curves for the training and validation sets. +""" + +plt.plot(history.history["loss"], label="loss") +plt.plot(history.history["val_loss"], label="val_loss") +plt.legend() +plt.show() + +plt.plot(history.history["accuracy"], label="accuracy") +plt.plot(history.history["val_accuracy"], label="val_accuracy") +plt.legend() +plt.show() + +""" +## Visualize attention maps from the Temporal Latent Bottleneck + +Now that we have trained our model, it is time for some visualizations. The Fast Stream +(Transformers) processes a chunk of tokens. The Slow Stream processes each chunk and +attends to tokens that are useful for the task. + +In this section we visualize the attention map of the Slow Stream. This is done by +extracting the attention scores from the TLB layer at each chunk's intersection and +storing it within the RNN's state. This is followed by 'ballooning' it up and returning +these values. +""" + + +def score_to_viz(chunk_score): + # get the most attended token + chunk_viz = ops.max(chunk_score, axis=-2) + # get the mean across heads + chunk_viz = ops.mean(chunk_viz, axis=1) + return chunk_viz + + +# Get a batch of images and labels from the testing dataset +images, labels = next(iter(test_ds)) + +# Create a new model instance that is executed eagerly to allow saving +# attention scores. This also requires unrolling loops +eager_model = TemporalLatentBottleneckModel( + patch_layer=patch_layer, custom_cell=custom_rnn_cell, unroll_loops=True +) +eager_model.compile(run_eagerly=True, jit_compile=False) +model.save("weights.keras") +eager_model.load_weights("weights.keras") + +# Set the get_attn_scores flag to True +eager_model.rnn.cell.get_attention_scores = True + +# Run the model with the testing images and grab the +# attention scores. 
+outputs = eager_model(images) +list_chunk_scores = eager_model.rnn.cell.attention_scores + +# Process the attention scores in order to visualize them +num_chunks = (config["image_size"] // config["patch_size"]) ** 2 // config["chunk_size"] +list_chunk_viz = [score_to_viz(x) for x in list_chunk_scores[-num_chunks:]] +chunk_viz = ops.concatenate(list_chunk_viz, axis=-1) +chunk_viz = ops.reshape( + chunk_viz, + ( + config["batch_size"], + config["image_size"] // config["patch_size"], + config["image_size"] // config["patch_size"], + 1, + ), +) +upsampled_heat_map = layers.UpSampling2D( + size=(4, 4), interpolation="bilinear", dtype="float32" +)(chunk_viz) + +""" +Run the following code snippet to get different images and their attention maps. +""" + +# Sample a random image +index = random.randint(0, config["batch_size"]) +orig_image = images[index] +overlay_image = upsampled_heat_map[index, ..., 0] + +if keras.backend.backend() == "torch": + # when using the torch backend, we are required to ensure that the + # image is copied from the GPU + orig_image = orig_image.cpu().detach().numpy() + overlay_image = overlay_image.cpu().detach().numpy() + +# Plot the visualization +fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(10, 5)) + +ax[0].imshow(orig_image) +ax[0].set_title("Original:") +ax[0].axis("off") + +image = ax[1].imshow(orig_image) +ax[1].imshow( + overlay_image, + cmap="inferno", + alpha=0.6, + extent=image.get_extent(), +) +ax[1].set_title("TLB Attention:") + +plt.show() + +""" +## Conclusion + +This example has demonstrated an implementation of the Temporal Latent Bottleneck +mechanism. The example highlights the use of compression and storage of historical states +in the form of a Temporal Latent Bottleneck with regular updates from a Perceptual Module +as an effective method to do so. + +In the original paper, the authors have conducted highly extensive tests around different +modalities ranging from Supervised Image Classification to applications in Reinforcement +Learning. + +While we have only displayed a method to apply this mechanism to Image Classification, it +can be extended to other modalities too with minimal changes. + +*Note*: While building this example we did not have the official code to refer to. This +means that our implementation is inspired by the paper with no claims of being a +complete reproduction. For more details on the training process one can head over to +[our GitHub repository](https://github.com/suvadityamuk/Temporal-Latent-Bottleneck-TF). +""" + +""" +## Acknowledgement + +Thanks to [Aniket Didolkar](https://www.aniketdidolkar.in/) (the first author) and +[Anirudh Goyal](https://anirudh9119.github.io/) (the third author) +for revieweing our work. + +We would like to thank +[PyImageSearch](https://pyimagesearch.com/) for a Colab Pro account and +[JarvisLabs.ai](https://cloud.jarvislabs.ai/) for the GPU credits. 
+""" diff --git a/knowledge_base/vision/token_learner.py b/knowledge_base/vision/token_learner.py new file mode 100644 index 0000000000000000000000000000000000000000..f86a5983ec8e5acf8a7a8ef25ba5646e1b727d3c --- /dev/null +++ b/knowledge_base/vision/token_learner.py @@ -0,0 +1,515 @@ +""" +Title: Learning to tokenize in Vision Transformers +Authors: [Aritra Roy Gosthipaty](https://twitter.com/ariG23498), [Sayak Paul](https://twitter.com/RisingSayak) (equal contribution), converted to Keras 3 by [Muhammad Anas Raza](https://anasrz.com) +Date created: 2021/12/10 +Last modified: 2023/08/14 +Description: Adaptively generating a smaller number of tokens for Vision Transformers. +Accelerator: GPU +""" + +""" +## Introduction + +Vision Transformers ([Dosovitskiy et al.](https://arxiv.org/abs/2010.11929)) and many +other Transformer-based architectures ([Liu et al.](https://arxiv.org/abs/2103.14030), +[Yuan et al.](https://arxiv.org/abs/2101.11986), etc.) have shown strong results in +image recognition. The following provides a brief overview of the components involved in the +Vision Transformer architecture for image classification: + +* Extract small patches from input images. +* Linearly project those patches. +* Add positional embeddings to these linear projections. +* Run these projections through a series of Transformer ([Vaswani et al.](https://arxiv.org/abs/1706.03762)) +blocks. +* Finally, take the representation from the final Transformer block and add a +classification head. + +If we take 224x224 images and extract 16x16 patches, we get a total of 196 patches (also +called tokens) for each image. The number of patches increases as we increase the +resolution, leading to higher memory footprint. Could we use a reduced +number of patches without having to compromise performance? +Ryoo et al. investigate this question in +[TokenLearner: Adaptive Space-Time Tokenization for Videos](https://openreview.net/forum?id=z-l1kpDXs88). +They introduce a novel module called **TokenLearner** that can help reduce the number +of patches used by a Vision Transformer (ViT) in an adaptive manner. With TokenLearner +incorporated in the standard ViT architecture, they are able to reduce the amount of +compute (measured in FLOPS) used by the model. + +In this example, we implement the TokenLearner module and demonstrate its +performance with a mini ViT and the CIFAR-10 dataset. We make use of the following +references: + +* [Official TokenLearner code](https://github.com/google-research/scenic/blob/main/scenic/projects/token_learner/model.py) +* [Image Classification with ViTs on keras.io](https://keras.io/examples/vision/image_classification_with_vision_transformer/) +* [TokenLearner slides from NeurIPS 2021](https://nips.cc/media/neurips-2021/Slides/26578.pdf) +""" + + +""" +## Imports +""" + +import keras +from keras import layers +from keras import ops +from tensorflow import data as tf_data + + +from datetime import datetime +import matplotlib.pyplot as plt +import numpy as np + +import math + +""" +## Hyperparameters + +Please feel free to change the hyperparameters and check your results. The best way to +develop intuition about the architecture is to experiment with it. +""" + +# DATA +BATCH_SIZE = 256 +AUTO = tf_data.AUTOTUNE +INPUT_SHAPE = (32, 32, 3) +NUM_CLASSES = 10 + +# OPTIMIZER +LEARNING_RATE = 1e-3 +WEIGHT_DECAY = 1e-4 + +# TRAINING +EPOCHS = 1 + +# AUGMENTATION +IMAGE_SIZE = 48 # We will resize input images to this size. 
+PATCH_SIZE = 6 # Size of the patches to be extracted from the input images. +NUM_PATCHES = (IMAGE_SIZE // PATCH_SIZE) ** 2 + +# ViT ARCHITECTURE +LAYER_NORM_EPS = 1e-6 +PROJECTION_DIM = 128 +NUM_HEADS = 4 +NUM_LAYERS = 4 +MLP_UNITS = [ + PROJECTION_DIM * 2, + PROJECTION_DIM, +] + +# TOKENLEARNER +NUM_TOKENS = 4 + +""" +## Load and prepare the CIFAR-10 dataset +""" + +# Load the CIFAR-10 dataset. +(x_train, y_train), (x_test, y_test) = keras.datasets.cifar10.load_data() +(x_train, y_train), (x_val, y_val) = ( + (x_train[:40000], y_train[:40000]), + (x_train[40000:], y_train[40000:]), +) +print(f"Training samples: {len(x_train)}") +print(f"Validation samples: {len(x_val)}") +print(f"Testing samples: {len(x_test)}") + +# Convert to tf.data.Dataset objects. +train_ds = tf_data.Dataset.from_tensor_slices((x_train, y_train)) +train_ds = train_ds.shuffle(BATCH_SIZE * 100).batch(BATCH_SIZE).prefetch(AUTO) + +val_ds = tf_data.Dataset.from_tensor_slices((x_val, y_val)) +val_ds = val_ds.batch(BATCH_SIZE).prefetch(AUTO) + +test_ds = tf_data.Dataset.from_tensor_slices((x_test, y_test)) +test_ds = test_ds.batch(BATCH_SIZE).prefetch(AUTO) + +""" +## Data augmentation + +The augmentation pipeline consists of: + +- Rescaling +- Resizing +- Random cropping (fixed-sized or random sized) +- Random horizontal flipping +""" + +data_augmentation = keras.Sequential( + [ + layers.Rescaling(1 / 255.0), + layers.Resizing(INPUT_SHAPE[0] + 20, INPUT_SHAPE[0] + 20), + layers.RandomCrop(IMAGE_SIZE, IMAGE_SIZE), + layers.RandomFlip("horizontal"), + ], + name="data_augmentation", +) + +""" +Note that image data augmentation layers do not apply data transformations at inference time. +This means that when these layers are called with `training=False` they behave differently. Refer +[to the documentation](https://keras.io/api/layers/preprocessing_layers/image_augmentation/) for more +details. +""" + +""" +## Positional embedding module + +A [Transformer](https://arxiv.org/abs/1706.03762) architecture consists of **multi-head +self attention** layers and **fully-connected feed forward** networks (MLP) as the main +components. Both these components are _permutation invariant_: they're not aware of +feature order. + +To overcome this problem we inject tokens with positional information. The +`position_embedding` function adds this positional information to the linearly projected +tokens. +""" + + +class PatchEncoder(layers.Layer): + def __init__(self, num_patches, projection_dim): + super().__init__() + self.num_patches = num_patches + self.position_embedding = layers.Embedding( + input_dim=num_patches, output_dim=projection_dim + ) + + def call(self, patch): + positions = ops.expand_dims( + ops.arange(start=0, stop=self.num_patches, step=1), axis=0 + ) + encoded = patch + self.position_embedding(positions) + return encoded + + def get_config(self): + config = super().get_config() + config.update({"num_patches": self.num_patches}) + return config + + +""" +## MLP block for Transformer + +This serves as the Fully Connected Feed Forward block for our Transformer. +""" + + +def mlp(x, dropout_rate, hidden_units): + # Iterate over the hidden units and + # add Dense => Dropout. + for units in hidden_units: + x = layers.Dense(units, activation=ops.gelu)(x) + x = layers.Dropout(dropout_rate)(x) + return x + + +""" +## TokenLearner module + +The following figure presents a pictorial overview of the module +([source](https://ai.googleblog.com/2021/12/improving-vision-transformer-efficiency.html)). 
+ +![TokenLearner module GIF](https://blogger.googleusercontent.com/img/a/AVvXsEiylT3_nmd9-tzTnz3g3Vb4eTn-L5sOwtGJOad6t2we7FsjXSpbLDpuPrlInAhtE5hGCA_PfYTJtrIOKfLYLYGcYXVh1Ksfh_C1ZC-C8gw6GKtvrQesKoMrEA_LU_Gd5srl5-3iZDgJc1iyCELoXtfuIXKJ2ADDHOBaUjhU8lXTVdr2E7bCVaFgVHHkmA=w640-h208) + +The TokenLearner module takes as input an image-shaped tensor. It then passes it through +multiple single-channel convolutional layers extracting different spatial attention maps +focusing on different parts of the input. These attention maps are then element-wise +multiplied to the input and result is aggregated with pooling. This pooled output can be +trated as a summary of the input and has much lesser number of patches (8, for example) +than the original one (196, for example). + +Using multiple convolution layers helps with expressivity. Imposing a form of spatial +attention helps retain relevant information from the inputs. Both of these components are +crucial to make TokenLearner work, especially when we are significantly reducing the number of patches. +""" + + +def token_learner(inputs, number_of_tokens=NUM_TOKENS): + # Layer normalize the inputs. + x = layers.LayerNormalization(epsilon=LAYER_NORM_EPS)(inputs) # (B, H, W, C) + + # Applying Conv2D => Reshape => Permute + # The reshape and permute is done to help with the next steps of + # multiplication and Global Average Pooling. + attention_maps = keras.Sequential( + [ + # 3 layers of conv with gelu activation as suggested + # in the paper. + layers.Conv2D( + filters=number_of_tokens, + kernel_size=(3, 3), + activation=ops.gelu, + padding="same", + use_bias=False, + ), + layers.Conv2D( + filters=number_of_tokens, + kernel_size=(3, 3), + activation=ops.gelu, + padding="same", + use_bias=False, + ), + layers.Conv2D( + filters=number_of_tokens, + kernel_size=(3, 3), + activation=ops.gelu, + padding="same", + use_bias=False, + ), + # This conv layer will generate the attention maps + layers.Conv2D( + filters=number_of_tokens, + kernel_size=(3, 3), + activation="sigmoid", # Note sigmoid for [0, 1] output + padding="same", + use_bias=False, + ), + # Reshape and Permute + layers.Reshape((-1, number_of_tokens)), # (B, H*W, num_of_tokens) + layers.Permute((2, 1)), + ] + )( + x + ) # (B, num_of_tokens, H*W) + + # Reshape the input to align it with the output of the conv block. + num_filters = inputs.shape[-1] + inputs = layers.Reshape((1, -1, num_filters))(inputs) # inputs == (B, 1, H*W, C) + + # Element-Wise multiplication of the attention maps and the inputs + attended_inputs = ( + ops.expand_dims(attention_maps, axis=-1) * inputs + ) # (B, num_tokens, H*W, C) + + # Global average pooling the element wise multiplication result. + outputs = ops.mean(attended_inputs, axis=2) # (B, num_tokens, C) + return outputs + + +""" +## Transformer block +""" + + +def transformer(encoded_patches): + # Layer normalization 1. + x1 = layers.LayerNormalization(epsilon=LAYER_NORM_EPS)(encoded_patches) + + # Multi Head Self Attention layer 1. + attention_output = layers.MultiHeadAttention( + num_heads=NUM_HEADS, key_dim=PROJECTION_DIM, dropout=0.1 + )(x1, x1) + + # Skip connection 1. + x2 = layers.Add()([attention_output, encoded_patches]) + + # Layer normalization 2. + x3 = layers.LayerNormalization(epsilon=LAYER_NORM_EPS)(x2) + + # MLP layer 1. + x4 = mlp(x3, hidden_units=MLP_UNITS, dropout_rate=0.1) + + # Skip connection 2. 
+ encoded_patches = layers.Add()([x4, x2]) + return encoded_patches + + +""" +## ViT model with the TokenLearner module +""" + + +def create_vit_classifier(use_token_learner=True, token_learner_units=NUM_TOKENS): + inputs = layers.Input(shape=INPUT_SHAPE) # (B, H, W, C) + + # Augment data. + augmented = data_augmentation(inputs) + + # Create patches and project the pathces. + projected_patches = layers.Conv2D( + filters=PROJECTION_DIM, + kernel_size=(PATCH_SIZE, PATCH_SIZE), + strides=(PATCH_SIZE, PATCH_SIZE), + padding="VALID", + )(augmented) + _, h, w, c = projected_patches.shape + projected_patches = layers.Reshape((h * w, c))( + projected_patches + ) # (B, number_patches, projection_dim) + + # Add positional embeddings to the projected patches. + encoded_patches = PatchEncoder( + num_patches=NUM_PATCHES, projection_dim=PROJECTION_DIM + )( + projected_patches + ) # (B, number_patches, projection_dim) + encoded_patches = layers.Dropout(0.1)(encoded_patches) + + # Iterate over the number of layers and stack up blocks of + # Transformer. + for i in range(NUM_LAYERS): + # Add a Transformer block. + encoded_patches = transformer(encoded_patches) + + # Add TokenLearner layer in the middle of the + # architecture. The paper suggests that anywhere + # between 1/2 or 3/4 will work well. + if use_token_learner and i == NUM_LAYERS // 2: + _, hh, c = encoded_patches.shape + h = int(math.sqrt(hh)) + encoded_patches = layers.Reshape((h, h, c))( + encoded_patches + ) # (B, h, h, projection_dim) + encoded_patches = token_learner( + encoded_patches, token_learner_units + ) # (B, num_tokens, c) + + # Layer normalization and Global average pooling. + representation = layers.LayerNormalization(epsilon=LAYER_NORM_EPS)(encoded_patches) + representation = layers.GlobalAvgPool1D()(representation) + + # Classify outputs. + outputs = layers.Dense(NUM_CLASSES, activation="softmax")(representation) + + # Create the Keras model. + model = keras.Model(inputs=inputs, outputs=outputs) + return model + + +""" +As shown in the [TokenLearner paper](https://openreview.net/forum?id=z-l1kpDXs88), it is +almost always advantageous to include the TokenLearner module in the middle of the +network. +""" + +""" +## Training utility +""" + + +def run_experiment(model): + # Initialize the AdamW optimizer. + optimizer = keras.optimizers.AdamW( + learning_rate=LEARNING_RATE, weight_decay=WEIGHT_DECAY + ) + + # Compile the model with the optimizer, loss function + # and the metrics. + model.compile( + optimizer=optimizer, + loss="sparse_categorical_crossentropy", + metrics=[ + keras.metrics.SparseCategoricalAccuracy(name="accuracy"), + keras.metrics.SparseTopKCategoricalAccuracy(5, name="top-5-accuracy"), + ], + ) + + # Define callbacks + checkpoint_filepath = "/tmp/checkpoint.weights.h5" + checkpoint_callback = keras.callbacks.ModelCheckpoint( + checkpoint_filepath, + monitor="val_accuracy", + save_best_only=True, + save_weights_only=True, + ) + + # Train the model. 
+ _ = model.fit( + train_ds, + epochs=EPOCHS, + validation_data=val_ds, + callbacks=[checkpoint_callback], + ) + + model.load_weights(checkpoint_filepath) + _, accuracy, top_5_accuracy = model.evaluate(test_ds) + print(f"Test accuracy: {round(accuracy * 100, 2)}%") + print(f"Test top 5 accuracy: {round(top_5_accuracy * 100, 2)}%") + + +""" +## Train and evaluate a ViT with TokenLearner +""" + +vit_token_learner = create_vit_classifier() +run_experiment(vit_token_learner) + +""" +## Results + +We experimented with and without the TokenLearner inside the mini ViT we implemented +(with the same hyperparameters presented in this example). Here are our results: + +| **TokenLearner** | **# tokens in
TokenLearner** | **Top-1 Acc
(Averaged across 5 runs)** | **GFLOPs** | **TensorBoard** | +|:---:|:---:|:---:|:---:|:---:| +| N | - | 56.112% | 0.0184 | [Link](https://tensorboard.dev/experiment/vkCwM49dQZ2RiK0ZT4mj7w/) | +| Y | 8 | **56.55%** | **0.0153** | [Link](https://tensorboard.dev/experiment/vkCwM49dQZ2RiK0ZT4mj7w/) | +| N | - | 56.37% | 0.0184 | [Link](https://tensorboard.dev/experiment/hdyJ4wznQROwqZTgbtmztQ/) | +| Y | 4 | **56.4980%** | **0.0147** | [Link](https://tensorboard.dev/experiment/hdyJ4wznQROwqZTgbtmztQ/) | +| N | - (# Transformer layers: 8) | 55.36% | 0.0359 | [Link](https://tensorboard.dev/experiment/sepBK5zNSaOtdCeEG6SV9w/) | + +TokenLearner is able to consistently outperform our mini ViT without the module. It is +also interesting to notice that it was also able to outperform a deeper version of our +mini ViT (with 8 layers). The authors also report similar observations in the paper and +they attribute this to the adaptiveness of TokenLearner. + +One should also note that the FLOPs count **decreases** considerably with the addition of +the TokenLearner module. With less FLOPs count the TokenLearner module is able to +deliver better results. This aligns very well with the authors' findings. + +Additionally, the authors [introduced](https://github.com/google-research/scenic/blob/main/scenic/projects/token_learner/model.py#L104) +a newer version of the TokenLearner for smaller training data regimes. Quoting the authors: + +> Instead of using 4 conv. layers with small channels to implement spatial attention, + this version uses 2 grouped conv. layers with more channels. It also uses softmax + instead of sigmoid. We confirmed that this version works better when having limited + training data, such as training with ImageNet1K from scratch. + +We experimented with this module and in the following table we summarize the results: + +| **# Groups** | **# Tokens** | **Top-1 Acc** | **GFLOPs** | **TensorBoard** | +|:---:|:---:|:---:|:---:|:---:| +| 4 | 4 | 54.638% | 0.0149 | [Link](https://tensorboard.dev/experiment/KmfkGqAGQjikEw85phySmw/) | +| 8 | 8 | 54.898% | 0.0146 | [Link](https://tensorboard.dev/experiment/0PpgYOq9RFWV9njX6NJQ2w/) | +| 4 | 8 | 55.196% | 0.0149 | [Link](https://tensorboard.dev/experiment/WUkrHbZASdu3zrfmY4ETZg/) | + +Please note that we used the same hyperparameters presented in this example. Our +implementation is available +[in this notebook](https://github.com/ariG23498/TokenLearner/blob/master/TokenLearner-V1.1.ipynb). +We acknowledge that the results with this new TokenLearner module are slightly off +than expected and this might mitigate with hyperparameter tuning. + +*Note*: To compute the FLOPs of our models we used +[this utility](https://github.com/AdityaKane2001/regnety/blob/main/regnety/utils/model_utils.py#L27) +from [this repository](https://github.com/AdityaKane2001/regnety). +""" + +""" +## Number of parameters + +You may have noticed that adding the TokenLearner module increases the number of +parameters of the base network. But that does not mean it is less efficient as shown by +[Dehghani et al.](https://arxiv.org/abs/2110.12894). Similar findings were reported +by [Bello et al.](https://arxiv.org/abs/2103.07579) as well. The TokenLearner module +helps reducing the FLOPS in the overall network thereby helping to reduce the memory +footprint. +""" + +""" +## Final notes + +* TokenFuser: The authors of the paper also propose another module named TokenFuser. 
This +module helps in remapping the representation of the TokenLearner output back to its +original spatial resolution. To reuse the TokenLearner in the ViT architecture, the +TokenFuser is a must. We first learn the tokens from the TokenLearner, build a +representation of the tokens from a Transformer layer and then remap the representation +into the original spatial resolution, so that it can again be consumed by a TokenLearner. +Note here that you can only use the TokenLearner module once in entire ViT model if not +paired with the TokenFuser. +* Use of these modules for video: The authors also suggest that TokenFuser goes really +well with Vision Transformers for Videos ([Arnab et al.](https://arxiv.org/abs/2103.15691)). + +We are grateful to [JarvisLabs](https://jarvislabs.ai/) and +[Google Developers Experts](https://developers.google.com/programs/experts/) +program for helping with GPU credits. Also, we are thankful to Michael Ryoo (first +author of TokenLearner) for fruitful discussions. +""" diff --git a/knowledge_base/vision/video_classification.py b/knowledge_base/vision/video_classification.py new file mode 100644 index 0000000000000000000000000000000000000000..2d3805164a90ed3ec9af70269806aaf735836da2 --- /dev/null +++ b/knowledge_base/vision/video_classification.py @@ -0,0 +1,384 @@ +""" +Title: Video Classification with a CNN-RNN Architecture +Author: [Sayak Paul](https://twitter.com/RisingSayak) +Date created: 2021/05/28 +Last modified: 2023/12/08 +Description: Training a video classifier with transfer learning and a recurrent model on the UCF101 dataset. +Accelerator: GPU +""" + +""" +This example demonstrates video classification, an important use-case with +applications in recommendations, security, and so on. +We will be using the [UCF101 dataset](https://www.crcv.ucf.edu/data/UCF101.php) +to build our video classifier. The dataset consists of videos categorized into different +actions, like cricket shot, punching, biking, etc. This dataset is commonly used to +build action recognizers, which are an application of video classification. + +A video consists of an ordered sequence of frames. Each frame contains *spatial* +information, and the sequence of those frames contains *temporal* information. To model +both of these aspects, we use a hybrid architecture that consists of convolutions +(for spatial processing) as well as recurrent layers (for temporal processing). +Specifically, we'll use a Convolutional Neural Network (CNN) and a Recurrent Neural +Network (RNN) consisting of [GRU layers](https://keras.io/api/layers/recurrent_layers/gru/). +This kind of hybrid architecture is popularly known as a **CNN-RNN**. + +This example requires TensorFlow 2.5 or higher, as well as TensorFlow Docs, which can be +installed using the following command: +""" + +"""shell +pip install -q git+https://github.com/tensorflow/docs +""" + +""" +## Data collection + +In order to keep the runtime of this example relatively short, we will be using a +subsampled version of the original UCF101 dataset. You can refer to +[this notebook](https://colab.research.google.com/github/sayakpaul/Action-Recognition-in-TensorFlow/blob/main/Data_Preparation_UCF101.ipynb) +to know how the subsampling was done. 
+""" + +"""shell +!wget -q https://github.com/sayakpaul/Action-Recognition-in-TensorFlow/releases/download/v1.0.0/ucf101_top5.tar.gz +tar xf ucf101_top5.tar.gz +""" + +""" +## Setup +""" +import os + +import keras +from imutils import paths + +import matplotlib.pyplot as plt +import pandas as pd +import numpy as np +import imageio +import cv2 +from IPython.display import Image + +""" +## Define hyperparameters +""" + +IMG_SIZE = 224 +BATCH_SIZE = 64 +EPOCHS = 10 + +MAX_SEQ_LENGTH = 20 +NUM_FEATURES = 2048 + +""" +## Data preparation +""" + +train_df = pd.read_csv("train.csv") +test_df = pd.read_csv("test.csv") + +print(f"Total videos for training: {len(train_df)}") +print(f"Total videos for testing: {len(test_df)}") + +train_df.sample(10) + +""" +One of the many challenges of training video classifiers is figuring out a way to feed +the videos to a network. [This blog post](https://blog.coast.ai/five-video-classification-methods-implemented-in-keras-and-tensorflow-99cad29cc0b5) +discusses five such methods. Since a video is an ordered sequence of frames, we could +just extract the frames and put them in a 3D tensor. But the number of frames may differ +from video to video which would prevent us from stacking them into batches +(unless we use padding). As an alternative, we can **save video frames at a fixed +interval until a maximum frame count is reached**. In this example we will do +the following: + +1. Capture the frames of a video. +2. Extract frames from the videos until a maximum frame count is reached. +3. In the case, where a video's frame count is lesser than the maximum frame count we +will pad the video with zeros. + +Note that this workflow is identical to [problems involving texts sequences](https://developers.google.com/machine-learning/guides/text-classification/). Videos of the UCF101 dataset is [known](https://www.crcv.ucf.edu/papers/UCF101_CRCV-TR-12-01.pdf) +to not contain extreme variations in objects and actions across frames. Because of this, +it may be okay to only consider a few frames for the learning task. But this approach may +not generalize well to other video classification problems. We will be using +[OpenCV's `VideoCapture()` method](https://docs.opencv.org/master/dd/d43/tutorial_py_video_display.html) +to read frames from videos. +""" + +# The following two methods are taken from this tutorial: +# https://www.tensorflow.org/hub/tutorials/action_recognition_with_tf_hub + + +def crop_center_square(frame): + y, x = frame.shape[0:2] + min_dim = min(y, x) + start_x = (x // 2) - (min_dim // 2) + start_y = (y // 2) - (min_dim // 2) + return frame[start_y : start_y + min_dim, start_x : start_x + min_dim] + + +def load_video(path, max_frames=0, resize=(IMG_SIZE, IMG_SIZE)): + cap = cv2.VideoCapture(path) + frames = [] + try: + while True: + ret, frame = cap.read() + if not ret: + break + frame = crop_center_square(frame) + frame = cv2.resize(frame, resize) + frame = frame[:, :, [2, 1, 0]] + frames.append(frame) + + if len(frames) == max_frames: + break + finally: + cap.release() + return np.array(frames) + + +""" +We can use a pre-trained network to extract meaningful features from the extracted +frames. The [`Keras Applications`](https://keras.io/api/applications/) module provides +a number of state-of-the-art models pre-trained on the [ImageNet-1k dataset](http://image-net.org/). +We will be using the [InceptionV3 model](https://arxiv.org/abs/1512.00567) for this purpose. 
+""" + + +def build_feature_extractor(): + feature_extractor = keras.applications.InceptionV3( + weights="imagenet", + include_top=False, + pooling="avg", + input_shape=(IMG_SIZE, IMG_SIZE, 3), + ) + preprocess_input = keras.applications.inception_v3.preprocess_input + + inputs = keras.Input((IMG_SIZE, IMG_SIZE, 3)) + preprocessed = preprocess_input(inputs) + + outputs = feature_extractor(preprocessed) + return keras.Model(inputs, outputs, name="feature_extractor") + + +feature_extractor = build_feature_extractor() + +""" +The labels of the videos are strings. Neural networks do not understand string values, +so they must be converted to some numerical form before they are fed to the model. Here +we will use the [`StringLookup`](https://keras.io/api/layers/preprocessing_layers/categorical/string_lookup) +layer encode the class labels as integers. +""" + +label_processor = keras.layers.StringLookup( + num_oov_indices=0, vocabulary=np.unique(train_df["tag"]) +) +print(label_processor.get_vocabulary()) + +""" +Finally, we can put all the pieces together to create our data processing utility. +""" + + +def prepare_all_videos(df, root_dir): + num_samples = len(df) + video_paths = df["video_name"].values.tolist() + labels = df["tag"].values + labels = keras.ops.convert_to_numpy(label_processor(labels[..., None])) + + # `frame_masks` and `frame_features` are what we will feed to our sequence model. + # `frame_masks` will contain a bunch of booleans denoting if a timestep is + # masked with padding or not. + frame_masks = np.zeros(shape=(num_samples, MAX_SEQ_LENGTH), dtype="bool") + frame_features = np.zeros( + shape=(num_samples, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32" + ) + + # For each video. + for idx, path in enumerate(video_paths): + # Gather all its frames and add a batch dimension. + frames = load_video(os.path.join(root_dir, path)) + frames = frames[None, ...] + + # Initialize placeholders to store the masks and features of the current video. + temp_frame_mask = np.zeros( + shape=( + 1, + MAX_SEQ_LENGTH, + ), + dtype="bool", + ) + temp_frame_features = np.zeros( + shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32" + ) + + # Extract features from the frames of the current video. + for i, batch in enumerate(frames): + video_length = batch.shape[0] + length = min(MAX_SEQ_LENGTH, video_length) + for j in range(length): + temp_frame_features[i, j, :] = feature_extractor.predict( + batch[None, j, :], + verbose=0, + ) + temp_frame_mask[i, :length] = 1 # 1 = not masked, 0 = masked + + frame_features[idx,] = temp_frame_features.squeeze() + frame_masks[idx,] = temp_frame_mask.squeeze() + + return (frame_features, frame_masks), labels + + +train_data, train_labels = prepare_all_videos(train_df, "train") +test_data, test_labels = prepare_all_videos(test_df, "test") + +print(f"Frame features in train set: {train_data[0].shape}") +print(f"Frame masks in train set: {train_data[1].shape}") + +""" +The above code block will take ~20 minutes to execute depending on the machine it's being +executed. +""" + +""" +## The sequence model + +Now, we can feed this data to a sequence model consisting of recurrent layers like `GRU`. + +""" + + +# Utility for our sequence model. 
+def get_sequence_model(): + class_vocab = label_processor.get_vocabulary() + + frame_features_input = keras.Input((MAX_SEQ_LENGTH, NUM_FEATURES)) + mask_input = keras.Input((MAX_SEQ_LENGTH,), dtype="bool") + + # Refer to the following tutorial to understand the significance of using `mask`: + # https://keras.io/api/layers/recurrent_layers/gru/ + x = keras.layers.GRU(16, return_sequences=True)( + frame_features_input, mask=mask_input + ) + x = keras.layers.GRU(8)(x) + x = keras.layers.Dropout(0.4)(x) + x = keras.layers.Dense(8, activation="relu")(x) + output = keras.layers.Dense(len(class_vocab), activation="softmax")(x) + + rnn_model = keras.Model([frame_features_input, mask_input], output) + + rnn_model.compile( + loss="sparse_categorical_crossentropy", optimizer="adam", metrics=["accuracy"] + ) + return rnn_model + + +# Utility for running experiments. +def run_experiment(): + filepath = "/tmp/video_classifier/ckpt.weights.h5" + checkpoint = keras.callbacks.ModelCheckpoint( + filepath, save_weights_only=True, save_best_only=True, verbose=1 + ) + + seq_model = get_sequence_model() + history = seq_model.fit( + [train_data[0], train_data[1]], + train_labels, + validation_split=0.3, + epochs=EPOCHS, + callbacks=[checkpoint], + ) + + seq_model.load_weights(filepath) + _, accuracy = seq_model.evaluate([test_data[0], test_data[1]], test_labels) + print(f"Test accuracy: {round(accuracy * 100, 2)}%") + + return history, seq_model + + +_, sequence_model = run_experiment() + +""" +**Note**: To keep the runtime of this example relatively short, we just used a few +training examples. This number of training examples is low with respect to the sequence +model being used that has 99,909 trainable parameters. You are encouraged to sample more +data from the UCF101 dataset using [the notebook](https://colab.research.google.com/github/sayakpaul/Action-Recognition-in-TensorFlow/blob/main/Data_Preparation_UCF101.ipynb) mentioned above and train the same model. +""" + +""" +## Inference +""" + + +def prepare_single_video(frames): + frames = frames[None, ...] + frame_mask = np.zeros( + shape=( + 1, + MAX_SEQ_LENGTH, + ), + dtype="bool", + ) + frame_features = np.zeros(shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32") + + for i, batch in enumerate(frames): + video_length = batch.shape[0] + length = min(MAX_SEQ_LENGTH, video_length) + for j in range(length): + frame_features[i, j, :] = feature_extractor.predict(batch[None, j, :]) + frame_mask[i, :length] = 1 # 1 = not masked, 0 = masked + + return frame_features, frame_mask + + +def sequence_prediction(path): + class_vocab = label_processor.get_vocabulary() + + frames = load_video(os.path.join("test", path)) + frame_features, frame_mask = prepare_single_video(frames) + probabilities = sequence_model.predict([frame_features, frame_mask])[0] + + for i in np.argsort(probabilities)[::-1]: + print(f" {class_vocab[i]}: {probabilities[i] * 100:5.2f}%") + return frames + + +# This utility is for visualization. 
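+# It converts the predicted frames to `uint8` and writes them out as `animation.gif`
+# so the clip can be displayed inline.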
+# Referenced from:
+# https://www.tensorflow.org/hub/tutorials/action_recognition_with_tf_hub
+def to_gif(images):
+    converted_images = images.astype(np.uint8)
+    imageio.mimsave("animation.gif", converted_images, duration=100)
+    return Image("animation.gif")
+
+
+test_video = np.random.choice(test_df["video_name"].values.tolist())
+print(f"Test video path: {test_video}")
+test_frames = sequence_prediction(test_video)
+to_gif(test_frames[:MAX_SEQ_LENGTH])
+
+"""
+## Next steps
+
+* In this example, we made use of transfer learning for extracting meaningful features
+from video frames. You could also fine-tune the pre-trained network to see how that
+affects the end results.
+* For speed-accuracy trade-offs, you can try out other models present inside
+`keras.applications`.
+* Try different values of `MAX_SEQ_LENGTH` to observe how that affects the
+performance.
+* Train on a higher number of classes and see if you are able to get good performance.
+* Following [this tutorial](https://www.tensorflow.org/hub/tutorials/action_recognition_with_tf_hub), try a
+[pre-trained action recognition model](https://arxiv.org/abs/1705.07750) from DeepMind.
+* Rolling-averaging can be a useful technique for video classification, and it can be
+combined with a standard image classification model to infer on videos.
+[This tutorial](https://www.pyimagesearch.com/2019/07/15/video-classification-with-keras-and-deep-learning/)
+will help you understand how to use rolling-averaging with an image classifier.
+* When there are variations between the frames of a video, not all the frames might be
+equally important to decide its category. In those situations, putting a
+[self-attention layer](https://www.tensorflow.org/api_docs/python/tf/keras/layers/Attention) in the
+sequence model will likely yield better results.
+* Following [this book chapter](https://livebook.manning.com/book/deep-learning-with-python-second-edition/chapter-11),
+you can implement Transformers-based models for processing videos.
+"""
diff --git a/knowledge_base/vision/video_transformers.py b/knowledge_base/vision/video_transformers.py
new file mode 100644
index 0000000000000000000000000000000000000000..18b21e1ae3a598588f3327a38c7115e3858448a4
--- /dev/null
+++ b/knowledge_base/vision/video_transformers.py
@@ -0,0 +1,414 @@
+"""
+Title: Video Classification with Transformers
+Author: [Sayak Paul](https://twitter.com/RisingSayak)
+Date created: 2021/06/08
+Last modified: 2023/07/22
+Description: Training a video classifier with hybrid transformers.
+Accelerator: GPU
+Converted to Keras 3 by: [Soumik Rakshit](http://github.com/soumik12345)
+"""
+
+"""
+This example is a follow-up to the
+[Video Classification with a CNN-RNN Architecture](https://keras.io/examples/vision/video_classification/)
+example. This time, we will be using a Transformer-based model
+([Vaswani et al.](https://arxiv.org/abs/1706.03762)) to classify videos. You can follow
+[this book chapter](https://livebook.manning.com/book/deep-learning-with-python-second-edition/chapter-11)
+in case you need an introduction to Transformers (with code). After reading this
+example, you will know how to develop hybrid Transformer-based models for video
+classification that operate on CNN feature maps.
+""" + +"""shell +pip install -q git+https://github.com/tensorflow/docs +""" + +""" +## Data collection + +As done in the [predecessor](https://keras.io/examples/vision/video_classification/) to +this example, we will be using a subsampled version of the +[UCF101 dataset](https://www.crcv.ucf.edu/data/UCF101.php), +a well-known benchmark dataset. In case you want to operate on a larger subsample or +even the entire dataset, please refer to +[this notebook](https://colab.research.google.com/github/sayakpaul/Action-Recognition-in-TensorFlow/blob/main/Data_Preparation_UCF101.ipynb). +""" + +"""shell +wget -q https://github.com/sayakpaul/Action-Recognition-in-TensorFlow/releases/download/v1.0.0/ucf101_top5.tar.gz +tar -xf ucf101_top5.tar.gz +""" + +""" +## Setup +""" + +import os +import keras +from keras import layers +from keras.applications.densenet import DenseNet121 + +from tensorflow_docs.vis import embed + +import matplotlib.pyplot as plt +import pandas as pd +import numpy as np +import imageio +import cv2 + +keras.utils.set_random_seed(1234) + +""" +## Define hyperparameters +""" + +MAX_SEQ_LENGTH = 20 +NUM_FEATURES = 1024 +IMG_SIZE = 128 + +EPOCHS = 5 + +""" +## Data preparation + +We will mostly be following the same data preparation steps in this example, except for +the following changes: + +* We reduce the image size to 128x128 instead of 224x224 to speed up computation. +* Instead of using a pre-trained [InceptionV3](https://arxiv.org/abs/1512.00567) network, +we use a pre-trained +[DenseNet121](http://openaccess.thecvf.com/content_cvpr_2017/papers/Huang_Densely_Connected_Convolutional_CVPR_2017_paper.pdf) +for feature extraction. +* We directly pad shorter videos to length `MAX_SEQ_LENGTH`. + +First, let's load up the +[DataFrames](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.html). +""" + +train_df = pd.read_csv("train.csv") +test_df = pd.read_csv("test.csv") + +print(f"Total videos for training: {len(train_df)}") +print(f"Total videos for testing: {len(test_df)}") + +center_crop_layer = layers.CenterCrop(IMG_SIZE, IMG_SIZE) + + +def crop_center(frame): + cropped = center_crop_layer(frame[None, ...]) + cropped = keras.ops.convert_to_numpy(cropped) + cropped = keras.ops.squeeze(cropped) + return cropped + + +# Following method is modified from this tutorial: +# https://www.tensorflow.org/hub/tutorials/action_recognition_with_tf_hub +def load_video(path, max_frames=0, offload_to_cpu=False): + cap = cv2.VideoCapture(path) + frames = [] + try: + while True: + ret, frame = cap.read() + if not ret: + break + frame = frame[:, :, [2, 1, 0]] + frame = crop_center(frame) + if offload_to_cpu and keras.backend.backend() == "torch": + frame = frame.to("cpu") + frames.append(frame) + + if len(frames) == max_frames: + break + finally: + cap.release() + if offload_to_cpu and keras.backend.backend() == "torch": + return np.array([frame.to("cpu").numpy() for frame in frames]) + return np.array(frames) + + +def build_feature_extractor(): + feature_extractor = DenseNet121( + weights="imagenet", + include_top=False, + pooling="avg", + input_shape=(IMG_SIZE, IMG_SIZE, 3), + ) + preprocess_input = keras.applications.densenet.preprocess_input + + inputs = keras.Input((IMG_SIZE, IMG_SIZE, 3)) + preprocessed = preprocess_input(inputs) + + outputs = feature_extractor(preprocessed) + return keras.Model(inputs, outputs, name="feature_extractor") + + +feature_extractor = build_feature_extractor() + + +# Label preprocessing with StringLookup. 
+label_processor = keras.layers.StringLookup( + num_oov_indices=0, vocabulary=np.unique(train_df["tag"]), mask_token=None +) +print(label_processor.get_vocabulary()) + + +def prepare_all_videos(df, root_dir): + num_samples = len(df) + video_paths = df["video_name"].values.tolist() + labels = df["tag"].values + labels = label_processor(labels[..., None]).numpy() + + # `frame_features` are what we will feed to our sequence model. + frame_features = np.zeros( + shape=(num_samples, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32" + ) + + # For each video. + for idx, path in enumerate(video_paths): + # Gather all its frames and add a batch dimension. + frames = load_video(os.path.join(root_dir, path)) + + # Pad shorter videos. + if len(frames) < MAX_SEQ_LENGTH: + diff = MAX_SEQ_LENGTH - len(frames) + padding = np.zeros((diff, IMG_SIZE, IMG_SIZE, 3)) + frames = np.concatenate(frames, padding) + + frames = frames[None, ...] + + # Initialize placeholder to store the features of the current video. + temp_frame_features = np.zeros( + shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32" + ) + + # Extract features from the frames of the current video. + for i, batch in enumerate(frames): + video_length = batch.shape[0] + length = min(MAX_SEQ_LENGTH, video_length) + for j in range(length): + if np.mean(batch[j, :]) > 0.0: + temp_frame_features[i, j, :] = feature_extractor.predict( + batch[None, j, :] + ) + + else: + temp_frame_features[i, j, :] = 0.0 + + frame_features[idx,] = temp_frame_features.squeeze() + + return frame_features, labels + + +""" +Calling `prepare_all_videos()` on `train_df` and `test_df` takes ~20 minutes to +complete. For this reason, to save time, here we download already preprocessed NumPy arrays: +""" + +"""shell +!wget -q https://git.io/JZmf4 -O top5_data_prepared.tar.gz +!tar -xf top5_data_prepared.tar.gz +""" + +train_data, train_labels = np.load("train_data.npy"), np.load("train_labels.npy") +test_data, test_labels = np.load("test_data.npy"), np.load("test_labels.npy") + +print(f"Frame features in train set: {train_data.shape}") + +""" +## Building the Transformer-based model + +We will be building on top of the code shared in +[this book chapter](https://livebook.manning.com/book/deep-learning-with-python-second-edition/chapter-11) of +[Deep Learning with Python (Second ed.)](https://www.manning.com/books/deep-learning-with-python) +by Franรงois Chollet. + +First, self-attention layers that form the basic blocks of a Transformer are +order-agnostic. Since videos are ordered sequences of frames, we need our +Transformer model to take into account order information. +We do this via **positional encoding**. +We simply embed the positions of the frames present inside videos with an +[`Embedding` layer](https://keras.io/api/layers/core_layers/embedding). We then +add these positional embeddings to the precomputed CNN feature maps. 
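+
+Concretely, a batch of precomputed features has shape
+`(batch_size, MAX_SEQ_LENGTH, NUM_FEATURES) = (batch_size, 20, 1024)`, while the
+positional embeddings have shape `(20, 1024)`, so the addition below simply broadcasts
+them across the batch dimension.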
+""" + + +class PositionalEmbedding(layers.Layer): + def __init__(self, sequence_length, output_dim, **kwargs): + super().__init__(**kwargs) + self.position_embeddings = layers.Embedding( + input_dim=sequence_length, output_dim=output_dim + ) + self.sequence_length = sequence_length + self.output_dim = output_dim + + def build(self, input_shape): + self.position_embeddings.build(input_shape) + + def call(self, inputs): + # The inputs are of shape: `(batch_size, frames, num_features)` + inputs = keras.ops.cast(inputs, self.compute_dtype) + length = keras.ops.shape(inputs)[1] + positions = keras.ops.arange(start=0, stop=length, step=1) + embedded_positions = self.position_embeddings(positions) + return inputs + embedded_positions + + +""" +Now, we can create a subclassed layer for the Transformer. +""" + + +class TransformerEncoder(layers.Layer): + def __init__(self, embed_dim, dense_dim, num_heads, **kwargs): + super().__init__(**kwargs) + self.embed_dim = embed_dim + self.dense_dim = dense_dim + self.num_heads = num_heads + self.attention = layers.MultiHeadAttention( + num_heads=num_heads, key_dim=embed_dim, dropout=0.3 + ) + self.dense_proj = keras.Sequential( + [ + layers.Dense(dense_dim, activation=keras.activations.gelu), + layers.Dense(embed_dim), + ] + ) + self.layernorm_1 = layers.LayerNormalization() + self.layernorm_2 = layers.LayerNormalization() + + def call(self, inputs, mask=None): + attention_output = self.attention(inputs, inputs, attention_mask=mask) + proj_input = self.layernorm_1(inputs + attention_output) + proj_output = self.dense_proj(proj_input) + return self.layernorm_2(proj_input + proj_output) + + +""" +## Utility functions for training +""" + + +def get_compiled_model(shape): + sequence_length = MAX_SEQ_LENGTH + embed_dim = NUM_FEATURES + dense_dim = 4 + num_heads = 1 + classes = len(label_processor.get_vocabulary()) + + inputs = keras.Input(shape=shape) + x = PositionalEmbedding( + sequence_length, embed_dim, name="frame_position_embedding" + )(inputs) + x = TransformerEncoder(embed_dim, dense_dim, num_heads, name="transformer_layer")(x) + x = layers.GlobalMaxPooling1D()(x) + x = layers.Dropout(0.5)(x) + outputs = layers.Dense(classes, activation="softmax")(x) + model = keras.Model(inputs, outputs) + + model.compile( + optimizer="adam", + loss="sparse_categorical_crossentropy", + metrics=["accuracy"], + ) + return model + + +def run_experiment(): + filepath = "/tmp/video_classifier.weights.h5" + checkpoint = keras.callbacks.ModelCheckpoint( + filepath, save_weights_only=True, save_best_only=True, verbose=1 + ) + + model = get_compiled_model(train_data.shape[1:]) + history = model.fit( + train_data, + train_labels, + validation_split=0.15, + epochs=EPOCHS, + callbacks=[checkpoint], + ) + + model.load_weights(filepath) + _, accuracy = model.evaluate(test_data, test_labels) + print(f"Test accuracy: {round(accuracy * 100, 2)}%") + + return model + + +""" +## Model training and inference +""" + +trained_model = run_experiment() + +""" +**Note**: This model has ~4.23 Million parameters, which is way more than the sequence +model (99918 parameters) we used in the prequel of this example. This kind of +Transformer model works best with a larger dataset and a longer pre-training schedule. +""" + + +def prepare_single_video(frames): + frame_features = np.zeros(shape=(1, MAX_SEQ_LENGTH, NUM_FEATURES), dtype="float32") + + # Pad shorter videos. 
+ if len(frames) < MAX_SEQ_LENGTH: + diff = MAX_SEQ_LENGTH - len(frames) + padding = np.zeros((diff, IMG_SIZE, IMG_SIZE, 3)) + frames = np.concatenate(frames, padding) + + frames = frames[None, ...] + + # Extract features from the frames of the current video. + for i, batch in enumerate(frames): + video_length = batch.shape[0] + length = min(MAX_SEQ_LENGTH, video_length) + for j in range(length): + if np.mean(batch[j, :]) > 0.0: + frame_features[i, j, :] = feature_extractor.predict(batch[None, j, :]) + else: + frame_features[i, j, :] = 0.0 + + return frame_features + + +def predict_action(path): + class_vocab = label_processor.get_vocabulary() + + frames = load_video(os.path.join("test", path), offload_to_cpu=True) + frame_features = prepare_single_video(frames) + probabilities = trained_model.predict(frame_features)[0] + + plot_x_axis, plot_y_axis = [], [] + + for i in np.argsort(probabilities)[::-1]: + plot_x_axis.append(class_vocab[i]) + plot_y_axis.append(probabilities[i]) + print(f" {class_vocab[i]}: {probabilities[i] * 100:5.2f}%") + + plt.bar(plot_x_axis, plot_y_axis, label=plot_x_axis) + plt.xlabel("class_label") + plt.xlabel("Probability") + plt.show() + + return frames + + +# This utility is for visualization. +# Referenced from: +# https://www.tensorflow.org/hub/tutorials/action_recognition_with_tf_hub +def to_gif(images): + converted_images = images.astype(np.uint8) + imageio.mimsave("animation.gif", converted_images, fps=10) + return embed.embed_file("animation.gif") + + +test_video = np.random.choice(test_df["video_name"].values.tolist()) +print(f"Test video path: {test_video}") +test_frames = predict_action(test_video) +to_gif(test_frames[:MAX_SEQ_LENGTH]) + +""" +The performance of our model is far from optimal, because it was trained on a +small dataset. +""" diff --git a/knowledge_base/vision/visualizing_what_convnets_learn.py b/knowledge_base/vision/visualizing_what_convnets_learn.py new file mode 100644 index 0000000000000000000000000000000000000000..d855ec84e8a00dd1b4e9d50147437227694a5f58 --- /dev/null +++ b/knowledge_base/vision/visualizing_what_convnets_learn.py @@ -0,0 +1,204 @@ +""" +Title: Visualizing what convnets learn +Author: [fchollet](https://twitter.com/fchollet) +Date created: 2020/05/29 +Last modified: 2020/05/29 +Description: Displaying the visual patterns that convnet filters respond to. +Accelerator: GPU +""" + +""" +## Introduction + +In this example, we look into what sort of visual patterns image classification models +learn. We'll be using the `ResNet50V2` model, trained on the ImageNet dataset. + +Our process is simple: we will create input images that maximize the activation of +specific filters in a target layer (picked somewhere in the middle of the model: layer +`conv3_block4_out`). Such images represent a visualization of the +pattern that the filter responds to. +""" + +""" +## Setup +""" + +import os + +os.environ["KERAS_BACKEND"] = "tensorflow" + +import keras +import numpy as np +import tensorflow as tf + +# The dimensions of our input image +img_width = 180 +img_height = 180 +# Our target layer: we will visualize the filters from this layer. +# See `model.summary()` for list of layer names, if you want to change this. 
+layer_name = "conv3_block4_out" + +""" +## Build a feature extraction model +""" + +# Build a ResNet50V2 model loaded with pre-trained ImageNet weights +model = keras.applications.ResNet50V2(weights="imagenet", include_top=False) + +# Set up a model that returns the activation values for our target layer +layer = model.get_layer(name=layer_name) +feature_extractor = keras.Model(inputs=model.inputs, outputs=layer.output) + +""" +## Set up the gradient ascent process + +The "loss" we will maximize is simply the mean of the activation of a specific filter in +our target layer. To avoid border effects, we exclude border pixels. +""" + + +def compute_loss(input_image, filter_index): + activation = feature_extractor(input_image) + # We avoid border artifacts by only involving non-border pixels in the loss. + filter_activation = activation[:, 2:-2, 2:-2, filter_index] + return tf.reduce_mean(filter_activation) + + +""" +Our gradient ascent function simply computes the gradients of the loss above +with regard to the input image, and update the update image so as to move it +towards a state that will activate the target filter more strongly. +""" + + +@tf.function +def gradient_ascent_step(img, filter_index, learning_rate): + with tf.GradientTape() as tape: + tape.watch(img) + loss = compute_loss(img, filter_index) + # Compute gradients. + grads = tape.gradient(loss, img) + # Normalize gradients. + grads = tf.math.l2_normalize(grads) + img += learning_rate * grads + return loss, img + + +""" +## Set up the end-to-end filter visualization loop + +Our process is as follow: + +- Start from a random image that is close to "all gray" (i.e. visually netural) +- Repeatedly apply the gradient ascent step function defined above +- Convert the resulting input image back to a displayable form, by normalizing it, +center-cropping it, and restricting it to the [0, 255] range. +""" + + +def initialize_image(): + # We start from a gray image with some random noise + img = tf.random.uniform((1, img_width, img_height, 3)) + # ResNet50V2 expects inputs in the range [-1, +1]. + # Here we scale our random inputs to [-0.125, +0.125] + return (img - 0.5) * 0.25 + + +def visualize_filter(filter_index): + # We run gradient ascent for 20 steps + iterations = 30 + learning_rate = 10.0 + img = initialize_image() + for iteration in range(iterations): + loss, img = gradient_ascent_step(img, filter_index, learning_rate) + + # Decode the resulting input image + img = deprocess_image(img[0].numpy()) + return loss, img + + +def deprocess_image(img): + # Normalize array: center on 0., ensure variance is 0.15 + img -= img.mean() + img /= img.std() + 1e-5 + img *= 0.15 + + # Center crop + img = img[25:-25, 25:-25, :] + + # Clip to [0, 1] + img += 0.5 + img = np.clip(img, 0, 1) + + # Convert to RGB array + img *= 255 + img = np.clip(img, 0, 255).astype("uint8") + return img + + +""" +Let's try it out with filter 0 in the target layer: +""" + +from IPython.display import Image, display + +loss, img = visualize_filter(0) +keras.utils.save_img("0.png", img) + +""" +This is what an input that maximizes the response of filter 0 in the target layer would +look like: +""" + +display(Image("0.png")) + +""" +## Visualize the first 64 filters in the target layer + +Now, let's make a 8x8 grid of the first 64 filters +in the target layer to get of feel for the range +of different visual patterns that the model has learned. 
+""" + +# Compute image inputs that maximize per-filter activations +# for the first 64 filters of our target layer +all_imgs = [] +for filter_index in range(64): + print("Processing filter %d" % (filter_index,)) + loss, img = visualize_filter(filter_index) + all_imgs.append(img) + +# Build a black picture with enough space for +# our 8 x 8 filters of size 128 x 128, with a 5px margin in between +margin = 5 +n = 8 +cropped_width = img_width - 25 * 2 +cropped_height = img_height - 25 * 2 +width = n * cropped_width + (n - 1) * margin +height = n * cropped_height + (n - 1) * margin +stitched_filters = np.zeros((width, height, 3)) + +# Fill the picture with our saved filters +for i in range(n): + for j in range(n): + img = all_imgs[i * n + j] + stitched_filters[ + (cropped_width + margin) * i : (cropped_width + margin) * i + cropped_width, + (cropped_height + margin) * j : (cropped_height + margin) * j + + cropped_height, + :, + ] = img +keras.utils.save_img("stiched_filters.png", stitched_filters) + +from IPython.display import Image, display + +display(Image("stiched_filters.png")) + +""" +Image classification models see the world by decomposing their inputs over a "vector +basis" of texture filters such as these. + +See also +[this old blog post](https://blog.keras.io/how-convolutional-neural-networks-see-the-world.html) +for analysis and interpretation. +""" diff --git a/knowledge_base/vision/vit_small_ds.py b/knowledge_base/vision/vit_small_ds.py new file mode 100644 index 0000000000000000000000000000000000000000..525967fca619e09dc2867dd46ba437942917ec94 --- /dev/null +++ b/knowledge_base/vision/vit_small_ds.py @@ -0,0 +1,552 @@ +""" +Title: Train a Vision Transformer on small datasets +Author: [Aritra Roy Gosthipaty](https://twitter.com/ariG23498) +Date created: 2022/01/07 +Last modified: 2024/11/27 +Description: Training a ViT from scratch on smaller datasets with shifted patch tokenization and locality self-attention. +Accelerator: GPU +Converted to Keras 3 by: [Sitam Meur](https://github.com/sitamgithub-MSIT) +""" + +""" +## Introduction + +In the academic paper +[An Image is Worth 16x16 Words: Transformers for Image Recognition at Scale](https://arxiv.org/abs/2010.11929), +the authors mention that Vision Transformers (ViT) are data-hungry. Therefore, +pretraining a ViT on a large-sized dataset like JFT300M and fine-tuning +it on medium-sized datasets (like ImageNet) is the only way to beat +state-of-the-art Convolutional Neural Network models. + +The self-attention layer of ViT lacks **locality inductive bias** (the notion that +image pixels are locally correlated and that their correlation maps are translation-invariant). +This is the reason why ViTs need more data. On the other hand, CNNs look at images through +spatial sliding windows, which helps them get better results with smaller datasets. + +In the academic paper +[Vision Transformer for Small-Size Datasets](https://arxiv.org/abs/2112.13492v1), +the authors set out to tackle the problem of locality inductive bias in ViTs. + +The main ideas are: + +- **Shifted Patch Tokenization** +- **Locality Self Attention** + +This example implements the ideas of the paper. A large part of this +example is inspired from +[Image classification with Vision Transformer](https://keras.io/examples/vision/image_classification_with_vision_transformer/). + +_Note_: This example requires TensorFlow 2.6 or higher. 
+``` +""" +""" +## Setup +""" +import math +import numpy as np +import keras +from keras import ops +from keras import layers +import tensorflow as tf +import matplotlib.pyplot as plt + +# Setting seed for reproducibiltiy +SEED = 42 +keras.utils.set_random_seed(SEED) + +""" +## Prepare the data +""" + +NUM_CLASSES = 100 +INPUT_SHAPE = (32, 32, 3) + +(x_train, y_train), (x_test, y_test) = keras.datasets.cifar100.load_data() + +print(f"x_train shape: {x_train.shape} - y_train shape: {y_train.shape}") +print(f"x_test shape: {x_test.shape} - y_test shape: {y_test.shape}") + +""" +## Configure the hyperparameters + +The hyperparameters are different from the paper. Feel free to tune +the hyperparameters yourself. +""" + +# DATA +BUFFER_SIZE = 512 +BATCH_SIZE = 256 + +# AUGMENTATION +IMAGE_SIZE = 72 +PATCH_SIZE = 6 +NUM_PATCHES = (IMAGE_SIZE // PATCH_SIZE) ** 2 + +# OPTIMIZER +LEARNING_RATE = 0.001 +WEIGHT_DECAY = 0.0001 + +# TRAINING +EPOCHS = 50 + +# ARCHITECTURE +LAYER_NORM_EPS = 1e-6 +TRANSFORMER_LAYERS = 8 +PROJECTION_DIM = 64 +NUM_HEADS = 4 +TRANSFORMER_UNITS = [ + PROJECTION_DIM * 2, + PROJECTION_DIM, +] +MLP_HEAD_UNITS = [2048, 1024] + +""" +## Use data augmentation + +A snippet from the paper: + +*"According to DeiT, various techniques are required to effectively +train ViTs. Thus, we applied data augmentations such as CutMix, Mixup, +Auto Augment, Repeated Augment to all models."* + +In this example, we will focus solely on the novelty of the approach +and not on reproducing the paper results. For this reason, we +don't use the mentioned data augmentation schemes. Please feel +free to add to or remove from the augmentation pipeline. +""" + +data_augmentation = keras.Sequential( + [ + layers.Normalization(), + layers.Resizing(IMAGE_SIZE, IMAGE_SIZE), + layers.RandomFlip("horizontal"), + layers.RandomRotation(factor=0.02), + layers.RandomZoom(height_factor=0.2, width_factor=0.2), + ], + name="data_augmentation", +) +# Compute the mean and the variance of the training data for normalization. +data_augmentation.layers[0].adapt(x_train) + +""" +## Implement Shifted Patch Tokenization + +In a ViT pipeline, the input images are divided into patches that are +then linearly projected into tokens. Shifted patch tokenization (STP) +is introduced to combat the low receptive field of ViTs. The steps +for Shifted Patch Tokenization are as follows: + +- Start with an image. +- Shift the image in diagonal directions. +- Concat the diagonally shifted images with the original image. +- Extract patches of the concatenated images. +- Flatten the spatial dimension of all patches. +- Layer normalize the flattened patches and then project it. 
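+
+Concretely, with `PATCH_SIZE = 6`, vanilla tokenization flattens each patch into
+`6 * 6 * 3 = 108` values, whereas concatenating the original image with its four
+diagonally shifted copies gives `5 * 3 = 15` channels, so each patch flattens into
+`6 * 6 * 15 = 540` values before being projected to `PROJECTION_DIM = 64`. The snippet
+below is a minimal sketch of a single "left-up" shift on a dummy tensor, mirroring what
+the layer implemented below does for all four diagonal directions.
+"""
+
+# Minimal sketch of one diagonal ("left-up") shift: crop `half_patch` pixels from the
+# top-left corner, then zero-pad the bottom-right corner back to the original size.
+half_patch = PATCH_SIZE // 2
+dummy_images = ops.ones((1, IMAGE_SIZE, IMAGE_SIZE, 3))
+cropped = ops.image.crop_images(
+    dummy_images,
+    top_cropping=half_patch,
+    left_cropping=half_patch,
+    target_height=IMAGE_SIZE - half_patch,
+    target_width=IMAGE_SIZE - half_patch,
+)
+shifted = ops.image.pad_images(
+    cropped,
+    top_padding=0,
+    left_padding=0,
+    target_height=IMAGE_SIZE,
+    target_width=IMAGE_SIZE,
+)
+print(shifted.shape)  # (1, 72, 72, 3): same spatial size, content moved up and left
+
+"""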
+ +| ![Shifted Patch Toekenization](https://i.imgur.com/bUnHxd0.png) | +| :--: | +| Shifted Patch Tokenization [Source](https://arxiv.org/abs/2112.13492v1) | +""" + + +class ShiftedPatchTokenization(layers.Layer): + def __init__( + self, + image_size=IMAGE_SIZE, + patch_size=PATCH_SIZE, + num_patches=NUM_PATCHES, + projection_dim=PROJECTION_DIM, + vanilla=False, + **kwargs, + ): + super().__init__(**kwargs) + self.vanilla = vanilla # Flag to swtich to vanilla patch extractor + self.image_size = image_size + self.patch_size = patch_size + self.half_patch = patch_size // 2 + self.flatten_patches = layers.Reshape((num_patches, -1)) + self.projection = layers.Dense(units=projection_dim) + self.layer_norm = layers.LayerNormalization(epsilon=LAYER_NORM_EPS) + + def crop_shift_pad(self, images, mode): + # Build the diagonally shifted images + if mode == "left-up": + crop_height = self.half_patch + crop_width = self.half_patch + shift_height = 0 + shift_width = 0 + elif mode == "left-down": + crop_height = 0 + crop_width = self.half_patch + shift_height = self.half_patch + shift_width = 0 + elif mode == "right-up": + crop_height = self.half_patch + crop_width = 0 + shift_height = 0 + shift_width = self.half_patch + else: + crop_height = 0 + crop_width = 0 + shift_height = self.half_patch + shift_width = self.half_patch + + # Crop the shifted images and pad them + crop = ops.image.crop_images( + images, + top_cropping=crop_height, + left_cropping=crop_width, + target_height=self.image_size - self.half_patch, + target_width=self.image_size - self.half_patch, + ) + shift_pad = ops.image.pad_images( + crop, + top_padding=shift_height, + left_padding=shift_width, + target_height=self.image_size, + target_width=self.image_size, + ) + return shift_pad + + def call(self, images): + if not self.vanilla: + # Concat the shifted images with the original image + images = ops.concatenate( + [ + images, + self.crop_shift_pad(images, mode="left-up"), + self.crop_shift_pad(images, mode="left-down"), + self.crop_shift_pad(images, mode="right-up"), + self.crop_shift_pad(images, mode="right-down"), + ], + axis=-1, + ) + # Patchify the images and flatten it + patches = ops.image.extract_patches( + images=images, + size=(self.patch_size, self.patch_size), + strides=[1, self.patch_size, self.patch_size, 1], + dilation_rate=1, + padding="VALID", + ) + flat_patches = self.flatten_patches(patches) + if not self.vanilla: + # Layer normalize the flat patches and linearly project it + tokens = self.layer_norm(flat_patches) + tokens = self.projection(tokens) + else: + # Linearly project the flat patches + tokens = self.projection(flat_patches) + return (tokens, patches) + + +""" +### Visualize the patches +""" + +# Get a random image from the training dataset +# and resize the image +image = x_train[np.random.choice(range(x_train.shape[0]))] +resized_image = ops.cast( + ops.image.resize(ops.convert_to_tensor([image]), size=(IMAGE_SIZE, IMAGE_SIZE)), + dtype="float32", +) + +# Vanilla patch maker: This takes an image and divides into +# patches as in the original ViT paper +(token, patch) = ShiftedPatchTokenization(vanilla=True)(resized_image / 255.0) +(token, patch) = (token[0], patch[0]) +n = patch.shape[0] +count = 1 +plt.figure(figsize=(4, 4)) +for row in range(n): + for col in range(n): + plt.subplot(n, n, count) + count = count + 1 + image = ops.reshape(patch[row][col], (PATCH_SIZE, PATCH_SIZE, 3)) + plt.imshow(image) + plt.axis("off") +plt.show() + +# Shifted Patch Tokenization: This layer takes the image, shifts it +# 
diagonally and then extracts patches from the concatinated images +(token, patch) = ShiftedPatchTokenization(vanilla=False)(resized_image / 255.0) +(token, patch) = (token[0], patch[0]) +n = patch.shape[0] +shifted_images = ["ORIGINAL", "LEFT-UP", "LEFT-DOWN", "RIGHT-UP", "RIGHT-DOWN"] +for index, name in enumerate(shifted_images): + print(name) + count = 1 + plt.figure(figsize=(4, 4)) + for row in range(n): + for col in range(n): + plt.subplot(n, n, count) + count = count + 1 + image = ops.reshape(patch[row][col], (PATCH_SIZE, PATCH_SIZE, 5 * 3)) + plt.imshow(image[..., 3 * index : 3 * index + 3]) + plt.axis("off") + plt.show() + +""" +## Implement the patch encoding layer + +This layer accepts projected patches and then adds positional +information to them. +""" + + +class PatchEncoder(layers.Layer): + def __init__( + self, num_patches=NUM_PATCHES, projection_dim=PROJECTION_DIM, **kwargs + ): + super().__init__(**kwargs) + self.num_patches = num_patches + self.position_embedding = layers.Embedding( + input_dim=num_patches, output_dim=projection_dim + ) + self.positions = ops.arange(start=0, stop=self.num_patches, step=1) + + def call(self, encoded_patches): + encoded_positions = self.position_embedding(self.positions) + encoded_patches = encoded_patches + encoded_positions + return encoded_patches + + +""" +## Implement Locality Self Attention + +The regular attention equation is stated below. + +| ![Equation of attention](https://miro.medium.com/max/396/1*P9sV1xXM10t943bXy_G9yg.png) | +| :--: | +| [Source](https://towardsdatascience.com/attention-is-all-you-need-discovering-the-transformer-paper-73e5ff5e0634) | + +The attention module takes a query, key, and value. First, we compute the +similarity between the query and key via a dot product. Then, the result +is scaled by the square root of the key dimension. The scaling prevents +the softmax function from having an overly small gradient. Softmax is then +applied to the scaled dot product to produce the attention weights. +The value is then modulated via the attention weights. + +In self-attention, query, key and value come from the same input. +The dot product would result in large self-token relations rather than +inter-token relations. This also means that the softmax gives higher +probabilities to self-token relations than the inter-token relations. +To combat this, the authors propose masking the diagonal of the dot product. +This way, we force the attention module to pay more attention to the +inter-token relations. + +The scaling factor is a constant in the regular attention module. +This acts like a temperature term that can modulate the softmax function. +The authors suggest a learnable temperature term instead of a constant. + +| ![Implementation of LSA](https://i.imgur.com/GTV99pk.png) | +| :--: | +| Locality Self Attention [Source](https://arxiv.org/abs/2112.13492v1) | + +The above two pointers make the Locality Self Attention. We have subclassed the +[`layers.MultiHeadAttention`](https://www.tensorflow.org/api_docs/python/tf/keras/layers/MultiHeadAttention) +and implemented the trainable temperature. The attention mask is built +at a later stage. +""" + + +class MultiHeadAttentionLSA(layers.MultiHeadAttention): + def __init__(self, **kwargs): + super().__init__(**kwargs) + # The trainable temperature term. The initial value is + # the square root of the key dimension. 
+ self.tau = keras.Variable(math.sqrt(float(self._key_dim)), trainable=True) + + def _compute_attention(self, query, key, value, attention_mask=None, training=None): + query = ops.multiply(query, 1.0 / self.tau) + attention_scores = ops.einsum(self._dot_product_equation, key, query) + attention_scores = self._masked_softmax(attention_scores, attention_mask) + attention_scores_dropout = self._dropout_layer( + attention_scores, training=training + ) + attention_output = ops.einsum( + self._combine_equation, attention_scores_dropout, value + ) + return attention_output, attention_scores + + +""" +## Implement the MLP +""" + + +def mlp(x, hidden_units, dropout_rate): + for units in hidden_units: + x = layers.Dense(units, activation="gelu")(x) + x = layers.Dropout(dropout_rate)(x) + return x + + +# Build the diagonal attention mask +diag_attn_mask = 1 - ops.eye(NUM_PATCHES) +diag_attn_mask = ops.cast([diag_attn_mask], dtype="int8") + +""" +## Build the ViT +""" + + +def create_vit_classifier(vanilla=False): + inputs = layers.Input(shape=INPUT_SHAPE) + # Augment data. + augmented = data_augmentation(inputs) + # Create patches. + (tokens, _) = ShiftedPatchTokenization(vanilla=vanilla)(augmented) + # Encode patches. + encoded_patches = PatchEncoder()(tokens) + + # Create multiple layers of the Transformer block. + for _ in range(TRANSFORMER_LAYERS): + # Layer normalization 1. + x1 = layers.LayerNormalization(epsilon=1e-6)(encoded_patches) + # Create a multi-head attention layer. + if not vanilla: + attention_output = MultiHeadAttentionLSA( + num_heads=NUM_HEADS, key_dim=PROJECTION_DIM, dropout=0.1 + )(x1, x1, attention_mask=diag_attn_mask) + else: + attention_output = layers.MultiHeadAttention( + num_heads=NUM_HEADS, key_dim=PROJECTION_DIM, dropout=0.1 + )(x1, x1) + # Skip connection 1. + x2 = layers.Add()([attention_output, encoded_patches]) + # Layer normalization 2. + x3 = layers.LayerNormalization(epsilon=1e-6)(x2) + # MLP. + x3 = mlp(x3, hidden_units=TRANSFORMER_UNITS, dropout_rate=0.1) + # Skip connection 2. + encoded_patches = layers.Add()([x3, x2]) + + # Create a [batch_size, projection_dim] tensor. + representation = layers.LayerNormalization(epsilon=1e-6)(encoded_patches) + representation = layers.Flatten()(representation) + representation = layers.Dropout(0.5)(representation) + # Add MLP. + features = mlp(representation, hidden_units=MLP_HEAD_UNITS, dropout_rate=0.5) + # Classify outputs. + logits = layers.Dense(NUM_CLASSES)(features) + # Create the Keras model. + model = keras.Model(inputs=inputs, outputs=logits) + return model + + +""" +## Compile, train, and evaluate the mode +""" + + +# Some code is taken from: +# https://www.kaggle.com/ashusma/training-rfcx-tensorflow-tpu-effnet-b2. 
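+# The schedule linearly warms the learning rate up from `warmup_learning_rate` to
+# `learning_rate_base` over `warmup_steps`, then decays it back to zero along a
+# cosine curve over the remaining training steps.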
+class WarmUpCosine(keras.optimizers.schedules.LearningRateSchedule): + def __init__( + self, learning_rate_base, total_steps, warmup_learning_rate, warmup_steps + ): + super().__init__() + + self.learning_rate_base = learning_rate_base + self.total_steps = total_steps + self.warmup_learning_rate = warmup_learning_rate + self.warmup_steps = warmup_steps + self.pi = ops.array(np.pi) + + def __call__(self, step): + if self.total_steps < self.warmup_steps: + raise ValueError("Total_steps must be larger or equal to warmup_steps.") + + cos_annealed_lr = ops.cos( + self.pi + * (ops.cast(step, dtype="float32") - self.warmup_steps) + / float(self.total_steps - self.warmup_steps) + ) + learning_rate = 0.5 * self.learning_rate_base * (1 + cos_annealed_lr) + + if self.warmup_steps > 0: + if self.learning_rate_base < self.warmup_learning_rate: + raise ValueError( + "Learning_rate_base must be larger or equal to " + "warmup_learning_rate." + ) + slope = ( + self.learning_rate_base - self.warmup_learning_rate + ) / self.warmup_steps + warmup_rate = ( + slope * ops.cast(step, dtype="float32") + self.warmup_learning_rate + ) + learning_rate = ops.where( + step < self.warmup_steps, warmup_rate, learning_rate + ) + return ops.where( + step > self.total_steps, 0.0, learning_rate, name="learning_rate" + ) + + +def run_experiment(model): + total_steps = int((len(x_train) / BATCH_SIZE) * EPOCHS) + warmup_epoch_percentage = 0.10 + warmup_steps = int(total_steps * warmup_epoch_percentage) + scheduled_lrs = WarmUpCosine( + learning_rate_base=LEARNING_RATE, + total_steps=total_steps, + warmup_learning_rate=0.0, + warmup_steps=warmup_steps, + ) + + optimizer = keras.optimizers.AdamW( + learning_rate=LEARNING_RATE, weight_decay=WEIGHT_DECAY + ) + + model.compile( + optimizer=optimizer, + loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True), + metrics=[ + keras.metrics.SparseCategoricalAccuracy(name="accuracy"), + keras.metrics.SparseTopKCategoricalAccuracy(5, name="top-5-accuracy"), + ], + ) + + history = model.fit( + x=x_train, + y=y_train, + batch_size=BATCH_SIZE, + epochs=EPOCHS, + validation_split=0.1, + ) + _, accuracy, top_5_accuracy = model.evaluate(x_test, y_test, batch_size=BATCH_SIZE) + print(f"Test accuracy: {round(accuracy * 100, 2)}%") + print(f"Test top 5 accuracy: {round(top_5_accuracy * 100, 2)}%") + + return history + + +# Run experiments with the vanilla ViT +vit = create_vit_classifier(vanilla=True) +history = run_experiment(vit) + +# Run experiments with the Shifted Patch Tokenization and +# Locality Self Attention modified ViT +vit_sl = create_vit_classifier(vanilla=False) +history = run_experiment(vit_sl) + +""" +# Final Notes + +With the help of Shifted Patch Tokenization and Locality Self Attention, +we were able to get ~**3-4%** top-1 accuracy gains on CIFAR100. + +The ideas on Shifted Patch Tokenization and Locality Self Attention +are very intuitive and easy to implement. The authors also ablates of +different shifting strategies for Shifted Patch Tokenization in the +supplementary of the paper. + +I would like to thank [Jarvislabs.ai](https://jarvislabs.ai/) for +generously helping with GPU credits. + +You can use the trained model hosted on [Hugging Face Hub](https://huggingface.co/keras-io/vit_small_ds_v2) +and try the demo on [Hugging Face Spaces](https://huggingface.co/spaces/keras-io/vit-small-ds). 
+""" diff --git a/knowledge_base/vision/vivit.py b/knowledge_base/vision/vivit.py new file mode 100644 index 0000000000000000000000000000000000000000..116bddf14229548c31e897f1c18c408c3d81f868 --- /dev/null +++ b/knowledge_base/vision/vivit.py @@ -0,0 +1,447 @@ +""" +Title: Video Vision Transformer +Author: [Aritra Roy Gosthipaty](https://twitter.com/ariG23498), [Ayush Thakur](https://twitter.com/ayushthakur0) (equal contribution) +Date created: 2022/01/12 +Last modified: 2024/01/15 +Description: A Transformer-based architecture for video classification. +Accelerator: GPU +""" + +""" +## Introduction + +Videos are sequences of images. Let's assume you have an image +representation model (CNN, ViT, etc.) and a sequence model +(RNN, LSTM, etc.) at hand. We ask you to tweak the model for video +classification. The simplest approach would be to apply the image +model to individual frames, use the sequence model to learn +sequences of image features, then apply a classification head on +the learned sequence representation. +The Keras example +[Video Classification with a CNN-RNN Architecture](https://keras.io/examples/vision/video_classification/) +explains this approach in detail. Alernatively, you can also +build a hybrid Transformer-based model for video classification as shown in the Keras example +[Video Classification with Transformers](https://keras.io/examples/vision/video_transformers/). + +In this example, we minimally implement +[ViViT: A Video Vision Transformer](https://arxiv.org/abs/2103.15691) +by Arnab et al., a **pure Transformer-based** model +for video classification. The authors propose a novel embedding scheme +and a number of Transformer variants to model video clips. We implement +the embedding scheme and one of the variants of the Transformer +architecture, for simplicity. + +This example requires `medmnist` package, which can be installed +by running the code cell below. +""" + +"""shell +pip install -qq medmnist +""" + +""" +## Imports +""" + +import os +import io +import imageio +import medmnist +import ipywidgets +import numpy as np +import tensorflow as tf # for data preprocessing only +import keras +from keras import layers, ops + +# Setting seed for reproducibility +SEED = 42 +os.environ["TF_CUDNN_DETERMINISTIC"] = "1" +keras.utils.set_random_seed(SEED) + +""" +## Hyperparameters + +The hyperparameters are chosen via hyperparameter +search. You can learn more about the process in the "conclusion" section. +""" + +# DATA +DATASET_NAME = "organmnist3d" +BATCH_SIZE = 32 +AUTO = tf.data.AUTOTUNE +INPUT_SHAPE = (28, 28, 28, 1) +NUM_CLASSES = 11 + +# OPTIMIZER +LEARNING_RATE = 1e-4 +WEIGHT_DECAY = 1e-5 + +# TRAINING +EPOCHS = 60 + +# TUBELET EMBEDDING +PATCH_SIZE = (8, 8, 8) +NUM_PATCHES = (INPUT_SHAPE[0] // PATCH_SIZE[0]) ** 2 + +# ViViT ARCHITECTURE +LAYER_NORM_EPS = 1e-6 +PROJECTION_DIM = 128 +NUM_HEADS = 8 +NUM_LAYERS = 8 + +""" +## Dataset + +For our example we use the +[MedMNIST v2: A Large-Scale Lightweight Benchmark for 2D and 3D Biomedical Image Classification](https://medmnist.com/) +dataset. The videos are lightweight and easy to train on. +""" + + +def download_and_prepare_dataset(data_info: dict): + """Utility function to download the dataset. + + Arguments: + data_info (dict): Dataset metadata. 
+ """ + data_path = keras.utils.get_file(origin=data_info["url"], md5_hash=data_info["MD5"]) + + with np.load(data_path) as data: + # Get videos + train_videos = data["train_images"] + valid_videos = data["val_images"] + test_videos = data["test_images"] + + # Get labels + train_labels = data["train_labels"].flatten() + valid_labels = data["val_labels"].flatten() + test_labels = data["test_labels"].flatten() + + return ( + (train_videos, train_labels), + (valid_videos, valid_labels), + (test_videos, test_labels), + ) + + +# Get the metadata of the dataset +info = medmnist.INFO[DATASET_NAME] + +# Get the dataset +prepared_dataset = download_and_prepare_dataset(info) +(train_videos, train_labels) = prepared_dataset[0] +(valid_videos, valid_labels) = prepared_dataset[1] +(test_videos, test_labels) = prepared_dataset[2] + +""" +### `tf.data` pipeline +""" + + +def preprocess(frames: tf.Tensor, label: tf.Tensor): + """Preprocess the frames tensors and parse the labels.""" + # Preprocess images + frames = tf.image.convert_image_dtype( + frames[ + ..., tf.newaxis + ], # The new axis is to help for further processing with Conv3D layers + tf.float32, + ) + # Parse label + label = tf.cast(label, tf.float32) + return frames, label + + +def prepare_dataloader( + videos: np.ndarray, + labels: np.ndarray, + loader_type: str = "train", + batch_size: int = BATCH_SIZE, +): + """Utility function to prepare the dataloader.""" + dataset = tf.data.Dataset.from_tensor_slices((videos, labels)) + + if loader_type == "train": + dataset = dataset.shuffle(BATCH_SIZE * 2) + + dataloader = ( + dataset.map(preprocess, num_parallel_calls=tf.data.AUTOTUNE) + .batch(batch_size) + .prefetch(tf.data.AUTOTUNE) + ) + return dataloader + + +trainloader = prepare_dataloader(train_videos, train_labels, "train") +validloader = prepare_dataloader(valid_videos, valid_labels, "valid") +testloader = prepare_dataloader(test_videos, test_labels, "test") + +""" +## Tubelet Embedding + +In ViTs, an image is divided into patches, which are then spatially +flattened, a process known as tokenization. For a video, one can +repeat this process for individual frames. **Uniform frame sampling** +as suggested by the authors is a tokenization scheme in which we +sample frames from the video clip and perform simple ViT tokenization. + +| ![uniform frame sampling](https://i.imgur.com/aaPyLPX.png) | +| :--: | +| Uniform Frame Sampling [Source](https://arxiv.org/abs/2103.15691) | + +**Tubelet Embedding** is different in terms of capturing temporal +information from the video. +First, we extract volumes from the video -- these volumes contain +patches of the frame and the temporal information as well. The volumes +are then flattened to build video tokens. + +| ![tubelet embedding](https://i.imgur.com/9G7QTfV.png) | +| :--: | +| Tubelet Embedding [Source](https://arxiv.org/abs/2103.15691) | +""" + + +class TubeletEmbedding(layers.Layer): + def __init__(self, embed_dim, patch_size, **kwargs): + super().__init__(**kwargs) + self.projection = layers.Conv3D( + filters=embed_dim, + kernel_size=patch_size, + strides=patch_size, + padding="VALID", + ) + self.flatten = layers.Reshape(target_shape=(-1, embed_dim)) + + def call(self, videos): + projected_patches = self.projection(videos) + flattened_patches = self.flatten(projected_patches) + return flattened_patches + + +""" +## Positional Embedding + +This layer adds positional information to the encoded video tokens. 
+""" + + +class PositionalEncoder(layers.Layer): + def __init__(self, embed_dim, **kwargs): + super().__init__(**kwargs) + self.embed_dim = embed_dim + + def build(self, input_shape): + _, num_tokens, _ = input_shape + self.position_embedding = layers.Embedding( + input_dim=num_tokens, output_dim=self.embed_dim + ) + self.positions = ops.arange(0, num_tokens, 1) + + def call(self, encoded_tokens): + # Encode the positions and add it to the encoded tokens + encoded_positions = self.position_embedding(self.positions) + encoded_tokens = encoded_tokens + encoded_positions + return encoded_tokens + + +""" +## Video Vision Transformer + +The authors suggest 4 variants of Vision Transformer: + +- Spatio-temporal attention +- Factorized encoder +- Factorized self-attention +- Factorized dot-product attention + +In this example, we will implement the **Spatio-temporal attention** +model for simplicity. The following code snippet is heavily inspired from +[Image classification with Vision Transformer](https://keras.io/examples/vision/image_classification_with_vision_transformer/). +One can also refer to the +[official repository of ViViT](https://github.com/google-research/scenic/tree/main/scenic/projects/vivit) +which contains all the variants, implemented in JAX. +""" + + +def create_vivit_classifier( + tubelet_embedder, + positional_encoder, + input_shape=INPUT_SHAPE, + transformer_layers=NUM_LAYERS, + num_heads=NUM_HEADS, + embed_dim=PROJECTION_DIM, + layer_norm_eps=LAYER_NORM_EPS, + num_classes=NUM_CLASSES, +): + # Get the input layer + inputs = layers.Input(shape=input_shape) + # Create patches. + patches = tubelet_embedder(inputs) + # Encode patches. + encoded_patches = positional_encoder(patches) + + # Create multiple layers of the Transformer block. + for _ in range(transformer_layers): + # Layer normalization and MHSA + x1 = layers.LayerNormalization(epsilon=1e-6)(encoded_patches) + attention_output = layers.MultiHeadAttention( + num_heads=num_heads, key_dim=embed_dim // num_heads, dropout=0.1 + )(x1, x1) + + # Skip connection + x2 = layers.Add()([attention_output, encoded_patches]) + + # Layer Normalization and MLP + x3 = layers.LayerNormalization(epsilon=1e-6)(x2) + x3 = keras.Sequential( + [ + layers.Dense(units=embed_dim * 4, activation=ops.gelu), + layers.Dense(units=embed_dim, activation=ops.gelu), + ] + )(x3) + + # Skip connection + encoded_patches = layers.Add()([x3, x2]) + + # Layer normalization and Global average pooling. + representation = layers.LayerNormalization(epsilon=layer_norm_eps)(encoded_patches) + representation = layers.GlobalAvgPool1D()(representation) + + # Classify outputs. + outputs = layers.Dense(units=num_classes, activation="softmax")(representation) + + # Create the Keras model. + model = keras.Model(inputs=inputs, outputs=outputs) + return model + + +""" +## Train +""" + + +def run_experiment(): + # Initialize model + model = create_vivit_classifier( + tubelet_embedder=TubeletEmbedding( + embed_dim=PROJECTION_DIM, patch_size=PATCH_SIZE + ), + positional_encoder=PositionalEncoder(embed_dim=PROJECTION_DIM), + ) + + # Compile the model with the optimizer, loss function + # and the metrics. + optimizer = keras.optimizers.Adam(learning_rate=LEARNING_RATE) + model.compile( + optimizer=optimizer, + loss="sparse_categorical_crossentropy", + metrics=[ + keras.metrics.SparseCategoricalAccuracy(name="accuracy"), + keras.metrics.SparseTopKCategoricalAccuracy(5, name="top-5-accuracy"), + ], + ) + + # Train the model. 
+ _ = model.fit(trainloader, epochs=EPOCHS, validation_data=validloader) + + _, accuracy, top_5_accuracy = model.evaluate(testloader) + print(f"Test accuracy: {round(accuracy * 100, 2)}%") + print(f"Test top 5 accuracy: {round(top_5_accuracy * 100, 2)}%") + + return model + + +model = run_experiment() + +""" +## Inference +""" + +NUM_SAMPLES_VIZ = 25 +testsamples, labels = next(iter(testloader)) +testsamples, labels = testsamples[:NUM_SAMPLES_VIZ], labels[:NUM_SAMPLES_VIZ] + +ground_truths = [] +preds = [] +videos = [] + +for i, (testsample, label) in enumerate(zip(testsamples, labels)): + # Generate gif + testsample = np.reshape(testsample.numpy(), (-1, 28, 28)) + with io.BytesIO() as gif: + imageio.mimsave(gif, (testsample * 255).astype("uint8"), "GIF", fps=5) + videos.append(gif.getvalue()) + + # Get model prediction + output = model.predict(ops.expand_dims(testsample, axis=0))[0] + pred = np.argmax(output, axis=0) + + ground_truths.append(label.numpy().astype("int")) + preds.append(pred) + + +def make_box_for_grid(image_widget, fit): + """Make a VBox to hold caption/image for demonstrating option_fit values. + + Source: https://ipywidgets.readthedocs.io/en/latest/examples/Widget%20Styling.html + """ + # Make the caption + if fit is not None: + fit_str = "'{}'".format(fit) + else: + fit_str = str(fit) + + h = ipywidgets.HTML(value="" + str(fit_str) + "") + + # Make the green box with the image widget inside it + boxb = ipywidgets.widgets.Box() + boxb.children = [image_widget] + + # Compose into a vertical box + vb = ipywidgets.widgets.VBox() + vb.layout.align_items = "center" + vb.children = [h, boxb] + return vb + + +boxes = [] +for i in range(NUM_SAMPLES_VIZ): + ib = ipywidgets.widgets.Image(value=videos[i], width=100, height=100) + true_class = info["label"][str(ground_truths[i])] + pred_class = info["label"][str(preds[i])] + caption = f"T: {true_class} | P: {pred_class}" + + boxes.append(make_box_for_grid(ib, caption)) + +ipywidgets.widgets.GridBox( + boxes, layout=ipywidgets.widgets.Layout(grid_template_columns="repeat(5, 200px)") +) + +""" +## Final thoughts + +With a vanilla implementation, we achieve ~79-80% Top-1 accuracy on the +test dataset. + +The hyperparameters used in this tutorial were finalized by running a +hyperparameter search using +[W&B Sweeps](https://docs.wandb.ai/guides/sweeps). +You can find out our sweeps result +[here](https://wandb.ai/minimal-implementations/vivit/sweeps/66fp0lhz) +and our quick analysis of the results +[here](https://wandb.ai/minimal-implementations/vivit/reports/Hyperparameter-Tuning-Analysis--VmlldzoxNDEwNzcx). + +For further improvement, you could look into the following: + +- Using data augmentation for videos. +- Using a better regularization scheme for training. +- Apply different variants of the transformer model as in the paper. + +We would like to thank [Anurag Arnab](https://anuragarnab.github.io/) +(first author of ViViT) for helpful discussion. We are grateful to +[Weights and Biases](https://wandb.ai/site) program for helping with +GPU credits. + +You can use the trained model hosted on [Hugging Face Hub](https://huggingface.co/keras-io/video-vision-transformer) +and try the demo on [Hugging Face Spaces](https://huggingface.co/spaces/keras-io/video-vision-transformer-CT). 
+""" diff --git a/knowledge_base/vision/xray_classification_with_tpus.py b/knowledge_base/vision/xray_classification_with_tpus.py new file mode 100644 index 0000000000000000000000000000000000000000..f9f0d502142fefeca3fe3d43b7aa883dccc1dd68 --- /dev/null +++ b/knowledge_base/vision/xray_classification_with_tpus.py @@ -0,0 +1,441 @@ +""" +Title: Pneumonia Classification on TPU +Author: Amy MiHyun Jang +Date created: 2020/07/28 +Last modified: 2020/08/24 +Description: Medical image classification on TPU. +Accelerator: TPU +""" + +""" +## Introduction + Set-up + +This tutorial will explain how to build an X-ray image classification model +to predict whether an X-ray scan shows presence of pneumonia. +""" + +import re +import os +import random +import numpy as np +import pandas as pd +import tensorflow as tf +import matplotlib.pyplot as plt + +try: + tpu = tf.distribute.cluster_resolver.TPUClusterResolver.connect() + print("Device:", tpu.master()) + strategy = tf.distribute.TPUStrategy(tpu) +except: + strategy = tf.distribute.get_strategy() +print("Number of replicas:", strategy.num_replicas_in_sync) + +""" +We need a Google Cloud link to our data to load the data using a TPU. +Below, we define key configuration parameters we'll use in this example. +To run on TPU, this example must be on Colab with the TPU runtime selected. +""" + +AUTOTUNE = tf.data.AUTOTUNE +BATCH_SIZE = 25 * strategy.num_replicas_in_sync +IMAGE_SIZE = [180, 180] +CLASS_NAMES = ["NORMAL", "PNEUMONIA"] + +""" +## Load the data + +The Chest X-ray data we are using from +[*Cell*](https://www.cell.com/cell/fulltext/S0092-8674(18)30154-5) divides the data into +training and test files. Let's first load in the training TFRecords. +""" + +train_images = tf.data.TFRecordDataset( + "gs://download.tensorflow.org/data/ChestXRay2017/train/images.tfrec" +) +train_paths = tf.data.TFRecordDataset( + "gs://download.tensorflow.org/data/ChestXRay2017/train/paths.tfrec" +) + +ds = tf.data.Dataset.zip((train_images, train_paths)) + +""" +Let's count how many healthy/normal chest X-rays we have and how many +pneumonia chest X-rays we have: +""" + +COUNT_NORMAL = len( + [ + filename + for filename in train_paths + if "NORMAL" in filename.numpy().decode("utf-8") + ] +) +print("Normal images count in training set: " + str(COUNT_NORMAL)) + +COUNT_PNEUMONIA = len( + [ + filename + for filename in train_paths + if "PNEUMONIA" in filename.numpy().decode("utf-8") + ] +) +print("Pneumonia images count in training set: " + str(COUNT_PNEUMONIA)) + +""" +Notice that there are way more images that are classified as pneumonia than normal. This +shows that we have an imbalance in our data. We will correct for this imbalance later on +in our notebook. +""" + +""" +We want to map each filename to the corresponding (image, label) pair. The following +methods will help us do that. + +As we only have two labels, we will encode the label so that `1` or `True` indicates +pneumonia and `0` or `False` indicates normal. +""" + + +def get_label(file_path): + # convert the path to a list of path components + parts = tf.strings.split(file_path, "/") + # The second to last is the class-directory + if parts[-2] == "PNEUMONIA": + return 1 + else: + return 0 + + +def decode_img(img): + # convert the compressed string to a 3D uint8 tensor + img = tf.image.decode_jpeg(img, channels=3) + # resize the image to the desired size. 
+ return tf.image.resize(img, IMAGE_SIZE) + + +def process_path(image, path): + label = get_label(path) + # load the raw data from the file as a string + img = decode_img(image) + return img, label + + +ds = ds.map(process_path, num_parallel_calls=AUTOTUNE) + +""" +Let's split the data into a training and validation datasets. +""" + +ds = ds.shuffle(10000) +train_ds = ds.take(4200) +val_ds = ds.skip(4200) + +""" +Let's visualize the shape of an (image, label) pair. +""" + +for image, label in train_ds.take(1): + print("Image shape: ", image.numpy().shape) + print("Label: ", label.numpy()) + +""" +Load and format the test data as well. +""" + +test_images = tf.data.TFRecordDataset( + "gs://download.tensorflow.org/data/ChestXRay2017/test/images.tfrec" +) +test_paths = tf.data.TFRecordDataset( + "gs://download.tensorflow.org/data/ChestXRay2017/test/paths.tfrec" +) +test_ds = tf.data.Dataset.zip((test_images, test_paths)) + +test_ds = test_ds.map(process_path, num_parallel_calls=AUTOTUNE) +test_ds = test_ds.batch(BATCH_SIZE) + +""" +## Visualize the dataset + +First, let's use buffered prefetching so we can yield data from disk without having I/O +become blocking. + +Please note that large image datasets should not be cached in memory. We do it here +because the dataset is not very large and we want to train on TPU. +""" + + +def prepare_for_training(ds, cache=True): + # This is a small dataset, only load it once, and keep it in memory. + # use `.cache(filename)` to cache preprocessing work for datasets that don't + # fit in memory. + if cache: + if isinstance(cache, str): + ds = ds.cache(cache) + else: + ds = ds.cache() + + ds = ds.batch(BATCH_SIZE) + + # `prefetch` lets the dataset fetch batches in the background while the model + # is training. + ds = ds.prefetch(buffer_size=AUTOTUNE) + + return ds + + +""" +Call the next batch iteration of the training data. +""" + +train_ds = prepare_for_training(train_ds) +val_ds = prepare_for_training(val_ds) + +image_batch, label_batch = next(iter(train_ds)) + +""" +Define the method to show the images in the batch. +""" + + +def show_batch(image_batch, label_batch): + plt.figure(figsize=(10, 10)) + for n in range(25): + ax = plt.subplot(5, 5, n + 1) + plt.imshow(image_batch[n] / 255) + if label_batch[n]: + plt.title("PNEUMONIA") + else: + plt.title("NORMAL") + plt.axis("off") + + +""" +As the method takes in NumPy arrays as its parameters, call the numpy function on the +batches to return the tensor in NumPy array form. +""" + +show_batch(image_batch.numpy(), label_batch.numpy()) + +""" +## Build the CNN + +To make our model more modular and easier to understand, let's define some blocks. As +we're building a convolution neural network, we'll create a convolution block and a dense +layer block. + +The architecture for this CNN has been inspired by this +[article](https://towardsdatascience.com/deep-learning-for-detecting-pneumonia-from-x-ray-images-fc9a3d9fdba8). 
+""" + +import os + +os.environ["KERAS_BACKEND"] = "tensorflow" + +import keras +from keras import layers + + +def conv_block(filters, inputs): + x = layers.SeparableConv2D(filters, 3, activation="relu", padding="same")(inputs) + x = layers.SeparableConv2D(filters, 3, activation="relu", padding="same")(x) + x = layers.BatchNormalization()(x) + outputs = layers.MaxPool2D()(x) + + return outputs + + +def dense_block(units, dropout_rate, inputs): + x = layers.Dense(units, activation="relu")(inputs) + x = layers.BatchNormalization()(x) + outputs = layers.Dropout(dropout_rate)(x) + + return outputs + + +""" +The following method will define the function to build our model for us. + +The images originally have values that range from [0, 255]. CNNs work better with smaller +numbers so we will scale this down for our input. + +The Dropout layers are important, as they +reduce the likelikhood of the model overfitting. We want to end the model with a `Dense` +layer with one node, as this will be the binary output that determines if an X-ray shows +presence of pneumonia. +""" + + +def build_model(): + inputs = keras.Input(shape=(IMAGE_SIZE[0], IMAGE_SIZE[1], 3)) + x = layers.Rescaling(1.0 / 255)(inputs) + x = layers.Conv2D(16, 3, activation="relu", padding="same")(x) + x = layers.Conv2D(16, 3, activation="relu", padding="same")(x) + x = layers.MaxPool2D()(x) + + x = conv_block(32, x) + x = conv_block(64, x) + + x = conv_block(128, x) + x = layers.Dropout(0.2)(x) + + x = conv_block(256, x) + x = layers.Dropout(0.2)(x) + + x = layers.Flatten()(x) + x = dense_block(512, 0.7, x) + x = dense_block(128, 0.5, x) + x = dense_block(64, 0.3, x) + + outputs = layers.Dense(1, activation="sigmoid")(x) + + model = keras.Model(inputs=inputs, outputs=outputs) + return model + + +""" +## Correct for data imbalance + +We saw earlier in this example that the data was imbalanced, with more images classified +as pneumonia than normal. We will correct for that by using class weighting: +""" + +initial_bias = np.log([COUNT_PNEUMONIA / COUNT_NORMAL]) +print("Initial bias: {:.5f}".format(initial_bias[0])) + +TRAIN_IMG_COUNT = COUNT_NORMAL + COUNT_PNEUMONIA +weight_for_0 = (1 / COUNT_NORMAL) * (TRAIN_IMG_COUNT) / 2.0 +weight_for_1 = (1 / COUNT_PNEUMONIA) * (TRAIN_IMG_COUNT) / 2.0 + +class_weight = {0: weight_for_0, 1: weight_for_1} + +print("Weight for class 0: {:.2f}".format(weight_for_0)) +print("Weight for class 1: {:.2f}".format(weight_for_1)) + +""" +The weight for class `0` (Normal) is a lot higher than the weight for class `1` +(Pneumonia). Because there are less normal images, each normal image will be weighted +more to balance the data as the CNN works best when the training data is balanced. +""" + +""" +## Train the model +""" + +""" +### Defining callbacks + +The checkpoint callback saves the best weights of the model, so next time we want to use +the model, we do not have to spend time training it. The early stopping callback stops +the training process when the model starts becoming stagnant, or even worse, when the +model starts overfitting. +""" + +checkpoint_cb = keras.callbacks.ModelCheckpoint("xray_model.keras", save_best_only=True) + +early_stopping_cb = keras.callbacks.EarlyStopping( + patience=10, restore_best_weights=True +) + +""" +We also want to tune our learning rate. Too high of a learning rate will cause the model +to diverge. Too small of a learning rate will cause the model to be too slow. We +implement the exponential learning rate scheduling method below. 
+""" + +initial_learning_rate = 0.015 +lr_schedule = keras.optimizers.schedules.ExponentialDecay( + initial_learning_rate, decay_steps=100000, decay_rate=0.96, staircase=True +) + +""" +### Fit the model + +For our metrics, we want to include precision and recall as they will provide use with a +more informed picture of how good our model is. Accuracy tells us what fraction of the +labels is correct. Since our data is not balanced, accuracy might give a skewed sense of +a good model (i.e. a model that always predicts PNEUMONIA will be 74% accurate but is not +a good model). + +Precision is the number of true positives (TP) over the sum of TP and false positives +(FP). It shows what fraction of labeled positives are actually correct. + +Recall is the number of TP over the sum of TP and false negatves (FN). It shows what +fraction of actual positives are correct. + +Since there are only two possible labels for the image, we will be using the +binary crossentropy loss. When we fit the model, remember to specify the class weights, +which we defined earlier. Because we are using a TPU, training will be quick - less than +2 minutes. +""" + +with strategy.scope(): + model = build_model() + + METRICS = [ + keras.metrics.BinaryAccuracy(), + keras.metrics.Precision(name="precision"), + keras.metrics.Recall(name="recall"), + ] + model.compile( + optimizer=keras.optimizers.Adam(learning_rate=lr_schedule), + loss="binary_crossentropy", + metrics=METRICS, + ) + +history = model.fit( + train_ds, + epochs=100, + validation_data=val_ds, + class_weight=class_weight, + callbacks=[checkpoint_cb, early_stopping_cb], +) + +""" +## Visualizing model performance + +Let's plot the model accuracy and loss for the training and the validating set. Note that +no random seed is specified for this notebook. For your notebook, there might be slight +variance. +""" + +fig, ax = plt.subplots(1, 4, figsize=(20, 3)) +ax = ax.ravel() + +for i, met in enumerate(["precision", "recall", "binary_accuracy", "loss"]): + ax[i].plot(history.history[met]) + ax[i].plot(history.history["val_" + met]) + ax[i].set_title("Model {}".format(met)) + ax[i].set_xlabel("epochs") + ax[i].set_ylabel(met) + ax[i].legend(["train", "val"]) + +""" +We see that the accuracy for our model is around 95%. +""" + +""" +## Predict and evaluate results + +Let's evaluate the model on our test data! +""" + +model.evaluate(test_ds, return_dict=True) + +""" +We see that our accuracy on our test data is lower than the accuracy for our validating +set. This may indicate overfitting. + +Our recall is greater than our precision, indicating that almost all pneumonia images are +correctly identified but some normal images are falsely identified. We should aim to +increase our precision. 
+""" + +for image, label in test_ds.take(1): + plt.imshow(image[0] / 255.0) + plt.title(CLASS_NAMES[label[0].numpy()]) + +prediction = model.predict(test_ds.take(1))[0] +scores = [1 - prediction, prediction] + +for score, name in zip(scores, CLASS_NAMES): + print("This image is %.2f percent %s" % ((100 * score), name)) diff --git a/knowledge_base/vision/yolov8.py b/knowledge_base/vision/yolov8.py new file mode 100644 index 0000000000000000000000000000000000000000..8e70a929e6baedb447fc60ae137c0f8bdf2ddcee --- /dev/null +++ b/knowledge_base/vision/yolov8.py @@ -0,0 +1,631 @@ +""" +Title: Efficient Object Detection with YOLOV8 and KerasCV +Author: [Gitesh Chawda](https://twitter.com/gitesh12_) +Date created: 2023/06/26 +Last modified: 2023/06/26 +Description: Train custom YOLOV8 object detection model with KerasCV. +Accelerator: GPU +""" + +""" +## Introduction +""" + +""" +KerasCV is an extension of Keras for computer vision tasks. In this example, we'll see +how to train a YOLOV8 object detection model using KerasCV. + +KerasCV includes pre-trained models for popular computer vision datasets, such as +ImageNet, COCO, and Pascal VOC, which can be used for transfer learning. KerasCV also +provides a range of visualization tools for inspecting the intermediate representations +learned by the model and for visualizing the results of object detection and segmentation +tasks. +""" + +""" +If you're interested in learning about object detection using KerasCV, I highly suggest +taking a look at the guide created by lukewood. This resource, available at +[Object Detection With KerasCV](https://keras.io/guides/keras_cv/object_detection_keras_cv/#object-detection-introduction), +provides a comprehensive overview of the fundamental concepts and techniques +required for building object detection models with KerasCV. +""" + +"""shell +pip install --upgrade git+https://github.com/keras-team/keras-cv -q +""" + +""" +## Setup +""" + +import os +from tqdm.auto import tqdm +import xml.etree.ElementTree as ET + +import tensorflow as tf +from tensorflow import keras + +import keras_cv +from keras_cv import bounding_box +from keras_cv import visualization + +""" +## Load Data +""" + +""" +For this guide, we will be utilizing the Self-Driving Car Dataset obtained from +[roboflow](https://public.roboflow.com/object-detection/self-driving-car). In order to +make the dataset more manageable, I have extracted a subset of the larger dataset, which +originally consisted of 15,000 data samples. From this subset, I have chosen 7,316 +samples for model training. + +To simplify the task at hand and focus our efforts, we will be working with a reduced +number of object classes. Specifically, we will be considering five primary classes for +detection and classification: car, pedestrian, traffic light, biker, and truck. These +classes represent some of the most common and significant objects encountered in the +context of self-driving cars. + +By narrowing down the dataset to these specific classes, we can concentrate on building a +robust object detection model that can accurately identify and classify these important +objects. +""" + +""" +The TensorFlow Datasets library provides a convenient way to download and use various +datasets, including the object detection dataset. This can be a great option for those +who want to quickly start working with the data without having to manually download and +preprocess it. 
+ +You can view various object detection datasets here +[TensorFlow Datasets](https://www.tensorflow.org/datasets/catalog/overview#object_detection) + +However, in this code example, we will demonstrate how to load the dataset from scratch +using TensorFlow's `tf.data` pipeline. This approach provides more flexibility and allows +you to customize the preprocessing steps as needed. + +Loading custom datasets that are not available in the TensorFlow Datasets library is one +of the main advantages of using the `tf.data` pipeline. This approach allows you to +create a custom data preprocessing pipeline tailored to the specific needs and +requirements of your dataset. +""" + +""" +## Hyperparameters +""" + +SPLIT_RATIO = 0.2 +BATCH_SIZE = 4 +LEARNING_RATE = 0.001 +EPOCH = 5 +GLOBAL_CLIPNORM = 10.0 + +""" +A dictionary is created to map each class name to a unique numerical identifier. This +mapping is used to encode and decode the class labels during training and inference in +object detection tasks. +""" + +class_ids = [ + "car", + "pedestrian", + "trafficLight", + "biker", + "truck", +] +class_mapping = dict(zip(range(len(class_ids)), class_ids)) + +# Path to images and annotations +path_images = "/kaggle/input/dataset/data/images/" +path_annot = "/kaggle/input/dataset/data/annotations/" + +# Get all XML file paths in path_annot and sort them +xml_files = sorted( + [ + os.path.join(path_annot, file_name) + for file_name in os.listdir(path_annot) + if file_name.endswith(".xml") + ] +) + +# Get all JPEG image file paths in path_images and sort them +jpg_files = sorted( + [ + os.path.join(path_images, file_name) + for file_name in os.listdir(path_images) + if file_name.endswith(".jpg") + ] +) + +""" +The function below reads the XML file and finds the image name and path, and then +iterates over each object in the XML file to extract the bounding box coordinates and +class labels for each object. + +The function returns three values: the image path, a list of bounding boxes (each +represented as a list of four floats: xmin, ymin, xmax, ymax), and a list of class IDs +(represented as integers) corresponding to each bounding box. The class IDs are obtained +by mapping the class labels to integer values using a dictionary called `class_mapping`. +""" + + +def parse_annotation(xml_file): + tree = ET.parse(xml_file) + root = tree.getroot() + + image_name = root.find("filename").text + image_path = os.path.join(path_images, image_name) + + boxes = [] + classes = [] + for obj in root.iter("object"): + cls = obj.find("name").text + classes.append(cls) + + bbox = obj.find("bndbox") + xmin = float(bbox.find("xmin").text) + ymin = float(bbox.find("ymin").text) + xmax = float(bbox.find("xmax").text) + ymax = float(bbox.find("ymax").text) + boxes.append([xmin, ymin, xmax, ymax]) + + class_ids = [ + list(class_mapping.keys())[list(class_mapping.values()).index(cls)] + for cls in classes + ] + return image_path, boxes, class_ids + + +image_paths = [] +bbox = [] +classes = [] +for xml_file in tqdm(xml_files): + image_path, boxes, class_ids = parse_annotation(xml_file) + image_paths.append(image_path) + bbox.append(boxes) + classes.append(class_ids) + +""" +Here we are using `tf.ragged.constant` to create ragged tensors from the `bbox` and +`classes` lists. A ragged tensor is a type of tensor that can handle varying lengths of +data along one or more dimensions. This is useful when dealing with data that has +variable-length sequences, such as text or time series data. 
+ +```python +classes = [ + [8, 8, 8, 8, 8], # 5 classes + [12, 14, 14, 14], # 4 classes + [1], # 1 class + [7, 7], # 2 classes + ...] +``` + +```python +bbox = [ + [[199.0, 19.0, 390.0, 401.0], + [217.0, 15.0, 270.0, 157.0], + [393.0, 18.0, 432.0, 162.0], + [1.0, 15.0, 226.0, 276.0], + [19.0, 95.0, 458.0, 443.0]], #image 1 has 4 objects + [[52.0, 117.0, 109.0, 177.0]], #image 2 has 1 object + [[88.0, 87.0, 235.0, 322.0], + [113.0, 117.0, 218.0, 471.0]], #image 3 has 2 objects + ...] +``` + +In this case, the `bbox` and `classes` lists have different lengths for each image, +depending on the number of objects in the image and the corresponding bounding boxes and +classes. To handle this variability, ragged tensors are used instead of regular tensors. + +Later, these ragged tensors are used to create a `tf.data.Dataset` using the +`from_tensor_slices` method. This method creates a dataset from the input tensors by +slicing them along the first dimension. By using ragged tensors, the dataset can handle +varying lengths of data for each image and provide a flexible input pipeline for further +processing. +""" + +bbox = tf.ragged.constant(bbox) +classes = tf.ragged.constant(classes) +image_paths = tf.ragged.constant(image_paths) + +data = tf.data.Dataset.from_tensor_slices((image_paths, classes, bbox)) + +""" +Splitting data in training and validation data +""" + +# Determine the number of validation samples +num_val = int(len(xml_files) * SPLIT_RATIO) + +# Split the dataset into train and validation sets +val_data = data.take(num_val) +train_data = data.skip(num_val) + +""" +Let's see about data loading and bounding box formatting to get things going. Bounding +boxes in KerasCV have a predetermined format. To do this, you must bundle your bounding +boxes into a dictionary that complies with the requirements listed below: + +```python +bounding_boxes = { + # num_boxes may be a Ragged dimension + 'boxes': Tensor(shape=[batch, num_boxes, 4]), + 'classes': Tensor(shape=[batch, num_boxes]) +} +``` + +The dictionary has two keys, `'boxes'` and `'classes'`, each of which maps to a +TensorFlow RaggedTensor or Tensor object. The `'boxes'` Tensor has a shape of `[batch, +num_boxes, 4]`, where batch is the number of images in the batch and num_boxes is the +maximum number of bounding boxes in any image. The 4 represents the four values needed to +define a bounding box: xmin, ymin, xmax, ymax. + +The `'classes'` Tensor has a shape of `[batch, num_boxes]`, where each element represents +the class label for the corresponding bounding box in the `'boxes'` Tensor. The num_boxes +dimension may be ragged, which means that the number of boxes may vary across images in +the batch. + +Final dict should be: +```python +{"images": images, "bounding_boxes": bounding_boxes} +``` +""" + + +def load_image(image_path): + image = tf.io.read_file(image_path) + image = tf.image.decode_jpeg(image, channels=3) + return image + + +def load_dataset(image_path, classes, bbox): + # Read Image + image = load_image(image_path) + bounding_boxes = { + "classes": tf.cast(classes, dtype=tf.float32), + "boxes": bbox, + } + return {"images": tf.cast(image, tf.float32), "bounding_boxes": bounding_boxes} + + +""" +Here we create a layer that resizes images to 640x640 pixels, while maintaining the +original aspect ratio. The bounding boxes associated with the image are specified in the +`xyxy` format. If necessary, the resized image will be padded with zeros to maintain the +original aspect ratio. 
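+
+Note: the pipeline below uses `keras_cv.layers.JitteredResize`. If you specifically want
+the deterministic "resize while keeping the aspect ratio, then pad with zeros" behaviour
+described above (for example at inference time), one option is the `Resizing` layer with
+`pad_to_aspect_ratio=True` - a sketch, assuming your KerasCV version provides it:
+
+```python
+# Illustrative alternative, not used in the training pipeline of this example.
+inference_resizing = keras_cv.layers.Resizing(
+    640, 640, pad_to_aspect_ratio=True, bounding_box_format="xyxy"
+)
+```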
+ +Bounding Box Formats supported by KerasCV: +1. CENTER_XYWH +2. XYWH +3. XYXY +4. REL_XYXY +5. REL_XYWH +6. YXYX +7. REL_YXYX + + +You can read more about KerasCV bounding box formats in +[docs](https://keras.io/api/keras_cv/bounding_box/formats/). + +Furthermore, it is possible to perform format conversion between any two pairs: + +```python +boxes = keras_cv.bounding_box.convert_format( + bounding_box, + images=image, + source="xyxy", # Original Format + target="xywh", # Target Format (to which we want to convert) + ) +``` +""" + +""" +## Data Augmentation + +One of the most challenging tasks when constructing object detection pipelines is data +augmentation. It involves applying various transformations to the input images to +increase the diversity of the training data and improve the model's ability to +generalize. However, when working with object detection tasks, it becomes even more +complex as these transformations need to be aware of the underlying bounding boxes and +update them accordingly. + +KerasCV provides native support for bounding box augmentation. KerasCV offers an +extensive collection of data augmentation layers specifically designed to handle bounding +boxes. These layers intelligently adjust the bounding box coordinates as the image is +transformed, ensuring that the bounding boxes remain accurate and aligned with the +augmented images. + +By leveraging KerasCV's capabilities, developers can conveniently integrate bounding +box-friendly data augmentation into their object detection pipelines. By performing +on-the-fly augmentation within a tf.data pipeline, the process becomes seamless and +efficient, enabling better training and more accurate object detection results. +""" + +augmenter = keras.Sequential( + layers=[ + keras_cv.layers.RandomFlip(mode="horizontal", bounding_box_format="xyxy"), + keras_cv.layers.RandomShear( + x_factor=0.2, y_factor=0.2, bounding_box_format="xyxy" + ), + keras_cv.layers.JitteredResize( + target_size=(640, 640), scale_factor=(0.75, 1.3), bounding_box_format="xyxy" + ), + ] +) + +""" +## Creating Training Dataset +""" + +train_ds = train_data.map(load_dataset, num_parallel_calls=tf.data.AUTOTUNE) +train_ds = train_ds.shuffle(BATCH_SIZE * 4) +train_ds = train_ds.ragged_batch(BATCH_SIZE, drop_remainder=True) +train_ds = train_ds.map(augmenter, num_parallel_calls=tf.data.AUTOTUNE) + +""" +## Creating Validation Dataset +""" + +resizing = keras_cv.layers.JitteredResize( + target_size=(640, 640), + scale_factor=(0.75, 1.3), + bounding_box_format="xyxy", +) + +val_ds = val_data.map(load_dataset, num_parallel_calls=tf.data.AUTOTUNE) +val_ds = val_ds.shuffle(BATCH_SIZE * 4) +val_ds = val_ds.ragged_batch(BATCH_SIZE, drop_remainder=True) +val_ds = val_ds.map(resizing, num_parallel_calls=tf.data.AUTOTUNE) + +""" +## Visualization +""" + + +def visualize_dataset(inputs, value_range, rows, cols, bounding_box_format): + inputs = next(iter(inputs.take(1))) + images, bounding_boxes = inputs["images"], inputs["bounding_boxes"] + visualization.plot_bounding_box_gallery( + images, + value_range=value_range, + rows=rows, + cols=cols, + y_true=bounding_boxes, + scale=5, + font_scale=0.7, + bounding_box_format=bounding_box_format, + class_mapping=class_mapping, + ) + + +visualize_dataset( + train_ds, bounding_box_format="xyxy", value_range=(0, 255), rows=2, cols=2 +) + +visualize_dataset( + val_ds, bounding_box_format="xyxy", value_range=(0, 255), rows=2, cols=2 +) + +""" +We need to extract the inputs from the preprocessing dictionary and get them ready to 
be +fed into the model. +""" + + +def dict_to_tuple(inputs): + return inputs["images"], inputs["bounding_boxes"] + + +train_ds = train_ds.map(dict_to_tuple, num_parallel_calls=tf.data.AUTOTUNE) +train_ds = train_ds.prefetch(tf.data.AUTOTUNE) + +val_ds = val_ds.map(dict_to_tuple, num_parallel_calls=tf.data.AUTOTUNE) +val_ds = val_ds.prefetch(tf.data.AUTOTUNE) + +""" +## Creating Model +""" + +""" +YOLOv8 is a cutting-edge YOLO model that is used for a variety of computer vision tasks, +such as object detection, image classification, and instance segmentation. Ultralytics, +the creators of YOLOv5, also developed YOLOv8, which incorporates many improvements and +changes in architecture and developer experience compared to its predecessor. YOLOv8 is +the latest state-of-the-art model that is highly regarded in the industry. +""" + +""" +Below table compares the performance metrics of five different YOLOv8 models with +different sizes (measured in pixels): YOLOv8n, YOLOv8s, YOLOv8m, YOLOv8l, and YOLOv8x. +The metrics include mean average precision (mAP) values at different +intersection-over-union (IoU) thresholds for validation data, inference speed on CPU with +ONNX format and A100 TensorRT, number of parameters, and number of floating-point +operations (FLOPs) (both in millions and billions, respectively). As the size of the +model increases, the mAP, parameters, and FLOPs generally increase while the speed +decreases. YOLOv8x has the highest mAP, parameters, and FLOPs but also the slowest +inference speed, while YOLOv8n has the smallest size, fastest inference speed, and lowest +mAP, parameters, and FLOPs. + +| Model | +size
(pixels) | mAPval 50-95 | Speed CPU ONNX (ms) |
+Speed A100 TensorRT (ms) | params (M) | FLOPs
(B) | +| ------------------------------------------------------------------------------------ | +--------------------- | -------------------- | ------------------------------ | +----------------------------------- | ------------------ | ----------------- | +| YOLOv8n | 640 | 37.3 | 80.4 +| 0.99 | 3.2 | 8.7 | +| YOLOv8s | 640 | 44.9 | 128.4 +| 1.20 | 11.2 | 28.6 | +| YOLOv8m | 640 | 50.2 | 234.7 +| 1.83 | 25.9 | 78.9 | +| YOLOv8l | 640 | 52.9 | 375.2 +| 2.39 | 43.7 | 165.2 | +| YOLOv8x | 640 | 53.9 | 479.1 +| 3.53 | 68.2 | 257.8 | +""" + +""" +You can read more about YOLOV8 and its architecture in this +[RoboFlow Blog](https://blog.roboflow.com/whats-new-in-yolov8/) +""" + +""" +First we will create a instance of backbone which will be used by our yolov8 detector +class. + +YOLOV8 Backbones available in KerasCV: + +1. Without Weights: + + 1. yolo_v8_xs_backbone + 2. yolo_v8_s_backbone + 3. yolo_v8_m_backbone + 4. yolo_v8_l_backbone + 5. yolo_v8_xl_backbone + +2. With Pre-trained coco weight: + + 1. yolo_v8_xs_backbone_coco + 2. yolo_v8_s_backbone_coco + 2. yolo_v8_m_backbone_coco + 2. yolo_v8_l_backbone_coco + 2. yolo_v8_xl_backbone_coco + + + +""" + +backbone = keras_cv.models.YOLOV8Backbone.from_preset( + "yolo_v8_s_backbone_coco" # We will use yolov8 small backbone with coco weights +) + +""" +Next, let's build a YOLOV8 model using the `YOLOV8Detector`, which accepts a feature +extractor as the `backbone` argument, a `num_classes` argument that specifies the number +of object classes to detect based on the size of the `class_mapping` list, a +`bounding_box_format` argument that informs the model of the format of the bbox in the +dataset, and a finally, the feature pyramid network (FPN) depth is specified by the +`fpn_depth` argument. + +It is simple to build a YOLOV8 using any of the aforementioned backbones thanks to +KerasCV. + +""" + +yolo = keras_cv.models.YOLOV8Detector( + num_classes=len(class_mapping), + bounding_box_format="xyxy", + backbone=backbone, + fpn_depth=1, +) + +""" +## Compile the Model +""" + +""" +Loss used for YOLOV8 + + +1. Classification Loss: This loss function calculates the discrepancy between anticipated +class probabilities and actual class probabilities. In this instance, +`binary_crossentropy`, a prominent solution for binary classification issues, is +Utilized. We Utilized binary crossentropy since each thing that is identified is either +classed as belonging to or not belonging to a certain object class (such as a person, a +car, etc.). + +2. Box Loss: `box_loss` is the loss function used to measure the difference between the +predicted bounding boxes and the ground truth. In this case, the Complete IoU (CIoU) +metric is used, which not only measures the overlap between predicted and ground truth +bounding boxes but also considers the difference in aspect ratio, center distance, and +box size. Together, these loss functions help optimize the model for object detection by +minimizing the difference between the predicted and ground truth class probabilities and +bounding boxes. + + +""" + +optimizer = tf.keras.optimizers.Adam( + learning_rate=LEARNING_RATE, + global_clipnorm=GLOBAL_CLIPNORM, +) + +yolo.compile( + optimizer=optimizer, classification_loss="binary_crossentropy", box_loss="ciou" +) + +""" +## COCO Metric Callback + +We will be using `BoxCOCOMetrics` from KerasCV to evaluate the model and calculate the +Map(Mean Average Precision) score, Recall and Precision. We also save our model when the +mAP score improves. 
+""" + + +class EvaluateCOCOMetricsCallback(keras.callbacks.Callback): + def __init__(self, data, save_path): + super().__init__() + self.data = data + self.metrics = keras_cv.metrics.BoxCOCOMetrics( + bounding_box_format="xyxy", + evaluate_freq=1e9, + ) + + self.save_path = save_path + self.best_map = -1.0 + + def on_epoch_end(self, epoch, logs): + self.metrics.reset_state() + for batch in self.data: + images, y_true = batch[0], batch[1] + y_pred = self.model.predict(images, verbose=0) + self.metrics.update_state(y_true, y_pred) + + metrics = self.metrics.result(force=True) + logs.update(metrics) + + current_map = metrics["MaP"] + if current_map > self.best_map: + self.best_map = current_map + self.model.save(self.save_path) # Save the model when mAP improves + + return logs + + +""" +## Train the Model +""" + +yolo.fit( + train_ds, + validation_data=val_ds, + epochs=3, + callbacks=[EvaluateCOCOMetricsCallback(val_ds, "model.h5")], +) + +""" +## Visualize Predictions +""" + + +def visualize_detections(model, dataset, bounding_box_format): + images, y_true = next(iter(dataset.take(1))) + y_pred = model.predict(images) + y_pred = bounding_box.to_ragged(y_pred) + visualization.plot_bounding_box_gallery( + images, + value_range=(0, 255), + bounding_box_format=bounding_box_format, + y_true=y_true, + y_pred=y_pred, + scale=4, + rows=2, + cols=2, + show=True, + font_scale=0.7, + class_mapping=class_mapping, + ) + + +visualize_detections(yolo, dataset=val_ds, bounding_box_format="xyxy") diff --git a/knowledge_base/vision/zero_dce.py b/knowledge_base/vision/zero_dce.py new file mode 100644 index 0000000000000000000000000000000000000000..fdc6cab1fedb5342d4c5ac8b1b1402f319b2ed1f --- /dev/null +++ b/knowledge_base/vision/zero_dce.py @@ -0,0 +1,535 @@ +""" +Title: Zero-DCE for low-light image enhancement +Author: [Soumik Rakshit](http://github.com/soumik12345) +Date created: 2021/09/18 +Last modified: 2023/07/15 +Description: Implementing Zero-Reference Deep Curve Estimation for low-light image enhancement. +Accelerator: GPU +Converted to Keras 3 by: [Soumik Rakshit](http://github.com/soumik12345) +""" + +""" +## Introduction + +**Zero-Reference Deep Curve Estimation** or **Zero-DCE** formulates low-light image +enhancement as the task of estimating an image-specific +[*tonal curve*](https://en.wikipedia.org/wiki/Curve_(tonality)) with a deep neural network. +In this example, we train a lightweight deep network, **DCE-Net**, to estimate +pixel-wise and high-order tonal curves for dynamic range adjustment of a given image. + +Zero-DCE takes a low-light image as input and produces high-order tonal curves as its output. +These curves are then used for pixel-wise adjustment on the dynamic range of the input to +obtain an enhanced image. The curve estimation process is done in such a way that it maintains +the range of the enhanced image and preserves the contrast of neighboring pixels. This +curve estimation is inspired by curves adjustment used in photo editing software such as +Adobe Photoshop where users can adjust points throughout an imageโ€™s tonal range. + +Zero-DCE is appealing because of its relaxed assumptions with regard to reference images: +it does not require any input/output image pairs during training. +This is achieved through a set of carefully formulated non-reference loss functions, +which implicitly measure the enhancement quality and guide the training of the network. 
+ +### References + +- [Zero-Reference Deep Curve Estimation for Low-Light Image Enhancement](https://arxiv.org/abs/2001.06826) +- [Curves adjustment in Adobe Photoshop](https://helpx.adobe.com/photoshop/using/curves-adjustment.html) +""" + +""" +## Downloading LOLDataset + +The **LoL Dataset** has been created for low-light image enhancement. It provides 485 +images for training and 15 for testing. Each image pair in the dataset consists of a +low-light input image and its corresponding well-exposed reference image. +""" + +import os + +os.environ["KERAS_BACKEND"] = "tensorflow" + +import random +import numpy as np +from glob import glob +from PIL import Image, ImageOps +import matplotlib.pyplot as plt + +import keras +from keras import layers + +import tensorflow as tf + +"""shell +wget https://huggingface.co/datasets/geekyrakshit/LoL-Dataset/resolve/main/lol_dataset.zip +unzip -q lol_dataset.zip && rm lol_dataset.zip +""" + +""" +## Creating a TensorFlow Dataset + +We use 300 low-light images from the LoL Dataset training set for training, and we use +the remaining 185 low-light images for validation. We resize the images to size `256 x +256` to be used for both training and validation. Note that in order to train the DCE-Net, +we will not require the corresponding enhanced images. +""" + +IMAGE_SIZE = 256 +BATCH_SIZE = 16 +MAX_TRAIN_IMAGES = 400 + + +def load_data(image_path): + image = tf.io.read_file(image_path) + image = tf.image.decode_png(image, channels=3) + image = tf.image.resize(images=image, size=[IMAGE_SIZE, IMAGE_SIZE]) + image = image / 255.0 + return image + + +def data_generator(low_light_images): + dataset = tf.data.Dataset.from_tensor_slices((low_light_images)) + dataset = dataset.map(load_data, num_parallel_calls=tf.data.AUTOTUNE) + dataset = dataset.batch(BATCH_SIZE, drop_remainder=True) + return dataset + + +train_low_light_images = sorted(glob("./lol_dataset/our485/low/*"))[:MAX_TRAIN_IMAGES] +val_low_light_images = sorted(glob("./lol_dataset/our485/low/*"))[MAX_TRAIN_IMAGES:] +test_low_light_images = sorted(glob("./lol_dataset/eval15/low/*")) + + +train_dataset = data_generator(train_low_light_images) +val_dataset = data_generator(val_low_light_images) + +print("Train Dataset:", train_dataset) +print("Validation Dataset:", val_dataset) + +""" +## The Zero-DCE Framework + +The goal of DCE-Net is to estimate a set of best-fitting light-enhancement curves +(LE-curves) given an input image. The framework then maps all pixels of the inputโ€™s RGB +channels by applying the curves iteratively to obtain the final enhanced image. + +### Understanding light-enhancement curves + +A ligh-enhancement curve is a kind of curve that can map a low-light image +to its enhanced version automatically, +where the self-adaptive curve parameters are solely dependent on the input image. +When designing such a curve, three objectives should be taken into account: + +- Each pixel value of the enhanced image should be in the normalized range `[0,1]`, in order to +avoid information loss induced by overflow truncation. +- It should be monotonous, to preserve the contrast between neighboring pixels. +- The shape of this curve should be as simple as possible, +and the curve should be differentiable to allow backpropagation. + +The light-enhancement curve is separately applied to three RGB channels instead of solely on the +illumination channel. The three-channel adjustment can better preserve the inherent color and reduce +the risk of over-saturation. 
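+
+Concretely, the paper's quadratic curve is `LE(I; alpha) = I + alpha * I * (1 - I)` with
+`alpha` in `[-1, 1]`, applied per pixel and per channel, and iterated several times with a
+separate `alpha` map per iteration. A minimal NumPy sketch (illustrative only; the actual
+implementation appears later in `get_enhanced_image`):
+
+```python
+import numpy as np
+
+
+def le_curve_step(image, alpha):
+    # One quadratic LE-curve iteration: values stay in [0, 1] and the mapping is
+    # monotonic as long as alpha is in [-1, 1].
+    return image + alpha * image * (1.0 - image)
+
+
+toy = np.full((2, 2, 3), 0.2)  # a dark toy "image" in [0, 1]
+alpha_map = np.full_like(toy, 0.8)  # a constant curve-parameter map
+print(le_curve_step(toy, alpha_map))  # brightened, still within [0, 1]
+```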
+ +![](https://li-chongyi.github.io/Zero-DCE_files/framework.png) + +### DCE-Net + +The DCE-Net is a lightweight deep neural network that learns the mapping between an input +image and its best-fitting curve parameter maps. The input to the DCE-Net is a low-light +image while the outputs are a set of pixel-wise curve parameter maps for corresponding +higher-order curves. It is a plain CNN of seven convolutional layers with symmetrical +concatenation. Each layer consists of 32 convolutional kernels of size 3ร—3 and stride 1 +followed by the ReLU activation function. The last convolutional layer is followed by the +Tanh activation function, which produces 24 parameter maps for 8 iterations, where each +iteration requires three curve parameter maps for the three channels. + +![](https://i.imgur.com/HtIg34W.png) +""" + + +def build_dce_net(): + input_img = keras.Input(shape=[None, None, 3]) + conv1 = layers.Conv2D( + 32, (3, 3), strides=(1, 1), activation="relu", padding="same" + )(input_img) + conv2 = layers.Conv2D( + 32, (3, 3), strides=(1, 1), activation="relu", padding="same" + )(conv1) + conv3 = layers.Conv2D( + 32, (3, 3), strides=(1, 1), activation="relu", padding="same" + )(conv2) + conv4 = layers.Conv2D( + 32, (3, 3), strides=(1, 1), activation="relu", padding="same" + )(conv3) + int_con1 = layers.Concatenate(axis=-1)([conv4, conv3]) + conv5 = layers.Conv2D( + 32, (3, 3), strides=(1, 1), activation="relu", padding="same" + )(int_con1) + int_con2 = layers.Concatenate(axis=-1)([conv5, conv2]) + conv6 = layers.Conv2D( + 32, (3, 3), strides=(1, 1), activation="relu", padding="same" + )(int_con2) + int_con3 = layers.Concatenate(axis=-1)([conv6, conv1]) + x_r = layers.Conv2D(24, (3, 3), strides=(1, 1), activation="tanh", padding="same")( + int_con3 + ) + return keras.Model(inputs=input_img, outputs=x_r) + + +""" +## Loss functions + +To enable zero-reference learning in DCE-Net, we use a set of differentiable +zero-reference losses that allow us to evaluate the quality of enhanced images. +""" + +""" +### Color constancy loss + +The *color constancy loss* is used to correct the potential color deviations in the +enhanced image. +""" + + +def color_constancy_loss(x): + mean_rgb = tf.reduce_mean(x, axis=(1, 2), keepdims=True) + mr, mg, mb = ( + mean_rgb[:, :, :, 0], + mean_rgb[:, :, :, 1], + mean_rgb[:, :, :, 2], + ) + d_rg = tf.square(mr - mg) + d_rb = tf.square(mr - mb) + d_gb = tf.square(mb - mg) + return tf.sqrt(tf.square(d_rg) + tf.square(d_rb) + tf.square(d_gb)) + + +""" +### Exposure loss + +To restrain under-/over-exposed regions, we use the *exposure control loss*. +It measures the distance between the average intensity value of a local region +and a preset well-exposedness level (set to `0.6`). +""" + + +def exposure_loss(x, mean_val=0.6): + x = tf.reduce_mean(x, axis=3, keepdims=True) + mean = tf.nn.avg_pool2d(x, ksize=16, strides=16, padding="VALID") + return tf.reduce_mean(tf.square(mean - mean_val)) + + +""" +### Illumination smoothness loss + +To preserve the monotonicity relations between neighboring pixels, the +*illumination smoothness loss* is added to each curve parameter map. 
+""" + + +def illumination_smoothness_loss(x): + batch_size = tf.shape(x)[0] + h_x = tf.shape(x)[1] + w_x = tf.shape(x)[2] + count_h = (tf.shape(x)[2] - 1) * tf.shape(x)[3] + count_w = tf.shape(x)[2] * (tf.shape(x)[3] - 1) + h_tv = tf.reduce_sum(tf.square((x[:, 1:, :, :] - x[:, : h_x - 1, :, :]))) + w_tv = tf.reduce_sum(tf.square((x[:, :, 1:, :] - x[:, :, : w_x - 1, :]))) + batch_size = tf.cast(batch_size, dtype=tf.float32) + count_h = tf.cast(count_h, dtype=tf.float32) + count_w = tf.cast(count_w, dtype=tf.float32) + return 2 * (h_tv / count_h + w_tv / count_w) / batch_size + + +""" +### Spatial consistency loss + +The *spatial consistency loss* encourages spatial coherence of the enhanced image by +preserving the contrast between neighboring regions across the input image and its enhanced version. +""" + + +class SpatialConsistencyLoss(keras.losses.Loss): + def __init__(self, **kwargs): + super().__init__(reduction="none") + + self.left_kernel = tf.constant( + [[[[0, 0, 0]], [[-1, 1, 0]], [[0, 0, 0]]]], dtype=tf.float32 + ) + self.right_kernel = tf.constant( + [[[[0, 0, 0]], [[0, 1, -1]], [[0, 0, 0]]]], dtype=tf.float32 + ) + self.up_kernel = tf.constant( + [[[[0, -1, 0]], [[0, 1, 0]], [[0, 0, 0]]]], dtype=tf.float32 + ) + self.down_kernel = tf.constant( + [[[[0, 0, 0]], [[0, 1, 0]], [[0, -1, 0]]]], dtype=tf.float32 + ) + + def call(self, y_true, y_pred): + original_mean = tf.reduce_mean(y_true, 3, keepdims=True) + enhanced_mean = tf.reduce_mean(y_pred, 3, keepdims=True) + original_pool = tf.nn.avg_pool2d( + original_mean, ksize=4, strides=4, padding="VALID" + ) + enhanced_pool = tf.nn.avg_pool2d( + enhanced_mean, ksize=4, strides=4, padding="VALID" + ) + + d_original_left = tf.nn.conv2d( + original_pool, + self.left_kernel, + strides=[1, 1, 1, 1], + padding="SAME", + ) + d_original_right = tf.nn.conv2d( + original_pool, + self.right_kernel, + strides=[1, 1, 1, 1], + padding="SAME", + ) + d_original_up = tf.nn.conv2d( + original_pool, self.up_kernel, strides=[1, 1, 1, 1], padding="SAME" + ) + d_original_down = tf.nn.conv2d( + original_pool, + self.down_kernel, + strides=[1, 1, 1, 1], + padding="SAME", + ) + + d_enhanced_left = tf.nn.conv2d( + enhanced_pool, + self.left_kernel, + strides=[1, 1, 1, 1], + padding="SAME", + ) + d_enhanced_right = tf.nn.conv2d( + enhanced_pool, + self.right_kernel, + strides=[1, 1, 1, 1], + padding="SAME", + ) + d_enhanced_up = tf.nn.conv2d( + enhanced_pool, self.up_kernel, strides=[1, 1, 1, 1], padding="SAME" + ) + d_enhanced_down = tf.nn.conv2d( + enhanced_pool, + self.down_kernel, + strides=[1, 1, 1, 1], + padding="SAME", + ) + + d_left = tf.square(d_original_left - d_enhanced_left) + d_right = tf.square(d_original_right - d_enhanced_right) + d_up = tf.square(d_original_up - d_enhanced_up) + d_down = tf.square(d_original_down - d_enhanced_down) + return d_left + d_right + d_up + d_down + + +""" +### Deep curve estimation model + +We implement the Zero-DCE framework as a Keras subclassed model. 
+""" + + +class ZeroDCE(keras.Model): + def __init__(self, **kwargs): + super().__init__(**kwargs) + self.dce_model = build_dce_net() + + def compile(self, learning_rate, **kwargs): + super().compile(**kwargs) + self.optimizer = keras.optimizers.Adam(learning_rate=learning_rate) + self.spatial_constancy_loss = SpatialConsistencyLoss(reduction="none") + self.total_loss_tracker = keras.metrics.Mean(name="total_loss") + self.illumination_smoothness_loss_tracker = keras.metrics.Mean( + name="illumination_smoothness_loss" + ) + self.spatial_constancy_loss_tracker = keras.metrics.Mean( + name="spatial_constancy_loss" + ) + self.color_constancy_loss_tracker = keras.metrics.Mean( + name="color_constancy_loss" + ) + self.exposure_loss_tracker = keras.metrics.Mean(name="exposure_loss") + + @property + def metrics(self): + return [ + self.total_loss_tracker, + self.illumination_smoothness_loss_tracker, + self.spatial_constancy_loss_tracker, + self.color_constancy_loss_tracker, + self.exposure_loss_tracker, + ] + + def get_enhanced_image(self, data, output): + r1 = output[:, :, :, :3] + r2 = output[:, :, :, 3:6] + r3 = output[:, :, :, 6:9] + r4 = output[:, :, :, 9:12] + r5 = output[:, :, :, 12:15] + r6 = output[:, :, :, 15:18] + r7 = output[:, :, :, 18:21] + r8 = output[:, :, :, 21:24] + x = data + r1 * (tf.square(data) - data) + x = x + r2 * (tf.square(x) - x) + x = x + r3 * (tf.square(x) - x) + enhanced_image = x + r4 * (tf.square(x) - x) + x = enhanced_image + r5 * (tf.square(enhanced_image) - enhanced_image) + x = x + r6 * (tf.square(x) - x) + x = x + r7 * (tf.square(x) - x) + enhanced_image = x + r8 * (tf.square(x) - x) + return enhanced_image + + def call(self, data): + dce_net_output = self.dce_model(data) + return self.get_enhanced_image(data, dce_net_output) + + def compute_losses(self, data, output): + enhanced_image = self.get_enhanced_image(data, output) + loss_illumination = 200 * illumination_smoothness_loss(output) + loss_spatial_constancy = tf.reduce_mean( + self.spatial_constancy_loss(enhanced_image, data) + ) + loss_color_constancy = 5 * tf.reduce_mean(color_constancy_loss(enhanced_image)) + loss_exposure = 10 * tf.reduce_mean(exposure_loss(enhanced_image)) + total_loss = ( + loss_illumination + + loss_spatial_constancy + + loss_color_constancy + + loss_exposure + ) + + return { + "total_loss": total_loss, + "illumination_smoothness_loss": loss_illumination, + "spatial_constancy_loss": loss_spatial_constancy, + "color_constancy_loss": loss_color_constancy, + "exposure_loss": loss_exposure, + } + + def train_step(self, data): + with tf.GradientTape() as tape: + output = self.dce_model(data) + losses = self.compute_losses(data, output) + + gradients = tape.gradient( + losses["total_loss"], self.dce_model.trainable_weights + ) + self.optimizer.apply_gradients(zip(gradients, self.dce_model.trainable_weights)) + + self.total_loss_tracker.update_state(losses["total_loss"]) + self.illumination_smoothness_loss_tracker.update_state( + losses["illumination_smoothness_loss"] + ) + self.spatial_constancy_loss_tracker.update_state( + losses["spatial_constancy_loss"] + ) + self.color_constancy_loss_tracker.update_state(losses["color_constancy_loss"]) + self.exposure_loss_tracker.update_state(losses["exposure_loss"]) + + return {metric.name: metric.result() for metric in self.metrics} + + def test_step(self, data): + output = self.dce_model(data) + losses = self.compute_losses(data, output) + + self.total_loss_tracker.update_state(losses["total_loss"]) + 
self.illumination_smoothness_loss_tracker.update_state( + losses["illumination_smoothness_loss"] + ) + self.spatial_constancy_loss_tracker.update_state( + losses["spatial_constancy_loss"] + ) + self.color_constancy_loss_tracker.update_state(losses["color_constancy_loss"]) + self.exposure_loss_tracker.update_state(losses["exposure_loss"]) + + return {metric.name: metric.result() for metric in self.metrics} + + def save_weights(self, filepath, overwrite=True, save_format=None, options=None): + """While saving the weights, we simply save the weights of the DCE-Net""" + self.dce_model.save_weights( + filepath, + overwrite=overwrite, + save_format=save_format, + options=options, + ) + + def load_weights(self, filepath, by_name=False, skip_mismatch=False, options=None): + """While loading the weights, we simply load the weights of the DCE-Net""" + self.dce_model.load_weights( + filepath=filepath, + by_name=by_name, + skip_mismatch=skip_mismatch, + options=options, + ) + + +""" +## Training +""" + +zero_dce_model = ZeroDCE() +zero_dce_model.compile(learning_rate=1e-4) +history = zero_dce_model.fit(train_dataset, validation_data=val_dataset, epochs=100) + + +def plot_result(item): + plt.plot(history.history[item], label=item) + plt.plot(history.history["val_" + item], label="val_" + item) + plt.xlabel("Epochs") + plt.ylabel(item) + plt.title("Train and Validation {} Over Epochs".format(item), fontsize=14) + plt.legend() + plt.grid() + plt.show() + + +plot_result("total_loss") +plot_result("illumination_smoothness_loss") +plot_result("spatial_constancy_loss") +plot_result("color_constancy_loss") +plot_result("exposure_loss") + +""" +## Inference +""" + + +def plot_results(images, titles, figure_size=(12, 12)): + fig = plt.figure(figsize=figure_size) + for i in range(len(images)): + fig.add_subplot(1, len(images), i + 1).set_title(titles[i]) + _ = plt.imshow(images[i]) + plt.axis("off") + plt.show() + + +def infer(original_image): + image = keras.utils.img_to_array(original_image) + image = image.astype("float32") / 255.0 + image = np.expand_dims(image, axis=0) + output_image = zero_dce_model(image) + output_image = tf.cast((output_image[0, :, :, :] * 255), dtype=np.uint8) + output_image = Image.fromarray(output_image.numpy()) + return output_image + + +""" +### Inference on test images + +We compare the test images from LOLDataset enhanced by MIRNet with images enhanced via +the `PIL.ImageOps.autocontrast()` function. + +You can use the trained model hosted on [Hugging Face Hub](https://huggingface.co/keras-io/low-light-image-enhancement) +and try the demo on [Hugging Face Spaces](https://huggingface.co/spaces/keras-io/low-light-image-enhancement). 
+""" + +for val_image_file in test_low_light_images: + original_image = Image.open(val_image_file) + enhanced_image = infer(original_image) + plot_results( + [original_image, ImageOps.autocontrast(original_image), enhanced_image], + ["Original", "PIL Autocontrast", "Enhanced"], + (20, 12), + ) diff --git a/mcp_server/__init__.py b/mcp_server/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..837133252d826e2a105c6088e4bd6bf530b45c80 --- /dev/null +++ b/mcp_server/__init__.py @@ -0,0 +1,2 @@ +__all__ = ["loader", "embeddings"] +__version__ = "0.1.0" \ No newline at end of file diff --git a/mcp_server/embeddings.py b/mcp_server/embeddings.py new file mode 100644 index 0000000000000000000000000000000000000000..d6a9679576682eafda0600cea95de98a88d0a33e --- /dev/null +++ b/mcp_server/embeddings.py @@ -0,0 +1,155 @@ +from __future__ import annotations + +import os +import threading +from dataclasses import dataclass +from typing import List, Sequence, Tuple, Optional + +import numpy as np + +# Determinism knobs +os.environ.setdefault("TOKENIZERS_PARALLELISM", "false") + +try: + import torch # type: ignore +except Exception: # pragma: no cover + torch = None # type: ignore + +try: + from sentence_transformers import SentenceTransformer # type: ignore +except Exception as e: # pragma: no cover + raise RuntimeError( + "sentence-transformers is required. Add it to requirements.txt and install." + ) from e + + +DEFAULT_MODEL = "sentence-transformers/all-MiniLM-L6-v2" + + +def _set_deterministic(seed: int = 42) -> None: + """Best-effort determinism across numpy and torch.""" + np.random.seed(seed) + if torch is not None: + try: + torch.manual_seed(seed) + torch.use_deterministic_algorithms(True) # type: ignore[attr-defined] + except Exception: + # Older PyTorch or restricted envs + pass + + +def _l2_normalize(arr: np.ndarray, eps: float = 1e-12) -> np.ndarray: + norms = np.linalg.norm(arr, axis=1, keepdims=True) + norms = np.maximum(norms, eps) + return arr / norms + + +@dataclass(frozen=True) +class IndexedItem: + """Holds the minimal info needed by the search index.""" + id: str + category: str + filename: str + path: str # relative to project root + summary: str + + +class Embedder: + """ + Thin wrapper around SentenceTransformer to produce normalized embeddings. + Thread-safe, lazily initialized, deterministic. + """ + + def __init__(self, model_name: str = DEFAULT_MODEL, device: Optional[str] = None, seed: int = 42): + self.model_name = model_name + self.device = device + self.seed = seed + self._model: Optional[SentenceTransformer] = None + self._lock = threading.Lock() + _set_deterministic(self.seed) + + def _ensure_model(self) -> SentenceTransformer: + if self._model is not None: + return self._model + with self._lock: + if self._model is None: + self._model = SentenceTransformer(self.model_name, device=self.device) + return self._model # type: ignore[return-value] + + def embed(self, texts: Sequence[str], batch_size: int = 64) -> np.ndarray: + """ + Embed a list of strings into a 2D numpy array (N, D), L2-normalized. 
+ """ + if not texts: + return np.zeros((0, 384), dtype=np.float32) # model dim for MiniLM + model = self._ensure_model() + vecs = model.encode( + list(texts), + batch_size=batch_size, + convert_to_numpy=True, + normalize_embeddings=False, + show_progress_bar=False, + ) + if not isinstance(vecs, np.ndarray): + vecs = np.array(vecs) + vecs = vecs.astype(np.float32, copy=False) + return _l2_normalize(vecs) + + def embed_one(self, text: str) -> np.ndarray: + return self.embed([text])[0:1] + + +class EmbeddingIndex: + """ + In-memory embedding index for KB items using cosine similarity. + + - Stores L2-normalized item vectors in a (N, D) float32 matrix. + - search(query) computes normalized query vector and returns argmax cosine. + """ + + def __init__(self, embedder: Embedder): + self.embedder = embedder + self.items: List[IndexedItem] = [] + self.matrix: Optional[np.ndarray] = None # (N, D), float32, normalized + self.dim: Optional[int] = None + + def build(self, items: Sequence[IndexedItem], texts: Sequence[str]) -> None: + if len(items) != len(texts): + raise ValueError("items and texts must have the same length") + if not items: + # Empty index + self.items = [] + self.matrix = np.zeros((0, 1), dtype=np.float32) + self.dim = 1 + return + vecs = self.embedder.embed(texts) + self.items = list(items) + self.matrix = vecs # already L2-normalized + self.dim = int(vecs.shape[1]) + + def is_built(self) -> bool: + return self.matrix is not None and self.items is not None + + def search_one(self, query_text: str) -> Tuple[IndexedItem, float]: + """ + Return (best_item, best_score) where score is cosine similarity in [ -1, 1 ]. + """ + if self.matrix is None or len(self.items) == 0: + raise RuntimeError("Index is empty. Build the index before searching.") + q = self.embedder.embed_one(query_text) # (1, D), normalized + # Cosine for normalized vectors reduces to dot product + sims = (q @ self.matrix.T).astype(np.float32) # (1, N) + best_idx = int(np.argmax(sims, axis=1)[0]) + best_score = float(sims[0, best_idx]) + return self.items[best_idx], best_score + + def search_topk(self, query_text: str, k: int = 5) -> List[Tuple[IndexedItem, float]]: + if self.matrix is None or len(self.items) == 0: + raise RuntimeError("Index is empty. 
Build the index before searching.") + k = max(1, min(k, len(self.items))) + q = self.embedder.embed_one(query_text) + sims = (q @ self.matrix.T).astype(np.float32).ravel() + topk_idx = np.argpartition(-sims, kth=k - 1)[:k] + # sort exact top-k + topk_sorted = sorted(((int(i), float(sims[int(i)])) for i in topk_idx), key=lambda t: -t[1]) + return [(self.items[i], score) for i, score in topk_sorted] \ No newline at end of file diff --git a/mcp_server/loader.py b/mcp_server/loader.py new file mode 100644 index 0000000000000000000000000000000000000000..0fa1a98dcf80e3b4b64010bb132d053a71ab2618 --- /dev/null +++ b/mcp_server/loader.py @@ -0,0 +1,129 @@ +from __future__ import annotations + +import ast +from dataclasses import dataclass, asdict +from pathlib import Path +from typing import List, Optional, Tuple + +KB_DIRNAME = "knowledge_base" + +@dataclass(frozen=True) +class KBItem: + id: str + category: str + filename: str + path: str + summary: str + +def project_root() -> Path: + """Return project root (folder containing 'mcp_server' and 'knowledge_base').""" + return Path(__file__).resolve().parent.parent + +def kb_root() -> Path: + """Return absolute path to knowledge_base directory.""" + return project_root() / KB_DIRNAME + +def extract_docstring(p: Path) -> str: + """Extract module docstring; fallback to first non-empty line if absent.""" + try: + src = p.read_text(encoding="utf-8") + except Exception: + return "" + try: + module = ast.parse(src) + doc = ast.get_docstring(module) or "" + if doc: + return " ".join(doc.strip().split()) + # fallback: first non-empty non-comment line + for ln in src.splitlines(): + s = ln.strip() + if s: + return s[:300] + return "" + except Exception: + return "" + +def to_rel_path(p: Path) -> str: + """Return path relative to project root for stable IDs.""" + try: + return str(p.relative_to(project_root())) + except Exception: + return str(p) + +def scan_knowledge_base() -> List[KBItem]: + """Scan knowledge_base for Python examples and return KBItem list.""" + root = kb_root() + if not root.exists(): + raise FileNotFoundError(f"Knowledge base folder not found: {root}") + items: List[KBItem] = [] + for category_dir in sorted([d for d in root.iterdir() if d.is_dir()]): + category = category_dir.name + for py in sorted(category_dir.rglob("*.py")): + filename = py.name + rel_path = to_rel_path(py) + summary = extract_docstring(py) or filename + item_id = f"{category}/{filename}" + items.append( + KBItem( + id=item_id, + category=category, + filename=filename, + path=rel_path, + summary=summary, + ) + ) + return items + +# Simple in-process cache. Re-scan lazily on first access. +_ITEMS_CACHE: Optional[List[KBItem]] = None + +def list_items_dict() -> List[dict]: + """Return KB items as plain dicts suitable for JSON serialization.""" + global _ITEMS_CACHE + if _ITEMS_CACHE is None: + _ITEMS_CACHE = scan_knowledge_base() + return [asdict(item) for item in _ITEMS_CACHE] + +def get_embedding_text(item: KBItem) -> str: + """Canonical string used for embedding a KB item.""" + head = (item.summary or "").strip() + return f"{item.category}/{item.filename}: {head}" + +def ensure_kb_path(path_str: str) -> Path: + """ + Validate and resolve a path under knowledge_base. + Accepts either: + - 'knowledge_base//.py' + - '/.py' + Raises if outside KB or not a Python file. 
+ """ + base = kb_root().resolve() + # Normalize input + candidate = Path(path_str) + if not candidate.is_absolute(): + # Allow either direct rel to project root or category/file + p1 = (project_root() / candidate).resolve() + p2 = (base / candidate).resolve() + # Prefer inside KB + p = p1 if str(p1).startswith(str(base)) and p1.exists() else p2 + else: + p = candidate.resolve() + try: + p.relative_to(base) + except Exception: + raise ValueError(f"Path is outside knowledge base: {path_str}") + if not p.exists() or not p.is_file() or p.suffix != ".py": + raise FileNotFoundError(f"KB file not found: {p}") + return p + +def read_code(path_str: str) -> str: + """Read and return full source code of a KB file.""" + p = ensure_kb_path(path_str) + return p.read_text(encoding="utf-8") + +def get_items_for_embedding() -> List[Tuple[KBItem, str]]: + """Return list of (KBItem, text) pairs for embedding.""" + global _ITEMS_CACHE + if _ITEMS_CACHE is None: + _ITEMS_CACHE = scan_knowledge_base() + return [(it, get_embedding_text(it)) for it in _ITEMS_CACHE] \ No newline at end of file diff --git a/mcp_server/server.py b/mcp_server/server.py new file mode 100644 index 0000000000000000000000000000000000000000..322a0de0af3278e2c1a4a19b3a24726b15a9bb7e --- /dev/null +++ b/mcp_server/server.py @@ -0,0 +1,273 @@ +from __future__ import annotations + +import argparse +import os +from typing import Any, Dict, List + +import gradio as gr + +from mcp_server.tools.list_items import list_items as tool_list_items +from mcp_server.tools.semantic_search import semantic_search as tool_semantic_search +from mcp_server.tools.get_code import get_code as tool_get_code + +def create_gradio_blocks() -> gr.Blocks: + """ + Build a Gradio UI that, when launched with mcp_server=True, exposes a remote MCP server + at: http://:/gradio_api/mcp/sse + + Tools exposed (via function signatures and docstrings): + - list_items() + - semantic_search(problem_markdown: str) + - get_code(path: str) + + Polished UI/UX: + - Themed interfaces, custom CSS, clear titles and descriptions + - Curated examples for Semantic Search and Get Code + - Helpful hero/guide text on the List Items tab + """ + + # Lightweight custom CSS for a more polished look + custom_css = """ + :root { + --radius-md: 12px; + --shadow-md: 0 6px 24px rgba(0,0,0,.08); + --color-accent: #3B82F6; /* Blue 500 */ + --color-accent-hover: #2563EB; /* Blue 600 */ + --color-accent-soft: rgba(59,130,246,.15); + --link-text-color: #3B82F6; + } + .gradio-container { max-width: 1120px !important; margin: 0 auto; } + + /* Buttons and controls -> blue accent */ + .gr-button { + border-radius: 12px; + box-shadow: var(--shadow-md); + background: var(--color-accent) !important; + color: #fff !important; + border: 1px solid transparent !important; + } + .gr-button:hover { background: var(--color-accent-hover) !important; } + .gr-button:focus-visible { outline: 2px solid var(--color-accent); outline-offset: 2px; } + + /* Tabs -> blue accent on active/hover */ + .gr-tabs .tab-nav button[aria-selected="true"] { + border-bottom: 2px solid var(--color-accent) !important; + color: var(--color-accent) !important; + } + .gr-tabs .tab-nav button:hover { color: var(--color-accent) !important; } + + /* Examples (chips/buttons) */ + .gr-examples button, .examples button { + border-color: var(--color-accent) !important; + color: var(--color-accent) !important; + background: transparent !important; + } + .gr-examples button:hover, .examples button:hover { + background: var(--color-accent-soft) !important; + 
+    }
+
+    .gr-textbox textarea {
+        font-family: ui-monospace, SFMono-Regular, Menlo, Monaco, Consolas, "Liberation Mono", "Courier New", monospace;
+    }
+    h1, .prose h1 {
+        background: linear-gradient(90deg, #60A5FA, #22D3EE, #1D4ED8);
+        -webkit-background-clip: text;
+        background-clip: text;
+        color: transparent;
+    }
+    a, .prose a { color: var(--link-text-color) !important; }
+    .prose p, .prose li { font-size: 15px; line-height: 1.65; }
+    """
+
+    def list_items() -> List[Dict[str, Any]]:
+        """
+        List all knowledge base items.
+
+        Returns:
+            A JSON-serializable list of items, each with:
+                id (str): '<category>/<filename>.py'
+                category (str)
+                filename (str)
+                path (str): 'knowledge_base/<category>/<filename>.py'
+                summary (str): Docstring or first non-empty line
+        """
+        return tool_list_items()
+
+    def semantic_search(problem_markdown: str) -> Dict[str, Any]:
+        """
+        Semantic search over the knowledge base.
+
+        Args:
+            problem_markdown: Markdown text describing the task/problem.
+
+        Returns:
+            {
+                "best_match": {
+                    "id": str,
+                    "category": str,
+                    "filename": str,
+                    "path": str,
+                    "summary": str
+                },
+                "score": float  # cosine similarity in [-1, 1]
+            }
+        """
+        return tool_semantic_search(problem_markdown)
+
+    def get_code(path: str) -> str:
+        """
+        Return the full Python source code for a KB file.
+
+        Args:
+            path: Either 'knowledge_base/<category>/<filename>.py' or '<category>/<filename>.py'
+
+        Returns:
+            UTF-8 Python source as a string.
+        """
+        return tool_get_code(path)
+
+    # Curated examples for a smoother first-run UX
+    search_examples = [
+        "I want to fine-tune a transformer for sentiment classification.",
+        "Train a GNN on citation networks for node classification.",
+        "Image generation with GANs; how to stabilize training?",
+    ]
+    code_examples = [
+        "knowledge_base/nlp/text_classification_with_transformer.py",
+        "knowledge_base/graph/gnn_citations.py",
+        "knowledge_base/generative/dcgan_overriding_train_step.py",
+    ]
+
+    hero_md = """
+# ⚡️ ML Starter: Your ML Launchpad
+
+## **Starting an ML project and overwhelmed by where to begin?**
+## **LLMs not specialized enough for your domain?**
+## **Need real, reusable code instead of vague suggestions?**
+
+### **Describe your problem → get the top-ranked match → pull the exact code file.**
+
+---
+
+### 🔥 Why you'll love it
+- 🎯 **Problem-to-code in one flow**: search semantically, explore context, and download source.
+- 🧠 **Domain-tuned knowledge**: embeddings built over curated ML projects across vision, NLP, audio, structured data, and more.
+- 🤖 **Automation ready**: the same tools power IDEs/agents via MCP over SSE.
+
+### 🚀 What you can do
+- 📚 **Browse Items**: scan the entire library with instant summaries.
+- 🔎 **Semantic Search**: paste your challenge and get the closest-fit recipe plus similarity score.
+- 💻 **Get Code**: drop in the path and copy the full Python implementation.
+
+### 🛠 Under the hood
+- Sentence-transformer embeddings + cosine similarity for precise retrieval.
+- Rich metadata (id, category, path, summary) for fast filtering.
+- Remote MCP endpoint at `/gradio_api/mcp/sse` exposing `list_items()`, `semantic_search()`, `get_code()`.
+
+### ⏱ Quickstart
+1. Head to "🔎 Semantic Search", describe what you're building, and submit.
+2. Copy the suggested path from the results.
+3. Open "💻 Get Code", paste the path, and grab the exact source.
+4. Want the big picture first? Start with "📚 Browse Items".
+
+### 💡 Power tip
+Run locally or on Spaces, then connect any MCP-compatible client to orchestrate the same workflow programmatically.
+"""
+
+    search_article = """
+🧭 How to use
+1) Describe your task with as much signal as possible (dataset, modality, constraints, target metric).
+2) Click Submit or pick an example. We compute embeddings and retrieve the closest KB match.
+3) Copy the 'path' value and open it in the "💻 Get Code" tab to view the full source.
+
+🧠 Notes
+- Markdown is supported. Bullet points and short snippets help a lot.
+- Similarity uses cosine similarity over L2-normalized sentence-transformer embeddings.
+    """
+    code_article = """
+Paste a valid knowledge base path to fetch the full Python source.
+
+📌 Examples
+- knowledge_base/nlp/text_classification_with_transformer.py
+- nlp/text_classification_with_transformer.py
+
+💡 Tips
+- Accepts both 'knowledge_base/<category>/<filename>.py' and '<category>/<filename>.py' paths.
+- The code block is copy-friendly for quick reuse.
+    """
+
+    list_ui = gr.Interface(
+        fn=list_items,
+        inputs=None,
+        outputs=gr.JSON(label="📦 Items (JSON)"),
+        title="📚 Browse Items",
+        description="Explore every ML Starter KB entry: id, category, path, and summary.",
+        article="",
+    )
+
+    search_ui = gr.Interface(
+        fn=semantic_search,
+        inputs=gr.Textbox(
+            lines=10,
+            label="✍️ Describe your problem (Markdown supported)",
+            placeholder="e.g., Fine-tune a transformer for sentiment classification on IMDB (dataset, goal, constraints)"
+        ),
+        outputs=gr.JSON(label="🏆 Best match + similarity score"),
+        title="🔎 Semantic Search",
+        description="Paste your task. We compute embeddings and return the closest KB recipe with a score.",
+        examples=search_examples,
+        article=search_article,
+    )
+
+    code_ui = gr.Interface(
+        fn=get_code,
+        inputs=gr.Textbox(
+            lines=1,
+            label="📄 KB file path",
+            placeholder="knowledge_base/nlp/text_classification_with_transformer.py"
+        ),
+        outputs=gr.Code(label="🧩 Python source", language="python"),
+        title="💻 Get Code",
+        description="Paste a KB path and copy the exact source into your project.",
+        examples=code_examples,
+        article=code_article,
+    )
+
+    # Compose top-level layout: explanation on top, tabs below
+    with gr.Blocks() as blocks:
+        # Inject the custom CSS so the styling above actually applies
+        gr.HTML(f"<style>{custom_css}</style>")
+        gr.Markdown(hero_md)
+        with gr.Tabs():
+            with gr.Tab("📚 List Items"):
+                list_ui.render()
+            with gr.Tab("🔎 Semantic Search"):
+                search_ui.render()
+            with gr.Tab("💻 Get Code"):
+                code_ui.render()
+    return blocks
+
+
+def main() -> None:
+    """
+    Entry point: launch the Gradio UI and expose the remote MCP server over SSE at /gradio_api/mcp/sse.
+    """
+    parser = argparse.ArgumentParser(description="ML Starter MCP Server (Gradio Remote Only)")
+    parser.add_argument("--host", default="127.0.0.1", help="Host for Gradio")
+    parser.add_argument("--port", type=int, default=7860, help="Port for Gradio")
+    args = parser.parse_args()
+
+    # Derive host/port from environment for Hugging Face Spaces and containers
+    env_host = os.getenv("GRADIO_SERVER_NAME") or os.getenv("HOST") or args.host
+    env_port_str = os.getenv("GRADIO_SERVER_PORT") or os.getenv("PORT")
+    env_port = int(env_port_str) if env_port_str and env_port_str.isdigit() else args.port
+
+    # If running on HF Spaces, bind to 0.0.0.0 unless explicitly overridden
+    if os.getenv("SPACE_ID") and env_host in ("127.0.0.1", "localhost"):
+        env_host = "0.0.0.0"
+
+    blocks = create_gradio_blocks()
+    blocks.launch(server_name=env_host, server_port=env_port, mcp_server=True)
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file
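Once `mcp_server/server.py` is launched (for example with `python -m mcp_server.server`), the three tools are reachable both by MCP clients over SSE at `/gradio_api/mcp/sse` and over the regular Gradio HTTP API. Below is a minimal smoke-test sketch using `gradio_client`; it assumes the default host/port and that the exposed API names mirror the tool function names, so check the app's "Use via API" page if your deployment differs.

```python
# Smoke test against a locally running ML Starter server.
# Assumptions: default 127.0.0.1:7860, and api_name values matching the
# tool function names ("/list_items", "/semantic_search", "/get_code").
from gradio_client import Client

client = Client("http://127.0.0.1:7860/")

# 1) Browse the knowledge base.
items = client.predict(api_name="/list_items")
print(f"{len(items)} KB items available")

# 2) Find the closest recipe for a task description.
result = client.predict(
    "Fine-tune a transformer for sentiment classification on IMDB.",
    api_name="/semantic_search",
)
print(result["best_match"]["path"], result["score"])

# 3) Pull the full source for the matched file.
source = client.predict(result["best_match"]["path"], api_name="/get_code")
print(source[:200])
```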
diff --git a/mcp_server/tools/__init__.py b/mcp_server/tools/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..0ffc238b15cc0177fba96ab16b255fb8e4f7c55e
--- /dev/null
+++ b/mcp_server/tools/__init__.py
@@ -0,0 +1 @@
+__all__ = ["list_items", "semantic_search", "get_code"]
\ No newline at end of file
diff --git a/mcp_server/tools/get_code.py b/mcp_server/tools/get_code.py
new file mode 100644
index 0000000000000000000000000000000000000000..d578e3ff447b3817862dadf810bc40b4325748e7
--- /dev/null
+++ b/mcp_server/tools/get_code.py
@@ -0,0 +1,24 @@
+from __future__ import annotations
+
+from mcp_server.loader import read_code
+
+
+__all__ = ["get_code"]
+
+
+def get_code(path: str) -> str:
+    """
+    Return full Python source code for a knowledge base file.
+
+    Args:
+        path: Either:
+            - "knowledge_base/<category>/<filename>.py"
+            - "<category>/<filename>.py"
+
+    Raises:
+        ValueError: If the resolved path falls outside the knowledge base.
+        FileNotFoundError: If the file does not exist or is not a .py file.
+
+    Returns:
+        The full UTF-8 text of the Python file.
+    """
+    return read_code(path)
\ No newline at end of file
diff --git a/mcp_server/tools/list_items.py b/mcp_server/tools/list_items.py
new file mode 100644
index 0000000000000000000000000000000000000000..562aae48ba85a7e97a3d1f19dc4ccc61c4796df6
--- /dev/null
+++ b/mcp_server/tools/list_items.py
@@ -0,0 +1,25 @@
+from __future__ import annotations
+
+from typing import List, Dict
+
+from mcp_server.loader import list_items_dict
+
+
+__all__ = ["list_items"]
+
+
+def list_items() -> List[Dict]:
+    """
+    Return all KB items with minimal metadata:
+    [
+        {
+            "id": "nlp/text_classification_with_transformer.py",
+            "category": "nlp",
+            "filename": "text_classification_with_transformer.py",
+            "path": "knowledge_base/nlp/text_classification_with_transformer.py",
+            "summary": "...",
+        },
+        ...
+    ]
+    """
+    return list_items_dict()
\ No newline at end of file
diff --git a/mcp_server/tools/semantic_search.py b/mcp_server/tools/semantic_search.py
new file mode 100644
index 0000000000000000000000000000000000000000..1a02b862d00518744d63b151de1c7e3d6d3d904e
--- /dev/null
+++ b/mcp_server/tools/semantic_search.py
@@ -0,0 +1,56 @@
+from __future__ import annotations
+
+from typing import Dict, Tuple, List
+
+from mcp_server.loader import get_items_for_embedding, get_embedding_text
+from mcp_server.embeddings import Embedder, EmbeddingIndex, IndexedItem
+
+
+__all__ = ["semantic_search"]
+
+# Lazy singletons
+_embedder: Embedder | None = None
+_index: EmbeddingIndex | None = None
+_built: bool = False
+
+
+def _ensure_index() -> EmbeddingIndex:
+    global _embedder, _index, _built
+    if _embedder is None:
+        _embedder = Embedder()
+    if _index is None:
+        _index = EmbeddingIndex(_embedder)
+    if not _built:
+        pairs = get_items_for_embedding()  # List[(KBItem, text)]
+        items: List[IndexedItem] = [
+            IndexedItem(
+                id=it.id,
+                category=it.category,
+                filename=it.filename,
+                path=it.path,
+                summary=it.summary,
+            )
+            for it, _ in pairs
+        ]
+        texts: List[str] = [get_embedding_text(it) for it, _ in pairs]
+        _index.build(items, texts)
+        _built = True
+    return _index
+
+
+def semantic_search(problem_markdown: str) -> Dict:
+    """
+    Return only the best match and its score.
+ { + "best_match": "knowledge_base/nlp/text_classification_with_transformer.py", + "score": 0.89 + } + """ + if not isinstance(problem_markdown, str) or not problem_markdown.strip(): + raise ValueError("problem_markdown must be a non-empty string") + index = _ensure_index() + best_item, score = index.search_one(problem_markdown) + return { + "best_match": best_item, + "score": round(float(score), 6), + } \ No newline at end of file diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000000000000000000000000000000000000..e0444f8330e48e1e50f49bfad1aa5ec06a367266 --- /dev/null +++ b/requirements.txt @@ -0,0 +1,9 @@ +# Gradio MCP (remote server only) +gradio[mcp]>=5.0.0 + +# Retrieval +sentence-transformers>=3.0.1,<4 +numpy>=1.26,<3 +# PyTorch backend for sentence-transformers; pick a CPU or CUDA build as appropriate for your system +torch>=2.1.0 +huggingface_hub>=0.23.0 \ No newline at end of file