{ "cells": [ { "cell_type": "markdown", "metadata": { "id": "onFz3_7AqnaB" }, "source": [ "## Gemma 3n Video with Audio Inference" ] }, { "cell_type": "markdown", "metadata": { "id": "KKUnhy4JqqAg" }, "source": [ "In this notebook we'll run Gemma 3n inference on videos that contain audio." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "id": "Vf-VvnrNjuxF" }, "outputs": [], "source": [ "!pip install -U -q transformers timm datasets" ] }, { "cell_type": "markdown", "metadata": { "id": "gcJbxIPLqvjH" }, "source": [ "We will load three examples from the FineVideo dataset and the Gemma 3n model, so make sure you have access to both and provide an access token." ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": { "base_uri": "https://localhost:8080/", "height": 17, "referenced_widgets": [ "542490f74e974451bc44009a6fa174bd", "409f985be1134b468b81136fbdb54408", "57cb1e931c614980a4147cb125524d7d", "87dc7aaf52e349a7bb43bb1b8bc137ee", "983ed4cb4eea42daa9ae8c0417021a21", "40c381fd7bb04b43a879044a4e988cc6", "8d0e5abdd7c549f1a66ee198c9fa1430", "c72dd3d6a4c246cfa6590c314783c8f0", "c0e471e664dd41eab98efe08301ef5e1", "868f63ea9455442d837dc2c422918800", "5b7b4707b1bf4159a10bf7e289bde435", "889d0d1ed24e4de2b89896511d008e60", "68fc757825dd44a48ab2383db20958db", "cb76f933e6e640d9a688f7838e5fb0b3", "8704264bff4d46c9813ac9acf92da962", "9b5d87960dde401baeaf8b6144fb8bad", "76e06881e5e94197a24944e07fdf3189", "f40dd696acc64c6284c6f8f485f3ce9d", "4488de26dce74cbbb39d99ae09bd21fa", "ded62e6c032745ec88ca0ab694b0d397" ] }, "id": "bROdG2-Jj9lT", "outputId": "1978e9bd-3b52-40b8-e643-418f9872476d" }, "outputs": [ { "data": { "application/vnd.jupyter.widget-view+json": { "model_id": "542490f74e974451bc44009a6fa174bd", "version_major": 2, "version_minor": 0 }, "text/plain": [ "VBox(children=(HTML(value='