{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## **goodbooks-10k-the-latest**"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"execution": {
"iopub.execute_input": "2025-07-06T13:00:53.977640Z",
"iopub.status.busy": "2025-07-06T13:00:53.977344Z",
"iopub.status.idle": "2025-07-06T13:00:55.797338Z",
"shell.execute_reply": "2025-07-06T13:00:55.796558Z",
"shell.execute_reply.started": "2025-07-06T13:00:53.977615Z"
},
"trusted": true
},
"outputs": [],
"source": [
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"execution": {
"iopub.execute_input": "2025-07-06T13:01:20.399323Z",
"iopub.status.busy": "2025-07-06T13:01:20.398769Z",
"iopub.status.idle": "2025-07-06T13:01:23.186859Z",
"shell.execute_reply": "2025-07-06T13:01:23.186291Z",
"shell.execute_reply.started": "2025-07-06T13:01:20.399297Z"
},
"trusted": true
},
"outputs": [],
"source": [
"# Every CSV of the dataset lives in the same Kaggle input directory, so the\n",
"# directory is named once here instead of being repeated in each path.\n",
"DATA_DIR = \"/kaggle/input/goodbooks-10k-the-latest\"\n",
"\n",
"# Load the five goodbooks-10k tables into notebook-level DataFrames.\n",
"book_tags_df = pd.read_csv(f\"{DATA_DIR}/book_tags.csv\")\n",
"books_df = pd.read_csv(f\"{DATA_DIR}/books.csv\")\n",
"ratings_df = pd.read_csv(f\"{DATA_DIR}/ratings.csv\")\n",
"tags_df = pd.read_csv(f\"{DATA_DIR}/tags.csv\")\n",
"to_read_df = pd.read_csv(f\"{DATA_DIR}/to_read.csv\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"execution": {
"iopub.execute_input": "2025-07-06T13:01:33.994608Z",
"iopub.status.busy": "2025-07-06T13:01:33.993937Z",
"iopub.status.idle": "2025-07-06T13:01:34.016081Z",
"shell.execute_reply": "2025-07-06T13:01:34.015333Z",
"shell.execute_reply.started": "2025-07-06T13:01:33.994574Z"
},
"trusted": true
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/lib/python3.11/dist-packages/pandas/core/computation/expressions.py:73: RuntimeWarning: invalid value encountered in greater\n",
" return op(a, b)\n"
]
}
],
"source": [
"# Keep only publication years after 1900; every other year (including years\n",
"# that are already missing) becomes NaN. No rows are dropped.\n",
"# The original filter also tested `> 0`, which `> 1900` already implies.\n",
"# NOTE(review): a later cell re-reads books.csv into books_df, which discards\n",
"# this change; the same filter is applied again after that reload.\n",
"publication_year = books_df['original_publication_year']\n",
"books_df['original_publication_year'] = publication_year.where(publication_year > 1900)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"execution": {
"iopub.execute_input": "2025-07-05T14:12:43.116998Z",
"iopub.status.busy": "2025-07-05T14:12:43.116738Z",
"iopub.status.idle": "2025-07-05T14:12:43.206274Z",
"shell.execute_reply": "2025-07-05T14:12:43.205745Z",
"shell.execute_reply.started": "2025-07-05T14:12:43.116979Z"
},
"trusted": true
},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" book_id | \n",
" goodreads_book_id | \n",
" best_book_id | \n",
" work_id | \n",
" books_count | \n",
" isbn | \n",
" isbn13 | \n",
" authors | \n",
" original_publication_year | \n",
" original_title | \n",
" ... | \n",
" ratings_count | \n",
" work_ratings_count | \n",
" work_text_reviews_count | \n",
" ratings_1 | \n",
" ratings_2 | \n",
" ratings_3 | \n",
" ratings_4 | \n",
" ratings_5 | \n",
" image_url | \n",
" small_image_url | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 1 | \n",
" 2767052 | \n",
" 2767052 | \n",
" 2792775 | \n",
" 272 | \n",
" 439023483 | \n",
" 9.780439e+12 | \n",
" Suzanne Collins | \n",
" 2008.0 | \n",
" The Hunger Games | \n",
" ... | \n",
" 4780653 | \n",
" 4942365 | \n",
" 155254 | \n",
" 66715 | \n",
" 127936 | \n",
" 560092 | \n",
" 1481305 | \n",
" 2706317 | \n",
" https://images.gr-assets.com/books/1447303603m... | \n",
" https://images.gr-assets.com/books/1447303603s... | \n",
"
\n",
" \n",
" | 1 | \n",
" 2 | \n",
" 3 | \n",
" 3 | \n",
" 4640799 | \n",
" 491 | \n",
" 439554934 | \n",
" 9.780440e+12 | \n",
" J.K. Rowling, Mary GrandPré | \n",
" 1997.0 | \n",
" Harry Potter and the Philosopher's Stone | \n",
" ... | \n",
" 4602479 | \n",
" 4800065 | \n",
" 75867 | \n",
" 75504 | \n",
" 101676 | \n",
" 455024 | \n",
" 1156318 | \n",
" 3011543 | \n",
" https://images.gr-assets.com/books/1474154022m... | \n",
" https://images.gr-assets.com/books/1474154022s... | \n",
"
\n",
" \n",
" | 2 | \n",
" 3 | \n",
" 41865 | \n",
" 41865 | \n",
" 3212258 | \n",
" 226 | \n",
" 316015849 | \n",
" 9.780316e+12 | \n",
" Stephenie Meyer | \n",
" 2005.0 | \n",
" Twilight | \n",
" ... | \n",
" 3866839 | \n",
" 3916824 | \n",
" 95009 | \n",
" 456191 | \n",
" 436802 | \n",
" 793319 | \n",
" 875073 | \n",
" 1355439 | \n",
" https://images.gr-assets.com/books/1361039443m... | \n",
" https://images.gr-assets.com/books/1361039443s... | \n",
"
\n",
" \n",
" | 3 | \n",
" 4 | \n",
" 2657 | \n",
" 2657 | \n",
" 3275794 | \n",
" 487 | \n",
" 61120081 | \n",
" 9.780061e+12 | \n",
" Harper Lee | \n",
" 1960.0 | \n",
" To Kill a Mockingbird | \n",
" ... | \n",
" 3198671 | \n",
" 3340896 | \n",
" 72586 | \n",
" 60427 | \n",
" 117415 | \n",
" 446835 | \n",
" 1001952 | \n",
" 1714267 | \n",
" https://images.gr-assets.com/books/1361975680m... | \n",
" https://images.gr-assets.com/books/1361975680s... | \n",
"
\n",
" \n",
" | 4 | \n",
" 5 | \n",
" 4671 | \n",
" 4671 | \n",
" 245494 | \n",
" 1356 | \n",
" 743273567 | \n",
" 9.780743e+12 | \n",
" F. Scott Fitzgerald | \n",
" 1925.0 | \n",
" The Great Gatsby | \n",
" ... | \n",
" 2683664 | \n",
" 2773745 | \n",
" 51992 | \n",
" 86236 | \n",
" 197621 | \n",
" 606158 | \n",
" 936012 | \n",
" 947718 | \n",
" https://images.gr-assets.com/books/1490528560m... | \n",
" https://images.gr-assets.com/books/1490528560s... | \n",
"
\n",
" \n",
"
\n",
"
5 rows × 23 columns
\n",
"
"
],
"text/plain": [
" book_id goodreads_book_id best_book_id work_id books_count isbn \\\n",
"0 1 2767052 2767052 2792775 272 439023483 \n",
"1 2 3 3 4640799 491 439554934 \n",
"2 3 41865 41865 3212258 226 316015849 \n",
"3 4 2657 2657 3275794 487 61120081 \n",
"4 5 4671 4671 245494 1356 743273567 \n",
"\n",
" isbn13 authors original_publication_year \\\n",
"0 9.780439e+12 Suzanne Collins 2008.0 \n",
"1 9.780440e+12 J.K. Rowling, Mary GrandPré 1997.0 \n",
"2 9.780316e+12 Stephenie Meyer 2005.0 \n",
"3 9.780061e+12 Harper Lee 1960.0 \n",
"4 9.780743e+12 F. Scott Fitzgerald 1925.0 \n",
"\n",
" original_title ... ratings_count \\\n",
"0 The Hunger Games ... 4780653 \n",
"1 Harry Potter and the Philosopher's Stone ... 4602479 \n",
"2 Twilight ... 3866839 \n",
"3 To Kill a Mockingbird ... 3198671 \n",
"4 The Great Gatsby ... 2683664 \n",
"\n",
" work_ratings_count work_text_reviews_count ratings_1 ratings_2 \\\n",
"0 4942365 155254 66715 127936 \n",
"1 4800065 75867 75504 101676 \n",
"2 3916824 95009 456191 436802 \n",
"3 3340896 72586 60427 117415 \n",
"4 2773745 51992 86236 197621 \n",
"\n",
" ratings_3 ratings_4 ratings_5 \\\n",
"0 560092 1481305 2706317 \n",
"1 455024 1156318 3011543 \n",
"2 793319 875073 1355439 \n",
"3 446835 1001952 1714267 \n",
"4 606158 936012 947718 \n",
"\n",
" image_url \\\n",
"0 https://images.gr-assets.com/books/1447303603m... \n",
"1 https://images.gr-assets.com/books/1474154022m... \n",
"2 https://images.gr-assets.com/books/1361039443m... \n",
"3 https://images.gr-assets.com/books/1361975680m... \n",
"4 https://images.gr-assets.com/books/1490528560m... \n",
"\n",
" small_image_url \n",
"0 https://images.gr-assets.com/books/1447303603s... \n",
"1 https://images.gr-assets.com/books/1474154022s... \n",
"2 https://images.gr-assets.com/books/1361039443s... \n",
"3 https://images.gr-assets.com/books/1361975680s... \n",
"4 https://images.gr-assets.com/books/1490528560s... \n",
"\n",
"[5 rows x 23 columns]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"books_df = pd.read_csv(\"/kaggle/input/goodbooks-10k-the-latest/books.csv\")\n",
"books_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"execution": {
"iopub.execute_input": "2025-07-05T14:12:45.248976Z",
"iopub.status.busy": "2025-07-05T14:12:45.248695Z",
"iopub.status.idle": "2025-07-05T14:12:45.298184Z",
"shell.execute_reply": "2025-07-05T14:12:45.297650Z",
"shell.execute_reply.started": "2025-07-05T14:12:45.248954Z"
},
"trusted": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" book_id | \n",
" goodreads_book_id | \n",
" best_book_id | \n",
" work_id | \n",
" books_count | \n",
" isbn13 | \n",
" original_publication_year | \n",
" average_rating | \n",
" ratings_count | \n",
" work_ratings_count | \n",
" work_text_reviews_count | \n",
" ratings_1 | \n",
" ratings_2 | \n",
" ratings_3 | \n",
" ratings_4 | \n",
" ratings_5 | \n",
"
\n",
" \n",
" \n",
" \n",
" | count | \n",
" 10000.00000 | \n",
" 1.000000e+04 | \n",
" 1.000000e+04 | \n",
" 1.000000e+04 | \n",
" 10000.000000 | \n",
" 9.415000e+03 | \n",
" 9979.000000 | \n",
" 10000.000000 | \n",
" 1.000000e+04 | \n",
" 1.000000e+04 | \n",
" 10000.000000 | \n",
" 10000.000000 | \n",
" 10000.000000 | \n",
" 10000.000000 | \n",
" 1.000000e+04 | \n",
" 1.000000e+04 | \n",
"
\n",
" \n",
" | mean | \n",
" 5000.50000 | \n",
" 5.264697e+06 | \n",
" 5.471214e+06 | \n",
" 8.646183e+06 | \n",
" 75.712700 | \n",
" 9.755044e+12 | \n",
" 1981.987674 | \n",
" 4.002191 | \n",
" 5.400124e+04 | \n",
" 5.968732e+04 | \n",
" 2919.955300 | \n",
" 1345.040600 | \n",
" 3110.885000 | \n",
" 11475.893800 | \n",
" 1.996570e+04 | \n",
" 2.378981e+04 | \n",
"
\n",
" \n",
" | std | \n",
" 2886.89568 | \n",
" 7.575462e+06 | \n",
" 7.827330e+06 | \n",
" 1.175106e+07 | \n",
" 170.470728 | \n",
" 4.428619e+11 | \n",
" 152.576665 | \n",
" 0.254427 | \n",
" 1.573700e+05 | \n",
" 1.678038e+05 | \n",
" 6124.378132 | \n",
" 6635.626263 | \n",
" 9717.123578 | \n",
" 28546.449183 | \n",
" 5.144736e+04 | \n",
" 7.976889e+04 | \n",
"
\n",
" \n",
" | min | \n",
" 1.00000 | \n",
" 1.000000e+00 | \n",
" 1.000000e+00 | \n",
" 8.700000e+01 | \n",
" 1.000000 | \n",
" 1.951703e+08 | \n",
" -1750.000000 | \n",
" 2.470000 | \n",
" 2.716000e+03 | \n",
" 5.510000e+03 | \n",
" 3.000000 | \n",
" 11.000000 | \n",
" 30.000000 | \n",
" 323.000000 | \n",
" 7.500000e+02 | \n",
" 7.540000e+02 | \n",
"
\n",
" \n",
" | 25% | \n",
" 2500.75000 | \n",
" 4.627575e+04 | \n",
" 4.791175e+04 | \n",
" 1.008841e+06 | \n",
" 23.000000 | \n",
" 9.780316e+12 | \n",
" 1990.000000 | \n",
" 3.850000 | \n",
" 1.356875e+04 | \n",
" 1.543875e+04 | \n",
" 694.000000 | \n",
" 196.000000 | \n",
" 656.000000 | \n",
" 3112.000000 | \n",
" 5.405750e+03 | \n",
" 5.334000e+03 | \n",
"
\n",
" \n",
" | 50% | \n",
" 5000.50000 | \n",
" 3.949655e+05 | \n",
" 4.251235e+05 | \n",
" 2.719524e+06 | \n",
" 40.000000 | \n",
" 9.780452e+12 | \n",
" 2004.000000 | \n",
" 4.020000 | \n",
" 2.115550e+04 | \n",
" 2.383250e+04 | \n",
" 1402.000000 | \n",
" 391.000000 | \n",
" 1163.000000 | \n",
" 4894.000000 | \n",
" 8.269500e+03 | \n",
" 8.836000e+03 | \n",
"
\n",
" \n",
" | 75% | \n",
" 7500.25000 | \n",
" 9.382225e+06 | \n",
" 9.636112e+06 | \n",
" 1.451775e+07 | \n",
" 67.000000 | \n",
" 9.780831e+12 | \n",
" 2011.000000 | \n",
" 4.180000 | \n",
" 4.105350e+04 | \n",
" 4.591500e+04 | \n",
" 2744.250000 | \n",
" 885.000000 | \n",
" 2353.250000 | \n",
" 9287.000000 | \n",
" 1.602350e+04 | \n",
" 1.730450e+04 | \n",
"
\n",
" \n",
" | max | \n",
" 10000.00000 | \n",
" 3.328864e+07 | \n",
" 3.553423e+07 | \n",
" 5.639960e+07 | \n",
" 3455.000000 | \n",
" 9.790008e+12 | \n",
" 2017.000000 | \n",
" 4.820000 | \n",
" 4.780653e+06 | \n",
" 4.942365e+06 | \n",
" 155254.000000 | \n",
" 456191.000000 | \n",
" 436802.000000 | \n",
" 793319.000000 | \n",
" 1.481305e+06 | \n",
" 3.011543e+06 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" book_id goodreads_book_id best_book_id work_id \\\n",
"count 10000.00000 1.000000e+04 1.000000e+04 1.000000e+04 \n",
"mean 5000.50000 5.264697e+06 5.471214e+06 8.646183e+06 \n",
"std 2886.89568 7.575462e+06 7.827330e+06 1.175106e+07 \n",
"min 1.00000 1.000000e+00 1.000000e+00 8.700000e+01 \n",
"25% 2500.75000 4.627575e+04 4.791175e+04 1.008841e+06 \n",
"50% 5000.50000 3.949655e+05 4.251235e+05 2.719524e+06 \n",
"75% 7500.25000 9.382225e+06 9.636112e+06 1.451775e+07 \n",
"max 10000.00000 3.328864e+07 3.553423e+07 5.639960e+07 \n",
"\n",
" books_count isbn13 original_publication_year average_rating \\\n",
"count 10000.000000 9.415000e+03 9979.000000 10000.000000 \n",
"mean 75.712700 9.755044e+12 1981.987674 4.002191 \n",
"std 170.470728 4.428619e+11 152.576665 0.254427 \n",
"min 1.000000 1.951703e+08 -1750.000000 2.470000 \n",
"25% 23.000000 9.780316e+12 1990.000000 3.850000 \n",
"50% 40.000000 9.780452e+12 2004.000000 4.020000 \n",
"75% 67.000000 9.780831e+12 2011.000000 4.180000 \n",
"max 3455.000000 9.790008e+12 2017.000000 4.820000 \n",
"\n",
" ratings_count work_ratings_count work_text_reviews_count \\\n",
"count 1.000000e+04 1.000000e+04 10000.000000 \n",
"mean 5.400124e+04 5.968732e+04 2919.955300 \n",
"std 1.573700e+05 1.678038e+05 6124.378132 \n",
"min 2.716000e+03 5.510000e+03 3.000000 \n",
"25% 1.356875e+04 1.543875e+04 694.000000 \n",
"50% 2.115550e+04 2.383250e+04 1402.000000 \n",
"75% 4.105350e+04 4.591500e+04 2744.250000 \n",
"max 4.780653e+06 4.942365e+06 155254.000000 \n",
"\n",
" ratings_1 ratings_2 ratings_3 ratings_4 ratings_5 \n",
"count 10000.000000 10000.000000 10000.000000 1.000000e+04 1.000000e+04 \n",
"mean 1345.040600 3110.885000 11475.893800 1.996570e+04 2.378981e+04 \n",
"std 6635.626263 9717.123578 28546.449183 5.144736e+04 7.976889e+04 \n",
"min 11.000000 30.000000 323.000000 7.500000e+02 7.540000e+02 \n",
"25% 196.000000 656.000000 3112.000000 5.405750e+03 5.334000e+03 \n",
"50% 391.000000 1163.000000 4894.000000 8.269500e+03 8.836000e+03 \n",
"75% 885.000000 2353.250000 9287.000000 1.602350e+04 1.730450e+04 \n",
"max 456191.000000 436802.000000 793319.000000 1.481305e+06 3.011543e+06 "
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"books_df.describe()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"execution": {
"iopub.execute_input": "2025-07-05T14:12:46.982964Z",
"iopub.status.busy": "2025-07-05T14:12:46.982180Z",
"iopub.status.idle": "2025-07-05T14:12:46.991181Z",
"shell.execute_reply": "2025-07-05T14:12:46.990265Z",
"shell.execute_reply.started": "2025-07-05T14:12:46.982930Z"
},
"trusted": true
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/usr/local/lib/python3.11/dist-packages/pandas/core/computation/expressions.py:73: RuntimeWarning: invalid value encountered in greater\n",
" return op(a, b)\n"
]
}
],
"source": [
"# Keep only publication years after 1900; every other year (including years\n",
"# that are already missing) becomes NaN. No rows are dropped.\n",
"# The original filter also tested `> 0`, which `> 1900` already implies.\n",
"publication_year = books_df['original_publication_year']\n",
"books_df['original_publication_year'] = publication_year.where(publication_year > 1900)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"execution": {
"iopub.execute_input": "2025-07-04T14:25:06.895437Z",
"iopub.status.busy": "2025-07-04T14:25:06.894719Z",
"iopub.status.idle": "2025-07-04T14:25:06.913897Z",
"shell.execute_reply": "2025-07-04T14:25:06.913197Z",
"shell.execute_reply.started": "2025-07-04T14:25:06.895413Z"
},
"trusted": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"RangeIndex: 10000 entries, 0 to 9999\n",
"Data columns (total 23 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 book_id 10000 non-null int64 \n",
" 1 goodreads_book_id 10000 non-null int64 \n",
" 2 best_book_id 10000 non-null int64 \n",
" 3 work_id 10000 non-null int64 \n",
" 4 books_count 10000 non-null int64 \n",
" 5 isbn 9300 non-null object \n",
" 6 isbn13 9415 non-null float64\n",
" 7 authors 10000 non-null object \n",
" 8 original_publication_year 9594 non-null float64\n",
" 9 original_title 9415 non-null object \n",
" 10 title 10000 non-null object \n",
" 11 language_code 8916 non-null object \n",
" 12 average_rating 10000 non-null float64\n",
" 13 ratings_count 10000 non-null int64 \n",
" 14 work_ratings_count 10000 non-null int64 \n",
" 15 work_text_reviews_count 10000 non-null int64 \n",
" 16 ratings_1 10000 non-null int64 \n",
" 17 ratings_2 10000 non-null int64 \n",
" 18 ratings_3 10000 non-null int64 \n",
" 19 ratings_4 10000 non-null int64 \n",
" 20 ratings_5 10000 non-null int64 \n",
" 21 image_url 10000 non-null object \n",
" 22 small_image_url 10000 non-null object \n",
"dtypes: float64(3), int64(13), object(7)\n",
"memory usage: 1.8+ MB\n"
]
}
],
"source": [
"books_df.info()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"execution": {
"iopub.execute_input": "2025-07-06T09:56:09.306945Z",
"iopub.status.busy": "2025-07-06T09:56:09.306213Z",
"iopub.status.idle": "2025-07-06T09:56:09.348119Z",
"shell.execute_reply": "2025-07-06T09:56:09.347469Z",
"shell.execute_reply.started": "2025-07-06T09:56:09.306922Z"
},
"trusted": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" ratings_1 | \n",
" ratings_2 | \n",
" ratings_3 | \n",
" ratings_4 | \n",
" ratings_5 | \n",
"
\n",
" \n",
" \n",
" \n",
" | count | \n",
" 10000.000000 | \n",
" 10000.000000 | \n",
" 10000.000000 | \n",
" 1.000000e+04 | \n",
" 1.000000e+04 | \n",
"
\n",
" \n",
" | mean | \n",
" 1345.040600 | \n",
" 3110.885000 | \n",
" 11475.893800 | \n",
" 1.996570e+04 | \n",
" 2.378981e+04 | \n",
"
\n",
" \n",
" | std | \n",
" 6635.626263 | \n",
" 9717.123578 | \n",
" 28546.449183 | \n",
" 5.144736e+04 | \n",
" 7.976889e+04 | \n",
"
\n",
" \n",
" | min | \n",
" 11.000000 | \n",
" 30.000000 | \n",
" 323.000000 | \n",
" 7.500000e+02 | \n",
" 7.540000e+02 | \n",
"
\n",
" \n",
" | 25% | \n",
" 196.000000 | \n",
" 656.000000 | \n",
" 3112.000000 | \n",
" 5.405750e+03 | \n",
" 5.334000e+03 | \n",
"
\n",
" \n",
" | 50% | \n",
" 391.000000 | \n",
" 1163.000000 | \n",
" 4894.000000 | \n",
" 8.269500e+03 | \n",
" 8.836000e+03 | \n",
"
\n",
" \n",
" | 75% | \n",
" 885.000000 | \n",
" 2353.250000 | \n",
" 9287.000000 | \n",
" 1.602350e+04 | \n",
" 1.730450e+04 | \n",
"
\n",
" \n",
" | max | \n",
" 456191.000000 | \n",
" 436802.000000 | \n",
" 793319.000000 | \n",
" 1.481305e+06 | \n",
" 3.011543e+06 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" ratings_1 ratings_2 ratings_3 ratings_4 ratings_5\n",
"count 10000.000000 10000.000000 10000.000000 1.000000e+04 1.000000e+04\n",
"mean 1345.040600 3110.885000 11475.893800 1.996570e+04 2.378981e+04\n",
"std 6635.626263 9717.123578 28546.449183 5.144736e+04 7.976889e+04\n",
"min 11.000000 30.000000 323.000000 7.500000e+02 7.540000e+02\n",
"25% 196.000000 656.000000 3112.000000 5.405750e+03 5.334000e+03\n",
"50% 391.000000 1163.000000 4894.000000 8.269500e+03 8.836000e+03\n",
"75% 885.000000 2353.250000 9287.000000 1.602350e+04 1.730450e+04\n",
"max 456191.000000 436802.000000 793319.000000 1.481305e+06 3.011543e+06"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Summary statistics restricted to the ratings_1 ... ratings_5 columns.\n",
"rating_columns = ['ratings_1', 'ratings_2', 'ratings_3', 'ratings_4', 'ratings_5']\n",
"books_df[rating_columns].describe()"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"execution": {
"iopub.execute_input": "2025-07-04T16:59:31.396853Z",
"iopub.status.busy": "2025-07-04T16:59:31.396119Z",
"iopub.status.idle": "2025-07-04T16:59:33.034848Z",
"shell.execute_reply": "2025-07-04T16:59:33.034146Z",
"shell.execute_reply.started": "2025-07-04T16:59:31.396819Z"
},
"trusted": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" user_id | \n",
" book_id | \n",
" rating | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 1 | \n",
" 258 | \n",
" 5 | \n",
"
\n",
" \n",
" | 1 | \n",
" 2 | \n",
" 4081 | \n",
" 4 | \n",
"
\n",
" \n",
" | 2 | \n",
" 2 | \n",
" 260 | \n",
" 5 | \n",
"
\n",
" \n",
" | 3 | \n",
" 2 | \n",
" 9296 | \n",
" 5 | \n",
"
\n",
" \n",
" | 4 | \n",
" 2 | \n",
" 2318 | \n",
" 3 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" user_id book_id rating\n",
"0 1 258 5\n",
"1 2 4081 4\n",
"2 2 260 5\n",
"3 2 9296 5\n",
"4 2 2318 3"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# ratings_df is already loaded by the data-loading cell near the top of the\n",
"# notebook and is not modified in between, so the ~6M-row CSV is not read a\n",
"# second time here.\n",
"ratings_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"execution": {
"iopub.execute_input": "2025-07-04T14:25:15.301368Z",
"iopub.status.busy": "2025-07-04T14:25:15.301072Z",
"iopub.status.idle": "2025-07-04T14:25:15.308909Z",
"shell.execute_reply": "2025-07-04T14:25:15.308252Z",
"shell.execute_reply.started": "2025-07-04T14:25:15.301346Z"
},
"trusted": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"RangeIndex: 5976479 entries, 0 to 5976478\n",
"Data columns (total 3 columns):\n",
" # Column Dtype\n",
"--- ------ -----\n",
" 0 user_id int64\n",
" 1 book_id int64\n",
" 2 rating int64\n",
"dtypes: int64(3)\n",
"memory usage: 136.8 MB\n"
]
}
],
"source": [
"ratings_df.info()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"execution": {
"iopub.execute_input": "2025-07-04T14:25:17.566224Z",
"iopub.status.busy": "2025-07-04T14:25:17.565964Z",
"iopub.status.idle": "2025-07-04T14:25:17.977667Z",
"shell.execute_reply": "2025-07-04T14:25:17.977046Z",
"shell.execute_reply.started": "2025-07-04T14:25:17.566206Z"
},
"trusted": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" tag_id | \n",
" tag_name | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 0 | \n",
" - | \n",
"
\n",
" \n",
" | 1 | \n",
" 1 | \n",
" --1- | \n",
"
\n",
" \n",
" | 2 | \n",
" 2 | \n",
" --10- | \n",
"
\n",
" \n",
" | 3 | \n",
" 3 | \n",
" --12- | \n",
"
\n",
" \n",
" | 4 | \n",
" 4 | \n",
" --122- | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" tag_id tag_name\n",
"0 0 -\n",
"1 1 --1-\n",
"2 2 --10-\n",
"3 3 --12-\n",
"4 4 --122-"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# tags_df and book_tags_df are already loaded by the data-loading cell near the\n",
"# top of the notebook and are unchanged since, so they are not re-read here.\n",
"tags_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"execution": {
"iopub.execute_input": "2025-07-04T14:25:20.177478Z",
"iopub.status.busy": "2025-07-04T14:25:20.176912Z",
"iopub.status.idle": "2025-07-04T14:25:20.184824Z",
"shell.execute_reply": "2025-07-04T14:25:20.184151Z",
"shell.execute_reply.started": "2025-07-04T14:25:20.177458Z"
},
"trusted": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" goodreads_book_id | \n",
" tag_id | \n",
" count | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 1 | \n",
" 30574 | \n",
" 167697 | \n",
"
\n",
" \n",
" | 1 | \n",
" 1 | \n",
" 11305 | \n",
" 37174 | \n",
"
\n",
" \n",
" | 2 | \n",
" 1 | \n",
" 11557 | \n",
" 34173 | \n",
"
\n",
" \n",
" | 3 | \n",
" 1 | \n",
" 8717 | \n",
" 12986 | \n",
"
\n",
" \n",
" | 4 | \n",
" 1 | \n",
" 33114 | \n",
" 12716 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" goodreads_book_id tag_id count\n",
"0 1 30574 167697\n",
"1 1 11305 37174\n",
"2 1 11557 34173\n",
"3 1 8717 12986\n",
"4 1 33114 12716"
]
},
"execution_count": 10,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"book_tags_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"execution": {
"iopub.execute_input": "2025-07-06T13:03:20.872786Z",
"iopub.status.busy": "2025-07-06T13:03:20.872009Z",
"iopub.status.idle": "2025-07-06T13:03:21.007798Z",
"shell.execute_reply": "2025-07-06T13:03:21.007232Z",
"shell.execute_reply.started": "2025-07-06T13:03:20.872760Z"
},
"trusted": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"(999912, 4)\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" goodreads_book_id | \n",
" tag_id | \n",
" count | \n",
" tag_name | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 1 | \n",
" 30574 | \n",
" 167697 | \n",
" to-read | \n",
"
\n",
" \n",
" | 1 | \n",
" 1 | \n",
" 11305 | \n",
" 37174 | \n",
" fantasy | \n",
"
\n",
" \n",
" | 2 | \n",
" 1 | \n",
" 11557 | \n",
" 34173 | \n",
" favorites | \n",
"
\n",
" \n",
" | 3 | \n",
" 1 | \n",
" 8717 | \n",
" 12986 | \n",
" currently-reading | \n",
"
\n",
" \n",
" | 4 | \n",
" 1 | \n",
" 33114 | \n",
" 12716 | \n",
" young-adult | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" goodreads_book_id tag_id count tag_name\n",
"0 1 30574 167697 to-read\n",
"1 1 11305 37174 fantasy\n",
"2 1 11557 34173 favorites\n",
"3 1 8717 12986 currently-reading\n",
"4 1 33114 12716 young-adult"
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Attach tags_df's columns (the tag name) to each book_tags_df row; the left\n",
"# join keeps every book_tags_df row even when its tag_id has no match.\n",
"full_tags_df = book_tags_df.merge(tags_df, how=\"left\", on=\"tag_id\")\n",
"print(full_tags_df.shape)\n",
"full_tags_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"execution": {
"iopub.execute_input": "2025-07-06T13:04:47.656148Z",
"iopub.status.busy": "2025-07-06T13:04:47.655886Z",
"iopub.status.idle": "2025-07-06T13:04:48.053850Z",
"shell.execute_reply": "2025-07-06T13:04:48.053070Z",
"shell.execute_reply.started": "2025-07-06T13:04:47.656128Z"
},
"trusted": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Length: 999912\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" book_id | \n",
" goodreads_book_id | \n",
" best_book_id | \n",
" work_id | \n",
" books_count | \n",
" isbn | \n",
" isbn13 | \n",
" authors | \n",
" original_publication_year | \n",
" original_title | \n",
" ... | \n",
" ratings_1 | \n",
" ratings_2 | \n",
" ratings_3 | \n",
" ratings_4 | \n",
" ratings_5 | \n",
" image_url | \n",
" small_image_url | \n",
" tag_id | \n",
" count | \n",
" tag_name | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 1 | \n",
" 2767052 | \n",
" 2767052 | \n",
" 2792775 | \n",
" 272 | \n",
" 439023483 | \n",
" 9.780439e+12 | \n",
" Suzanne Collins | \n",
" 2008.0 | \n",
" The Hunger Games | \n",
" ... | \n",
" 66715 | \n",
" 127936 | \n",
" 560092 | \n",
" 1481305 | \n",
" 2706317 | \n",
" https://images.gr-assets.com/books/1447303603m... | \n",
" https://images.gr-assets.com/books/1447303603s... | \n",
" 11557 | \n",
" 50755 | \n",
" favorites | \n",
"
\n",
" \n",
" | 1 | \n",
" 1 | \n",
" 2767052 | \n",
" 2767052 | \n",
" 2792775 | \n",
" 272 | \n",
" 439023483 | \n",
" 9.780439e+12 | \n",
" Suzanne Collins | \n",
" 2008.0 | \n",
" The Hunger Games | \n",
" ... | \n",
" 66715 | \n",
" 127936 | \n",
" 560092 | \n",
" 1481305 | \n",
" 2706317 | \n",
" https://images.gr-assets.com/books/1447303603m... | \n",
" https://images.gr-assets.com/books/1447303603s... | \n",
" 8717 | \n",
" 35418 | \n",
" currently-reading | \n",
"
\n",
" \n",
" | 2 | \n",
" 1 | \n",
" 2767052 | \n",
" 2767052 | \n",
" 2792775 | \n",
" 272 | \n",
" 439023483 | \n",
" 9.780439e+12 | \n",
" Suzanne Collins | \n",
" 2008.0 | \n",
" The Hunger Games | \n",
" ... | \n",
" 66715 | \n",
" 127936 | \n",
" 560092 | \n",
" 1481305 | \n",
" 2706317 | \n",
" https://images.gr-assets.com/books/1447303603m... | \n",
" https://images.gr-assets.com/books/1447303603s... | \n",
" 33114 | \n",
" 25968 | \n",
" young-adult | \n",
"
\n",
" \n",
" | 3 | \n",
" 1 | \n",
" 2767052 | \n",
" 2767052 | \n",
" 2792775 | \n",
" 272 | \n",
" 439023483 | \n",
" 9.780439e+12 | \n",
" Suzanne Collins | \n",
" 2008.0 | \n",
" The Hunger Games | \n",
" ... | \n",
" 66715 | \n",
" 127936 | \n",
" 560092 | \n",
" 1481305 | \n",
" 2706317 | \n",
" https://images.gr-assets.com/books/1447303603m... | \n",
" https://images.gr-assets.com/books/1447303603s... | \n",
" 11743 | \n",
" 13819 | \n",
" fiction | \n",
"
\n",
" \n",
" | 4 | \n",
" 1 | \n",
" 2767052 | \n",
" 2767052 | \n",
" 2792775 | \n",
" 272 | \n",
" 439023483 | \n",
" 9.780439e+12 | \n",
" Suzanne Collins | \n",
" 2008.0 | \n",
" The Hunger Games | \n",
" ... | \n",
" 66715 | \n",
" 127936 | \n",
" 560092 | \n",
" 1481305 | \n",
" 2706317 | \n",
" https://images.gr-assets.com/books/1447303603m... | \n",
" https://images.gr-assets.com/books/1447303603s... | \n",
" 10064 | \n",
" 12985 | \n",
" dystopian | \n",
"
\n",
" \n",
"
\n",
"
5 rows × 26 columns
\n",
"
"
],
"text/plain": [
" book_id goodreads_book_id best_book_id work_id books_count isbn \\\n",
"0 1 2767052 2767052 2792775 272 439023483 \n",
"1 1 2767052 2767052 2792775 272 439023483 \n",
"2 1 2767052 2767052 2792775 272 439023483 \n",
"3 1 2767052 2767052 2792775 272 439023483 \n",
"4 1 2767052 2767052 2792775 272 439023483 \n",
"\n",
" isbn13 authors original_publication_year original_title \\\n",
"0 9.780439e+12 Suzanne Collins 2008.0 The Hunger Games \n",
"1 9.780439e+12 Suzanne Collins 2008.0 The Hunger Games \n",
"2 9.780439e+12 Suzanne Collins 2008.0 The Hunger Games \n",
"3 9.780439e+12 Suzanne Collins 2008.0 The Hunger Games \n",
"4 9.780439e+12 Suzanne Collins 2008.0 The Hunger Games \n",
"\n",
" ... ratings_1 ratings_2 ratings_3 ratings_4 ratings_5 \\\n",
"0 ... 66715 127936 560092 1481305 2706317 \n",
"1 ... 66715 127936 560092 1481305 2706317 \n",
"2 ... 66715 127936 560092 1481305 2706317 \n",
"3 ... 66715 127936 560092 1481305 2706317 \n",
"4 ... 66715 127936 560092 1481305 2706317 \n",
"\n",
" image_url \\\n",
"0 https://images.gr-assets.com/books/1447303603m... \n",
"1 https://images.gr-assets.com/books/1447303603m... \n",
"2 https://images.gr-assets.com/books/1447303603m... \n",
"3 https://images.gr-assets.com/books/1447303603m... \n",
"4 https://images.gr-assets.com/books/1447303603m... \n",
"\n",
" small_image_url tag_id count \\\n",
"0 https://images.gr-assets.com/books/1447303603s... 11557 50755 \n",
"1 https://images.gr-assets.com/books/1447303603s... 8717 35418 \n",
"2 https://images.gr-assets.com/books/1447303603s... 33114 25968 \n",
"3 https://images.gr-assets.com/books/1447303603s... 11743 13819 \n",
"4 https://images.gr-assets.com/books/1447303603s... 10064 12985 \n",
"\n",
" tag_name \n",
"0 favorites \n",
"1 currently-reading \n",
"2 young-adult \n",
"3 fiction \n",
"4 dystopian \n",
"\n",
"[5 rows x 26 columns]"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Left-join the tag rows onto the books using goodreads_book_id as the key\n",
"# (books_df.book_id is a different identifier and is not used for the join).\n",
"merged_df = books_df.merge(full_tags_df, how=\"left\", on=\"goodreads_book_id\")\n",
"\n",
"# Report the row count and preview the first rows.\n",
"print(f\"Length: {len(merged_df)}\")\n",
"merged_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"execution": {
"iopub.execute_input": "2025-07-06T13:04:52.553677Z",
"iopub.status.busy": "2025-07-06T13:04:52.553010Z",
"iopub.status.idle": "2025-07-06T13:04:54.677884Z",
"shell.execute_reply": "2025-07-06T13:04:54.677226Z",
"shell.execute_reply.started": "2025-07-06T13:04:52.553652Z"
},
"trusted": true
},
"outputs": [
{
"data": {
"text/plain": [
"book_id 0\n",
"goodreads_book_id 0\n",
"best_book_id 0\n",
"work_id 0\n",
"books_count 0\n",
"isbn 0\n",
"isbn13 0\n",
"authors 0\n",
"original_publication_year 0\n",
"original_title 0\n",
"title 0\n",
"language_code 0\n",
"average_rating 0\n",
"ratings_count 0\n",
"work_ratings_count 0\n",
"work_text_reviews_count 0\n",
"ratings_1 0\n",
"ratings_2 0\n",
"ratings_3 0\n",
"ratings_4 0\n",
"ratings_5 0\n",
"image_url 0\n",
"small_image_url 0\n",
"tag_id 0\n",
"count 0\n",
"tag_name 0\n",
"dtype: int64"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Replace missing values: empty string in object (text) columns, 0 in\n",
"# int64/float64 columns.\n",
"# NOTE(review): this also turns missing original_publication_year values into\n",
"# 0 - confirm that downstream code treats 0 as \"unknown\".\n",
"text_columns = merged_df.select_dtypes(\"O\").columns\n",
"merged_df.loc[:, text_columns] = merged_df[text_columns].fillna(\"\")\n",
"\n",
"numeric_columns = merged_df.select_dtypes([\"int64\", \"float64\"]).columns\n",
"merged_df.loc[:, numeric_columns] = merged_df[numeric_columns].fillna(0)\n",
"\n",
"# Null count per column, to check what is still unfilled.\n",
"merged_df.isnull().sum()"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"execution": {
"iopub.execute_input": "2025-07-06T13:04:56.877226Z",
"iopub.status.busy": "2025-07-06T13:04:56.876612Z",
"iopub.status.idle": "2025-07-06T13:04:56.881105Z",
"shell.execute_reply": "2025-07-06T13:04:56.880425Z",
"shell.execute_reply.started": "2025-07-06T13:04:56.877204Z"
},
"trusted": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Data Features:\n",
" Index(['book_id', 'goodreads_book_id', 'best_book_id', 'work_id',\n",
" 'books_count', 'isbn', 'isbn13', 'authors', 'original_publication_year',\n",
" 'original_title', 'title', 'language_code', 'average_rating',\n",
" 'ratings_count', 'work_ratings_count', 'work_text_reviews_count',\n",
" 'ratings_1', 'ratings_2', 'ratings_3', 'ratings_4', 'ratings_5',\n",
" 'image_url', 'small_image_url', 'tag_id', 'count', 'tag_name'],\n",
" dtype='object')\n"
]
}
],
"source": [
"print(f\"Data Features:\\n {merged_df.columns}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"execution": {
"iopub.execute_input": "2025-07-06T13:04:59.979980Z",
"iopub.status.busy": "2025-07-06T13:04:59.979445Z",
"iopub.status.idle": "2025-07-06T13:05:00.026275Z",
"shell.execute_reply": "2025-07-06T13:05:00.025659Z",
"shell.execute_reply.started": "2025-07-06T13:04:59.979957Z"
},
"trusted": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" book_id | \n",
" goodreads_book_id | \n",
" title | \n",
" authors | \n",
" language_code | \n",
" original_publication_year | \n",
" average_rating | \n",
" tag_name | \n",
" ratings_count | \n",
"
\n",
" \n",
" \n",
" \n",
" | 0 | \n",
" 1 | \n",
" 2767052 | \n",
" The Hunger Games (The Hunger Games, #1) | \n",
" Suzanne Collins | \n",
" eng | \n",
" 2008.0 | \n",
" 4.34 | \n",
" favorites | \n",
" 4780653 | \n",
"
\n",
" \n",
" | 1 | \n",
" 1 | \n",
" 2767052 | \n",
" The Hunger Games (The Hunger Games, #1) | \n",
" Suzanne Collins | \n",
" eng | \n",
" 2008.0 | \n",
" 4.34 | \n",
" currently-reading | \n",
" 4780653 | \n",
"
\n",
" \n",
" | 2 | \n",
" 1 | \n",
" 2767052 | \n",
" The Hunger Games (The Hunger Games, #1) | \n",
" Suzanne Collins | \n",
" eng | \n",
" 2008.0 | \n",
" 4.34 | \n",
" young-adult | \n",
" 4780653 | \n",
"
\n",
" \n",
" | 3 | \n",
" 1 | \n",
" 2767052 | \n",
" The Hunger Games (The Hunger Games, #1) | \n",
" Suzanne Collins | \n",
" eng | \n",
" 2008.0 | \n",
" 4.34 | \n",
" fiction | \n",
" 4780653 | \n",
"
\n",
" \n",
" | 4 | \n",
" 1 | \n",
" 2767052 | \n",
" The Hunger Games (The Hunger Games, #1) | \n",
" Suzanne Collins | \n",
" eng | \n",
" 2008.0 | \n",
" 4.34 | \n",
" dystopian | \n",
" 4780653 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" book_id goodreads_book_id title \\\n",
"0 1 2767052 The Hunger Games (The Hunger Games, #1) \n",
"1 1 2767052 The Hunger Games (The Hunger Games, #1) \n",
"2 1 2767052 The Hunger Games (The Hunger Games, #1) \n",
"3 1 2767052 The Hunger Games (The Hunger Games, #1) \n",
"4 1 2767052 The Hunger Games (The Hunger Games, #1) \n",
"\n",
" authors language_code original_publication_year average_rating \\\n",
"0 Suzanne Collins eng 2008.0 4.34 \n",
"1 Suzanne Collins eng 2008.0 4.34 \n",
"2 Suzanne Collins eng 2008.0 4.34 \n",
"3 Suzanne Collins eng 2008.0 4.34 \n",
"4 Suzanne Collins eng 2008.0 4.34 \n",
"\n",
" tag_name ratings_count \n",
"0 favorites 4780653 \n",
"1 currently-reading 4780653 \n",
"2 young-adult 4780653 \n",
"3 fiction 4780653 \n",
"4 dystopian 4780653 "
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Keep needed columns\n",
"feature_df = merged_df[['book_id', 'goodreads_book_id', 'title', 'authors', 'language_code', 'original_publication_year', 'average_rating', 'tag_name', 'ratings_count', 'image_url']]\n",
"feature_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {
"execution": {
"iopub.execute_input": "2025-07-06T13:05:02.575821Z",
"iopub.status.busy": "2025-07-06T13:05:02.575143Z",
"iopub.status.idle": "2025-07-06T13:05:02.707131Z",
"shell.execute_reply": "2025-07-06T13:05:02.706515Z",
"shell.execute_reply.started": "2025-07-06T13:05:02.575796Z"
},
"trusted": true
},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" book_id | \n",
" goodreads_book_id | \n",
" original_publication_year | \n",
" average_rating | \n",
" ratings_count | \n",
"
\n",
" \n",
" \n",
" \n",
" | count | \n",
" 999912.000000 | \n",
" 9.999120e+05 | \n",
" 999912.000000 | \n",
" 999912.000000 | \n",
" 9.999120e+05 | \n",
"
\n",
" \n",
" | mean | \n",
" 5000.470973 | \n",
" 5.263442e+06 | \n",
" 1916.446687 | \n",
" 4.002216 | \n",
" 5.400521e+04 | \n",
"
\n",
" \n",
" | std | \n",
" 2886.743854 | \n",
" 7.574057e+06 | \n",
" 394.769683 | \n",
" 0.254408 | \n",
" 1.573685e+05 | \n",
"
\n",
" \n",
" | min | \n",
" 1.000000 | \n",
" 1.000000e+00 | \n",
" 0.000000 | \n",
" 2.470000 | \n",
" 2.716000e+03 | \n",
"
\n",
" \n",
" | 25% | \n",
" 2501.000000 | \n",
" 4.622700e+04 | \n",
" 1990.000000 | \n",
" 3.850000 | \n",
" 1.356900e+04 | \n",
"
\n",
" \n",
" | 50% | \n",
" 5000.000000 | \n",
" 3.948410e+05 | \n",
" 2004.000000 | \n",
" 4.020000 | \n",
" 2.115700e+04 | \n",
"
\n",
" \n",
" | 75% | \n",
" 7500.000000 | \n",
" 9.378297e+06 | \n",
" 2011.000000 | \n",
" 4.180000 | \n",
" 4.105800e+04 | \n",
"
\n",
" \n",
" | max | \n",
" 10000.000000 | \n",
" 3.328864e+07 | \n",
" 2017.000000 | \n",
" 4.820000 | \n",
" 4.780653e+06 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" book_id goodreads_book_id original_publication_year \\\n",
"count 999912.000000 9.999120e+05 999912.000000 \n",
"mean 5000.470973 5.263442e+06 1916.446687 \n",
"std 2886.743854 7.574057e+06 394.769683 \n",
"min 1.000000 1.000000e+00 0.000000 \n",
"25% 2501.000000 4.622700e+04 1990.000000 \n",
"50% 5000.000000 3.948410e+05 2004.000000 \n",
"75% 7500.000000 9.378297e+06 2011.000000 \n",
"max 10000.000000 3.328864e+07 2017.000000 \n",
"\n",
" average_rating ratings_count \n",
"count 999912.000000 9.999120e+05 \n",
"mean 4.002216 5.400521e+04 \n",
"std 0.254408 1.573685e+05 \n",
"min 2.470000 2.716000e+03 \n",
"25% 3.850000 1.356900e+04 \n",
"50% 4.020000 2.115700e+04 \n",
"75% 4.180000 4.105800e+04 \n",
"max 4.820000 4.780653e+06 "
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"feature_df.describe()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {
"execution": {
"iopub.execute_input": "2025-07-06T13:05:05.103333Z",
"iopub.status.busy": "2025-07-06T13:05:05.103109Z",
"iopub.status.idle": "2025-07-06T13:05:05.296121Z",
"shell.execute_reply": "2025-07-06T13:05:05.295513Z",
"shell.execute_reply.started": "2025-07-06T13:05:05.103318Z"
},
"trusted": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"\n",
"RangeIndex: 999912 entries, 0 to 999911\n",
"Data columns (total 9 columns):\n",
" # Column Non-Null Count Dtype \n",
"--- ------ -------------- ----- \n",
" 0 book_id 999912 non-null int64 \n",
" 1 goodreads_book_id 999912 non-null int64 \n",
" 2 title 999912 non-null object \n",
" 3 authors 999912 non-null object \n",
" 4 language_code 999912 non-null object \n",
" 5 original_publication_year 999912 non-null float64\n",
" 6 average_rating 999912 non-null float64\n",
" 7 tag_name 999912 non-null object \n",
" 8 ratings_count 999912 non-null int64 \n",
"dtypes: float64(2), int64(3), object(4)\n",
"memory usage: 68.7+ MB\n"
]
}
],
"source": [
"feature_df.info()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### **SVD Model**"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {
"execution": {
"iopub.execute_input": "2025-07-06T13:07:23.464417Z",
"iopub.status.busy": "2025-07-06T13:07:23.463928Z",
"iopub.status.idle": "2025-07-06T13:20:35.689464Z",
"shell.execute_reply": "2025-07-06T13:20:35.688711Z",
"shell.execute_reply.started": "2025-07-06T13:07:23.464393Z"
},
"trusted": true
},
"outputs": [
{
"data": {
"text/plain": [
""
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"from surprise import Dataset, Reader, SVDpp\n",
"from surprise.model_selection import train_test_split\n",
"from surprise import accuracy\n",
"\n",
"# Prepare data\n",
"reader = Reader(rating_scale=(1, 5))\n",
"data = Dataset.load_from_df(ratings_df[['user_id', 'book_id', 'rating']], reader)\n",
"\n",
"trainset, testset = train_test_split(data, test_size=0.2, random_state=42)\n",
"\n",
"# Train SVD++\n",
"svdpp = SVDpp()\n",
"svdpp.fit(trainset)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {
"execution": {
"iopub.execute_input": "2025-07-06T13:26:09.026137Z",
"iopub.status.busy": "2025-07-06T13:26:09.025198Z",
"iopub.status.idle": "2025-07-06T13:28:27.249993Z",
"shell.execute_reply": "2025-07-06T13:28:27.249406Z",
"shell.execute_reply.started": "2025-07-06T13:26:09.026109Z"
},
"trusted": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"RMSE: 0.8182\n",
"RMSE (SVD++): 0.8181912821450508\n",
"MAE: 0.6286\n",
"MAE (SVD++): 0.6286450039163989\n"
]
}
],
"source": [
"# Predict\n",
"predictions = svdpp.test(testset)\n",
"\n",
"# Evaluate collaborative-only\n",
"print(\"RMSE (SVD++):\", accuracy.rmse(predictions))\n",
"print(\"MAE (SVD++):\", accuracy.mae(predictions))"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {
"execution": {
"iopub.execute_input": "2025-07-06T13:32:00.895881Z",
"iopub.status.busy": "2025-07-06T13:32:00.895597Z",
"iopub.status.idle": "2025-07-06T13:32:04.625174Z",
"shell.execute_reply": "2025-07-06T13:32:04.624591Z",
"shell.execute_reply.started": "2025-07-06T13:32:00.895859Z"
},
"trusted": true
},
"outputs": [],
"source": [
"# Prepare content features for books\n",
"from sklearn.feature_extraction.text import TfidfVectorizer\n",
"from sklearn.preprocessing import OneHotEncoder, StandardScaler\n",
"from scipy.sparse import hstack\n",
"\n",
"# Filter feature_df to books in ratings\n",
"rated_book_ids = feature_df[feature_df['book_id'].isin(ratings_df['book_id'].unique())]\n",
"\n",
"# TF-IDF for tags\n",
"tfidf = TfidfVectorizer(max_features=500)\n",
"tags_tfidf = tfidf.fit_transform(rated_book_ids['tag_name'])\n",
"\n",
"# One-hot encode language\n",
"ohe = OneHotEncoder()\n",
"lang_ohe = ohe.fit_transform(rated_book_ids[['language_code']])\n",
"\n",
"# Scale numeric features\n",
"scaler = StandardScaler()\n",
"num_features = scaler.fit_transform(rated_book_ids[['average_rating', 'ratings_count']])\n",
"\n",
"# Combine\n",
"content_features = hstack([tags_tfidf, lang_ohe, num_features]).tocsr()"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {
"execution": {
"iopub.execute_input": "2025-07-06T13:32:09.030197Z",
"iopub.status.busy": "2025-07-06T13:32:09.029842Z",
"iopub.status.idle": "2025-07-06T13:32:09.134783Z",
"shell.execute_reply": "2025-07-06T13:32:09.134079Z",
"shell.execute_reply.started": "2025-07-06T13:32:09.030176Z"
},
"trusted": true
},
"outputs": [],
"source": [
"from sklearn.metrics.pairwise import cosine_similarity\n",
"\n",
"def hybrid_predict(user_id, book_id, alpha=0.7):\n",
" # Collaborative prediction\n",
" pred = svdpp.predict(user_id, book_id).est\n",
"\n",
" # Content similarity: mean similarity to all books user rated highly\n",
" user_books = ratings_df[ratings_df['user_id'] == user_id]\n",
" high_rated_books = user_books[user_books['rating'] >= 4]['book_id'].values\n",
"\n",
" if len(high_rated_books) == 0:\n",
" content_score = 0\n",
" else:\n",
" # Get indices\n",
" idx_target = rated_book_ids[rated_book_ids['book_id'] == book_id].index[0]\n",
" idx_high = [rated_book_ids[rated_book_ids['book_id'] == b].index[0] for b in high_rated_books if b in rated_book_ids['book_id'].values]\n",
"\n",
" # Cosine similarity between target book and user's liked books\n",
" similarities = cosine_similarity(content_features[idx_target], content_features[idx_high])\n",
" content_score = np.mean(similarities)\n",
"\n",
" # Combine\n",
" final_score = alpha * pred + (1 - alpha) * content_score\n",
" return final_score"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {
"execution": {
"iopub.execute_input": "2025-07-06T13:32:17.607773Z",
"iopub.status.busy": "2025-07-06T13:32:17.607096Z",
"iopub.status.idle": "2025-07-06T13:36:09.281862Z",
"shell.execute_reply": "2025-07-06T13:36:09.281187Z",
"shell.execute_reply.started": "2025-07-06T13:32:17.607749Z"
},
"trusted": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"RMSE: 1.3263\n",
"RMSE (Hybrid): 1.3263211074229224\n",
"MAE: 1.1605\n",
"MAE (Hybrid): 1.1605183074935617\n"
]
}
],
"source": [
"from surprise import Prediction\n",
"\n",
"# Evaluate RMSE for hybrid\n",
"hybrid_preds = []\n",
"for uid, iid, true_r in testset[:2000]:\n",
" est = hybrid_predict(uid, iid)\n",
" hybrid_preds.append((uid, iid, true_r, est, None))\n",
"\n",
"hybrid_preds_surprise = [Prediction(uid, iid, true_r, est, None) for uid, iid, true_r, est, _ in hybrid_preds]\n",
"\n",
"print(\"RMSE (Hybrid):\", accuracy.rmse(hybrid_preds_surprise))\n",
"print(\"MAE (Hybrid):\", accuracy.mae(hybrid_preds_surprise))"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"execution": {
"execution_failed": "2025-07-06T12:43:38.385Z",
"iopub.execute_input": "2025-07-06T12:40:56.563459Z",
"iopub.status.busy": "2025-07-06T12:40:56.562823Z"
},
"trusted": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Precomputing SVD predictions...\n",
"Computing cosine similarity matrix for books...\n"
]
}
],
"source": [
"from sklearn.metrics.pairwise import cosine_similarity\n",
"from surprise import Prediction, accuracy\n",
"import numpy as np\n",
"\n",
"# --- Step 1: Precompute all SVD predictions (vectorized) ---\n",
"print(\"Precomputing SVD predictions...\")\n",
"svd_preds = {}\n",
"for uid, iid, _ in testset:\n",
" svd_preds[(uid, iid)] = svdpp.predict(uid, iid).est\n",
"\n",
"# --- Step 2: Precompute cosine similarity matrix between all books ---\n",
"print(\"Computing cosine similarity matrix for books...\")\n",
"# Make sure content_features is dense if it's sparse\n",
"if hasattr(content_features, \"toarray\"):\n",
" content_features_dense = content_features.toarray()\n",
"else:\n",
" content_features_dense = content_features\n",
"\n",
"book_sim_matrix = cosine_similarity(content_features_dense)\n",
"\n",
"# --- Step 3: Precompute user liked books ---\n",
"print(\"Building user liked books map...\")\n",
"# Filter for books rated >= 4 as \"liked\"\n",
"liked_books_by_user = ratings_df[ratings_df['rating'] >= 4].groupby('user_id')['book_id'].apply(list).to_dict()\n",
"\n",
"# --- Step 4: Hybrid prediction function ---\n",
"def hybrid_predict_vectorized(uid, iid, alpha=0.5):\n",
" \"\"\"\n",
" Combines SVD and content-based similarity into a single prediction.\n",
" \"\"\"\n",
" # SVD prediction\n",
" svd_score = svd_preds.get((uid, iid), 3.0) # Default to 3.0 if missing\n",
"\n",
" # Content-based score\n",
" liked_books = liked_books_by_user.get(uid, [])\n",
" if not liked_books:\n",
" content_score = 3.0 # Fallback if user has no liked books\n",
" else:\n",
" # Get similarity of target book to all liked books\n",
" try:\n",
" similarities = book_sim_matrix[iid, liked_books]\n",
" content_score = np.mean(similarities)\n",
" except IndexError:\n",
" content_score = 3.0 # Fallback if book index out of bounds\n",
"\n",
" # Combine\n",
" hybrid_score = alpha * svd_score + (1 - alpha) * content_score\n",
" return hybrid_score\n",
"\n",
"# --- Step 5: Make predictions for full test set ---\n",
"print(\"Predicting for full test set...\")\n",
"hybrid_preds = []\n",
"for uid, iid, true_r in testset:\n",
" est = hybrid_predict_vectorized(uid, iid)\n",
" hybrid_preds.append(Prediction(uid, iid, true_r, est, None))\n",
"\n",
"# --- Step 6: Evaluate ---\n",
"print(\"Evaluating hybrid model...\")\n",
"print(\"RMSE (Hybrid):\", accuracy.rmse(hybrid_preds))\n",
"print(\"MAE (Hybrid):\", accuracy.mae(hybrid_preds))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"execution": {
"iopub.execute_input": "2025-07-06T13:41:20.931127Z",
"iopub.status.busy": "2025-07-06T13:41:20.930864Z",
"iopub.status.idle": "2025-07-06T13:41:26.436330Z",
"shell.execute_reply": "2025-07-06T13:41:26.435594Z",
"shell.execute_reply.started": "2025-07-06T13:41:20.931107Z"
},
"trusted": true
},
"outputs": [],
"source": [
"import pickle\n",
"\n",
"# Save SVD++ model\n",
"with open(\"svd_model.pkl\", \"wb\") as f:\n",
" pickle.dump(svdpp, f)\n",
"\n",
"# Save content features matrix\n",
"with open(\"content_features.pkl\", \"wb\") as f:\n",
" pickle.dump(content_features, f)\n",
"\n",
"# Save content features matrix\n",
"with open(\"mappings.pkl\", \"wb\") as f:\n",
" pickle.dump(mappings, f)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## **Deployement API**"
]
},
{
"cell_type": "code",
"execution_count": 20,
"metadata": {
"execution": {
"iopub.execute_input": "2025-07-06T13:46:33.571168Z",
"iopub.status.busy": "2025-07-06T13:46:33.570692Z",
"iopub.status.idle": "2025-07-06T13:46:33.576635Z",
"shell.execute_reply": "2025-07-06T13:46:33.575946Z",
"shell.execute_reply.started": "2025-07-06T13:46:33.571145Z"
},
"trusted": true
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Writing main.py\n"
]
}
],
"source": [
"%%writefile main.py\n",
"from fastapi import FastAPI, HTTPException\n",
"from pydantic import BaseModel\n",
"import pickle\n",
"import numpy as np\n",
"from surprise import dump\n",
"\n",
"# Load models\n",
"svdpp_model = pickle.load(open(\"svdpp_model.pkl\", \"rb\"))\n",
"content_features = pickle.load(open(\"content_features.pkl\", \"rb\"))\n",
"book_metadata = pickle.load(open(\"book_metadata.pkl\", \"rb\"))\n",
"\n",
"# API app\n",
"app = FastAPI(title=\"Book Recommender API\")\n",
"\n",
"# Input schema\n",
"class RatingRequest(BaseModel):\n",
" user_id: int\n",
" book_id: int\n",
" alpha: float = 0.7 # Default weight for SVD++ in hybrid\n",
"\n",
"class RecommendRequest(BaseModel):\n",
" user_id: int\n",
" top_n: int = 10\n",
" alpha: float = 0.7\n",
"\n",
"# Hybrid predict function\n",
"def hybrid_predict(user_id, book_id, alpha=0.7):\n",
" try:\n",
" # SVD++ prediction\n",
" svdpp_pred = svdpp_model.predict(user_id, book_id).est\n",
" except:\n",
" svdpp_pred = 3.0 # Fallback if user/book unknown\n",
" \n",
" try:\n",
" # Content-based prediction\n",
" idx_target = book_metadata.index.get_loc(book_id)\n",
" user_books = ratings_df[ratings_df['user_id'] == user_id]['book_id'].values\n",
" user_books_idx = [book_metadata.index.get_loc(bid) for bid in user_books if bid in book_metadata.index]\n",
" if user_books_idx:\n",
" from sklearn.metrics.pairwise import cosine_similarity\n",
" similarities = cosine_similarity(content_features[idx_target], content_features[user_books_idx])\n",
" content_score = np.mean(similarities)\n",
" else:\n",
" content_score = 3.0 # Fallback\n",
" except:\n",
" content_score = 3.0\n",
"\n",
" # Hybrid\n",
" return alpha * svdpp_pred + (1 - alpha) * content_score\n",
"\n",
"# Predict endpoint\n",
"@app.post(\"/predict/\")\n",
"def predict_rating(req: RatingRequest):\n",
" try:\n",
" pred_rating = hybrid_predict(req.user_id, req.book_id, req.alpha)\n",
" return {\"user_id\": req.user_id, \"book_id\": req.book_id, \"predicted_rating\": round(pred_rating, 3)}\n",
" except Exception as e:\n",
" raise HTTPException(status_code=400, detail=str(e))\n",
"\n",
"# Top-N recommendations\n",
"@app.post(\"/recommend/\")\n",
"def recommend_books(req: RecommendRequest):\n",
" try:\n",
" all_book_ids = book_metadata['book_id'].tolist()\n",
" preds = []\n",
" for bid in all_book_ids:\n",
" score = hybrid_predict(req.user_id, bid, req.alpha)\n",
" preds.append((bid, score))\n",
" preds.sort(key=lambda x: x[1], reverse=True)\n",
" top_books = preds[:req.top_n]\n",
" return {\"user_id\": req.user_id, \"recommendations\": top_books}\n",
" except Exception as e:\n",
" raise HTTPException(status_code=400, detail=str(e))\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"execution": {
"iopub.execute_input": "2025-07-06T13:47:04.833117Z",
"iopub.status.busy": "2025-07-06T13:47:04.832572Z"
},
"trusted": true
},
"outputs": [],
"source": [
"import kagglehub\n",
"\n",
"LOCAL_MODEL_DIR = '/kaggle/working/' # Directory where your model files are saved\n",
"MODEL_SLUG = 'book-recommender-svd' # Use the model slug from your Kaggle model page\n",
"VARIATION_SLUG = 'v1' \n",
"\n",
"\n",
"kagglehub.model_upload(\n",
" handle=f\"khalednabawi/{MODEL_SLUG}/keras/{VARIATION_SLUG}\",\n",
" local_model_dir=LOCAL_MODEL_DIR,\n",
" version_notes=\"Upload - Book Recommender Model\"\n",
")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"trusted": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kaggle": {
"accelerator": "nvidiaTeslaT4",
"dataSources": [
{
"datasetId": 2505178,
"sourceId": 4251459,
"sourceType": "datasetVersion"
},
{
"datasetId": 2505717,
"sourceId": 4252354,
"sourceType": "datasetVersion"
}
],
"dockerImageVersionId": 31041,
"isGpuEnabled": true,
"isInternetEnabled": true,
"language": "python",
"sourceType": "notebook"
},
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.11"
}
},
"nbformat": 4,
"nbformat_minor": 4
}