{ "cells": [ { "cell_type": "code", "execution_count": 90, "metadata": {}, "outputs": [], "source": [ "\"\"\"\n", "Name: Anand Panchdari\n", "Roll No.: C013\n", "Aim: Implement K means clustering and analyse the effect of varying the number of clusters.\n", "\"\"\"\n", "\n", "import pandas as pd\n", "import numpy as np\n", "from sklearn.cluster import KMeans\n", "from sklearn.preprocessing import LabelEncoder\n", "from sklearn.preprocessing import MinMaxScaler\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "\n", "df = pd.read_csv(\"datasets/facebook_live_sellers_thailand.csv\")\n", "df.drop(['Column1', 'Column2', 'Column3', 'Column4'], axis=1, inplace=True)" ] }, { "cell_type": "code", "execution_count": 91, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Shape is \n", "(7050, 12)\n", "\n", "Info is \n", "\n", "RangeIndex: 7050 entries, 0 to 7049\n", "Data columns (total 12 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 status_id 7050 non-null object\n", " 1 status_type 7050 non-null object\n", " 2 status_published 7050 non-null object\n", " 3 num_reactions 7050 non-null int64 \n", " 4 num_comments 7050 non-null int64 \n", " 5 num_shares 7050 non-null int64 \n", " 6 num_likes 7050 non-null int64 \n", " 7 num_loves 7050 non-null int64 \n", " 8 num_wows 7050 non-null int64 \n", " 9 num_hahas 7050 non-null int64 \n", " 10 num_sads 7050 non-null int64 \n", " 11 num_angrys 7050 non-null int64 \n", "dtypes: int64(9), object(3)\n", "memory usage: 661.1+ KB\n", "None\n", "\n", "Head is \n" ] }, { "data": { "application/vnd.microsoft.datawrangler.viewer.v0+json": { "columns": [ { "name": "index", "rawType": "int64", "type": "integer" }, { "name": "status_id", "rawType": "object", "type": "string" }, { "name": "status_type", "rawType": "object", "type": "string" }, { "name": "status_published", "rawType": "object", "type": "string" }, { "name": "num_reactions", "rawType": "int64", "type": "integer" }, { "name": "num_comments", "rawType": "int64", "type": "integer" }, { "name": "num_shares", "rawType": "int64", "type": "integer" }, { "name": "num_likes", "rawType": "int64", "type": "integer" }, { "name": "num_loves", "rawType": "int64", "type": "integer" }, { "name": "num_wows", "rawType": "int64", "type": "integer" }, { "name": "num_hahas", "rawType": "int64", "type": "integer" }, { "name": "num_sads", "rawType": "int64", "type": "integer" }, { "name": "num_angrys", "rawType": "int64", "type": "integer" } ], "conversionMethod": "pd.DataFrame", "ref": "7f123535-4c30-4e56-8445-4c8449f02f2c", "rows": [ [ "0", "246675545449582_1649696485147474", "video", "4/22/2018 6:00", "529", "512", "262", "432", "92", "3", "1", "1", "0" ], [ "1", "246675545449582_1649426988507757", "photo", "4/21/2018 22:45", "150", "0", "0", "150", "0", "0", "0", "0", "0" ], [ "2", "246675545449582_1648730588577397", "video", "4/21/2018 6:17", "227", "236", "57", "204", "21", "1", "1", "0", "0" ], [ "3", "246675545449582_1648576705259452", "photo", "4/21/2018 2:29", "111", "0", "0", "111", "0", "0", "0", "0", "0" ], [ "4", "246675545449582_1645700502213739", "photo", "4/18/2018 3:22", "213", "0", "0", "204", "9", "0", "0", "0", "0" ] ], "shape": { "columns": 12, "rows": 5 } }, "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
status_idstatus_typestatus_publishednum_reactionsnum_commentsnum_sharesnum_likesnum_lovesnum_wowsnum_hahasnum_sadsnum_angrys
0246675545449582_1649696485147474video4/22/2018 6:00529512262432923110
1246675545449582_1649426988507757photo4/21/2018 22:451500015000000
2246675545449582_1648730588577397video4/21/2018 6:1722723657204211100
3246675545449582_1648576705259452photo4/21/2018 2:291110011100000
4246675545449582_1645700502213739photo4/18/2018 3:222130020490000
\n", "
" ], "text/plain": [ " status_id status_type status_published \\\n", "0 246675545449582_1649696485147474 video 4/22/2018 6:00 \n", "1 246675545449582_1649426988507757 photo 4/21/2018 22:45 \n", "2 246675545449582_1648730588577397 video 4/21/2018 6:17 \n", "3 246675545449582_1648576705259452 photo 4/21/2018 2:29 \n", "4 246675545449582_1645700502213739 photo 4/18/2018 3:22 \n", "\n", " num_reactions num_comments num_shares num_likes num_loves num_wows \\\n", "0 529 512 262 432 92 3 \n", "1 150 0 0 150 0 0 \n", "2 227 236 57 204 21 1 \n", "3 111 0 0 111 0 0 \n", "4 213 0 0 204 9 0 \n", "\n", " num_hahas num_sads num_angrys \n", "0 1 1 0 \n", "1 0 0 0 \n", "2 1 0 0 \n", "3 0 0 0 \n", "4 0 0 0 " ] }, "execution_count": 91, "metadata": {}, "output_type": "execute_result" } ], "source": [ "print(f\"Shape is \\n{df.shape}\")\n", "print(f\"\\nInfo is \")\n", "print(df.info())\n", "print(f\"\\nHead is \")\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 92, "metadata": {}, "outputs": [ { "data": { "application/vnd.microsoft.datawrangler.viewer.v0+json": { "columns": [ { "name": "index", "rawType": "object", "type": "string" }, { "name": "num_reactions", "rawType": "float64", "type": "float" }, { "name": "num_comments", "rawType": "float64", "type": "float" }, { "name": "num_shares", "rawType": "float64", "type": "float" }, { "name": "num_likes", "rawType": "float64", "type": "float" }, { "name": "num_loves", "rawType": "float64", "type": "float" }, { "name": "num_wows", "rawType": "float64", "type": "float" }, { "name": "num_hahas", "rawType": "float64", "type": "float" }, { "name": "num_sads", "rawType": "float64", "type": "float" }, { "name": "num_angrys", "rawType": "float64", "type": "float" } ], "conversionMethod": "pd.DataFrame", "ref": "33b60859-3895-45b7-a506-f90c76687896", "rows": [ [ "count", "7050.0", "7050.0", "7050.0", "7050.0", "7050.0", "7050.0", "7050.0", "7050.0", "7050.0" ], [ "mean", "230.11716312056737", "224.3560283687943", "40.022553191489365", "215.0431205673759", "12.728652482269503", "1.2893617021276595", "0.6964539007092199", "0.24368794326241136", "0.11319148936170213" ], [ "std", "462.6253091352333", "889.6368195190058", "131.59996549017612", "449.47235705614156", "39.97293010859566", "8.719650380381506", "3.9571834429528265", "1.597155939511341", "0.7268118906561141" ], [ "min", "0.0", "0.0", "0.0", "0.0", "0.0", "0.0", "0.0", "0.0", "0.0" ], [ "25%", "17.0", "0.0", "0.0", "17.0", "0.0", "0.0", "0.0", "0.0", "0.0" ], [ "50%", "59.5", "4.0", "0.0", "58.0", "0.0", "0.0", "0.0", "0.0", "0.0" ], [ "75%", "219.0", "23.0", "4.0", "184.75", "3.0", "0.0", "0.0", "0.0", "0.0" ], [ "max", "4710.0", "20990.0", "3424.0", "4710.0", "657.0", "278.0", "157.0", "51.0", "31.0" ] ], "shape": { "columns": 9, "rows": 8 } }, "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
num_reactionsnum_commentsnum_sharesnum_likesnum_lovesnum_wowsnum_hahasnum_sadsnum_angrys
count7050.0000007050.0000007050.0000007050.0000007050.0000007050.0000007050.0000007050.0000007050.000000
mean230.117163224.35602840.022553215.04312112.7286521.2893620.6964540.2436880.113191
std462.625309889.636820131.599965449.47235739.9729308.7196503.9571831.5971560.726812
min0.0000000.0000000.0000000.0000000.0000000.0000000.0000000.0000000.000000
25%17.0000000.0000000.00000017.0000000.0000000.0000000.0000000.0000000.000000
50%59.5000004.0000000.00000058.0000000.0000000.0000000.0000000.0000000.000000
75%219.00000023.0000004.000000184.7500003.0000000.0000000.0000000.0000000.000000
max4710.00000020990.0000003424.0000004710.000000657.000000278.000000157.00000051.00000031.000000
\n", "
" ], "text/plain": [ " num_reactions num_comments num_shares num_likes num_loves \\\n", "count 7050.000000 7050.000000 7050.000000 7050.000000 7050.000000 \n", "mean 230.117163 224.356028 40.022553 215.043121 12.728652 \n", "std 462.625309 889.636820 131.599965 449.472357 39.972930 \n", "min 0.000000 0.000000 0.000000 0.000000 0.000000 \n", "25% 17.000000 0.000000 0.000000 17.000000 0.000000 \n", "50% 59.500000 4.000000 0.000000 58.000000 0.000000 \n", "75% 219.000000 23.000000 4.000000 184.750000 3.000000 \n", "max 4710.000000 20990.000000 3424.000000 4710.000000 657.000000 \n", "\n", " num_wows num_hahas num_sads num_angrys \n", "count 7050.000000 7050.000000 7050.000000 7050.000000 \n", "mean 1.289362 0.696454 0.243688 0.113191 \n", "std 8.719650 3.957183 1.597156 0.726812 \n", "min 0.000000 0.000000 0.000000 0.000000 \n", "25% 0.000000 0.000000 0.000000 0.000000 \n", "50% 0.000000 0.000000 0.000000 0.000000 \n", "75% 0.000000 0.000000 0.000000 0.000000 \n", "max 278.000000 157.000000 51.000000 31.000000 " ] }, "execution_count": 92, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.describe()" ] }, { "cell_type": "code", "execution_count": 93, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Status id 6997\n", "Status published 6913\n" ] } ], "source": [ "print(f\"Status id {len(df['status_id'].unique())}\")\n", "print(f\"Status published {len(df['status_published'].unique())}\")\n", "df.drop(['status_id', 'status_published'], axis=1, inplace=True)" ] }, { "cell_type": "code", "execution_count": 94, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 7050 entries, 0 to 7049\n", "Data columns (total 10 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 status_type 7050 non-null object\n", " 1 num_reactions 7050 non-null int64 \n", " 2 num_comments 7050 non-null int64 \n", " 3 num_shares 7050 non-null int64 \n", " 4 num_likes 7050 non-null int64 \n", " 5 num_loves 7050 non-null int64 \n", " 6 num_wows 7050 non-null int64 \n", " 7 num_hahas 7050 non-null int64 \n", " 8 num_sads 7050 non-null int64 \n", " 9 num_angrys 7050 non-null int64 \n", "dtypes: int64(9), object(1)\n", "memory usage: 550.9+ KB\n" ] }, { "data": { "application/vnd.microsoft.datawrangler.viewer.v0+json": { "columns": [ { "name": "index", "rawType": "int64", "type": "integer" }, { "name": "status_type", "rawType": "object", "type": "string" }, { "name": "num_reactions", "rawType": "int64", "type": "integer" }, { "name": "num_comments", "rawType": "int64", "type": "integer" }, { "name": "num_shares", "rawType": "int64", "type": "integer" }, { "name": "num_likes", "rawType": "int64", "type": "integer" }, { "name": "num_loves", "rawType": "int64", "type": "integer" }, { "name": "num_wows", "rawType": "int64", "type": "integer" }, { "name": "num_hahas", "rawType": "int64", "type": "integer" }, { "name": "num_sads", "rawType": "int64", "type": "integer" }, { "name": "num_angrys", "rawType": "int64", "type": "integer" } ], "conversionMethod": "pd.DataFrame", "ref": "2a10bfb4-6c9b-482f-aaf9-7a9f3e84fb4a", "rows": [ [ "0", "video", "529", "512", "262", "432", "92", "3", "1", "1", "0" ], [ "1", "photo", "150", "0", "0", "150", "0", "0", "0", "0", "0" ], [ "2", "video", "227", "236", "57", "204", "21", "1", "1", "0", "0" ], [ "3", "photo", "111", "0", "0", "111", "0", "0", "0", "0", "0" ], [ "4", "photo", "213", "0", "0", "204", "9", "0", "0", "0", "0" ] ], "shape": { "columns": 10, "rows": 5 } }, "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
status_typenum_reactionsnum_commentsnum_sharesnum_likesnum_lovesnum_wowsnum_hahasnum_sadsnum_angrys
0video529512262432923110
1photo1500015000000
2video22723657204211100
3photo1110011100000
4photo2130020490000
\n", "
" ], "text/plain": [ " status_type num_reactions num_comments num_shares num_likes num_loves \\\n", "0 video 529 512 262 432 92 \n", "1 photo 150 0 0 150 0 \n", "2 video 227 236 57 204 21 \n", "3 photo 111 0 0 111 0 \n", "4 photo 213 0 0 204 9 \n", "\n", " num_wows num_hahas num_sads num_angrys \n", "0 3 1 1 0 \n", "1 0 0 0 0 \n", "2 1 1 0 0 \n", "3 0 0 0 0 \n", "4 0 0 0 0 " ] }, "execution_count": 94, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df.info()\n", "\n", "df.head()" ] }, { "cell_type": "code", "execution_count": 95, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "\n", "RangeIndex: 7050 entries, 0 to 7049\n", "Data columns (total 10 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", " 0 status_type 7050 non-null object\n", " 1 num_reactions 7050 non-null int64 \n", " 2 num_comments 7050 non-null int64 \n", " 3 num_shares 7050 non-null int64 \n", " 4 num_likes 7050 non-null int64 \n", " 5 num_loves 7050 non-null int64 \n", " 6 num_wows 7050 non-null int64 \n", " 7 num_hahas 7050 non-null int64 \n", " 8 num_sads 7050 non-null int64 \n", " 9 num_angrys 7050 non-null int64 \n", "dtypes: int64(9), object(1)\n", "memory usage: 550.9+ KB\n" ] } ], "source": [ "X = df\n", "y = df['status_type']\n", "X.info()" ] }, { "cell_type": "code", "execution_count": 96, "metadata": {}, "outputs": [ { "data": { "application/vnd.microsoft.datawrangler.viewer.v0+json": { "columns": [ { "name": "index", "rawType": "int64", "type": "integer" }, { "name": "status_type", "rawType": "int64", "type": "integer" }, { "name": "num_reactions", "rawType": "int64", "type": "integer" }, { "name": "num_comments", "rawType": "int64", "type": "integer" }, { "name": "num_shares", "rawType": "int64", "type": "integer" }, { "name": "num_likes", "rawType": "int64", "type": "integer" }, { "name": "num_loves", "rawType": "int64", "type": "integer" }, { "name": "num_wows", "rawType": "int64", "type": "integer" }, { "name": "num_hahas", "rawType": "int64", "type": "integer" }, { "name": "num_sads", "rawType": "int64", "type": "integer" }, { "name": "num_angrys", "rawType": "int64", "type": "integer" } ], "conversionMethod": "pd.DataFrame", "ref": "91580faa-92d9-40f7-8b70-aa5e336ed5a1", "rows": [ [ "0", "3", "529", "512", "262", "432", "92", "3", "1", "1", "0" ], [ "1", "1", "150", "0", "0", "150", "0", "0", "0", "0", "0" ], [ "2", "3", "227", "236", "57", "204", "21", "1", "1", "0", "0" ], [ "3", "1", "111", "0", "0", "111", "0", "0", "0", "0", "0" ], [ "4", "1", "213", "0", "0", "204", "9", "0", "0", "0", "0" ] ], "shape": { "columns": 10, "rows": 5 } }, "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
status_typenum_reactionsnum_commentsnum_sharesnum_likesnum_lovesnum_wowsnum_hahasnum_sadsnum_angrys
03529512262432923110
111500015000000
2322723657204211100
311110011100000
412130020490000
\n", "
" ], "text/plain": [ " status_type num_reactions num_comments num_shares num_likes num_loves \\\n", "0 3 529 512 262 432 92 \n", "1 1 150 0 0 150 0 \n", "2 3 227 236 57 204 21 \n", "3 1 111 0 0 111 0 \n", "4 1 213 0 0 204 9 \n", "\n", " num_wows num_hahas num_sads num_angrys \n", "0 3 1 1 0 \n", "1 0 0 0 0 \n", "2 1 1 0 0 \n", "3 0 0 0 0 \n", "4 0 0 0 0 " ] }, "execution_count": 96, "metadata": {}, "output_type": "execute_result" } ], "source": [ "le = LabelEncoder()\n", "X['status_type'] = le.fit_transform(X['status_type'])\n", "y = le.transform(y)\n", "X.head()" ] }, { "cell_type": "code", "execution_count": 97, "metadata": {}, "outputs": [ { "data": { "application/vnd.microsoft.datawrangler.viewer.v0+json": { "columns": [ { "name": "index", "rawType": "int64", "type": "integer" }, { "name": "('status_type',)", "rawType": "float64", "type": "float" }, { "name": "('num_reactions',)", "rawType": "float64", "type": "float" }, { "name": "('num_comments',)", "rawType": "float64", "type": "float" }, { "name": "('num_shares',)", "rawType": "float64", "type": "float" }, { "name": "('num_likes',)", "rawType": "float64", "type": "float" }, { "name": "('num_loves',)", "rawType": "float64", "type": "float" }, { "name": "('num_wows',)", "rawType": "float64", "type": "float" }, { "name": "('num_hahas',)", "rawType": "float64", "type": "float" }, { "name": "('num_sads',)", "rawType": "float64", "type": "float" }, { "name": "('num_angrys',)", "rawType": "float64", "type": "float" } ], "conversionMethod": "pd.DataFrame", "ref": "bd2a34d2-73a6-44b1-ac3e-9c2c07c39987", "rows": [ [ "0", "1.0", "0.11231422505307856", "0.024392567889471178", "0.07651869158878503", "0.09171974522292994", "0.1400304414003044", "0.01079136690647482", "0.006369426751592357", "0.0196078431372549", "0.0" ], [ "1", "0.3333333333333333", "0.03184713375796178", "0.0", "0.0", "0.03184713375796178", "0.0", "0.0", "0.0", "0.0", "0.0" ], [ "2", "1.0", "0.048195329087048835", "0.01124344926155312", "0.01664719626168224", "0.04331210191082803", "0.0319634703196347", "0.0035971223021582736", "0.006369426751592357", "0.0", "0.0" ], [ "3", "0.3333333333333333", "0.02356687898089172", "0.0", "0.0", "0.02356687898089172", "0.0", "0.0", "0.0", "0.0", "0.0" ], [ "4", "0.3333333333333333", "0.045222929936305736", "0.0", "0.0", "0.04331210191082803", "0.0136986301369863", "0.0", "0.0", "0.0", "0.0" ] ], "shape": { "columns": 10, "rows": 5 } }, "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
status_typenum_reactionsnum_commentsnum_sharesnum_likesnum_lovesnum_wowsnum_hahasnum_sadsnum_angrys
01.0000000.1123140.0243930.0765190.0917200.1400300.0107910.0063690.0196080.0
10.3333330.0318470.0000000.0000000.0318470.0000000.0000000.0000000.0000000.0
21.0000000.0481950.0112430.0166470.0433120.0319630.0035970.0063690.0000000.0
30.3333330.0235670.0000000.0000000.0235670.0000000.0000000.0000000.0000000.0
40.3333330.0452230.0000000.0000000.0433120.0136990.0000000.0000000.0000000.0
\n", "
" ], "text/plain": [ " status_type num_reactions num_comments num_shares num_likes num_loves \\\n", "0 1.000000 0.112314 0.024393 0.076519 0.091720 0.140030 \n", "1 0.333333 0.031847 0.000000 0.000000 0.031847 0.000000 \n", "2 1.000000 0.048195 0.011243 0.016647 0.043312 0.031963 \n", "3 0.333333 0.023567 0.000000 0.000000 0.023567 0.000000 \n", "4 0.333333 0.045223 0.000000 0.000000 0.043312 0.013699 \n", "\n", " num_wows num_hahas num_sads num_angrys \n", "0 0.010791 0.006369 0.019608 0.0 \n", "1 0.000000 0.000000 0.000000 0.0 \n", "2 0.003597 0.006369 0.000000 0.0 \n", "3 0.000000 0.000000 0.000000 0.0 \n", "4 0.000000 0.000000 0.000000 0.0 " ] }, "execution_count": 97, "metadata": {}, "output_type": "execute_result" } ], "source": [ "column = X.columns\n", "minmax = MinMaxScaler()\n", "X = minmax.fit_transform(X)\n", "X = pd.DataFrame(X, columns=[column])\n", "X.head()" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "def doKMEAN(n):\n", " kmean = KMeans(n_clusters=n, random_state=0)\n", " kmean.fit(X)\n", " labels = kmean.labels_\n", " correct_labels = sum(y==labels)\n", " accuracy =correct_labels/float(y.size)\n", " return correct_labels, accuracy" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "For k=1, the results are\n", "63 out of 7050 were correctly labeled\n", "Accuracy score: 0.01\n", "\n", "\n", "For k=2, the results are\n", "4288 out of 7050 were correctly labeled\n", "Accuracy score: 0.61\n", "\n", "\n", "For k=3, the results are\n", "4066 out of 7050 were correctly labeled\n", "Accuracy score: 0.58\n", "\n", "\n", "For k=4, the results are\n", "4112 out of 7050 were correctly labeled\n", "Accuracy score: 0.58\n", "\n", "\n", "For k=5, the results are\n", "4080 out of 7050 were correctly labeled\n", "Accuracy score: 0.58\n", "\n", "\n", "For k=6, the results are\n", "4014 out of 7050 were correctly labeled\n", "Accuracy score: 0.57\n", "\n", "\n", "For k=7, the results are\n", "3965 out of 7050 were correctly labeled\n", "Accuracy score: 0.56\n", "\n", "\n", "For k=8, the results are\n", "4074 out of 7050 were correctly labeled\n", "Accuracy score: 0.58\n", "\n", "\n", "For k=9, the results are\n", "4074 out of 7050 were correctly labeled\n", "Accuracy score: 0.58\n", "\n", "\n", "For k=10, the results are\n", "4009 out of 7050 were correctly labeled\n", "Accuracy score: 0.57\n", "\n", "\n", "The best value of k in range 1 to 10 is 2, with a value of 0.61\n" ] } ], "source": [ "accs=dict()\n", "for i in range(1,11):\n", " print(f\"For k={i}, the results are\")\n", " correct_labels, accuracy = doKMEAN(i)\n", " print(f\"{correct_labels} out of {y.size} were correctly labeled\")\n", " print(f'Accuracy score: {correct_labels/float(y.size):.2f}')\n", " print('\\n')\n", " accs[i]=accuracy\n", "bestk = max(accs, key=accs.get)\n", "print(f\"The best value of k in range 1 to 10 is {bestk}, with a value of {accs[bestk]:.2f}\")" ] } ], "metadata": { "kernelspec": { "display_name": "venv", "language": "python", "name": "venv" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.9" } }, "nbformat": 4, "nbformat_minor": 2 }