diff options
Diffstat (limited to 'MLLab9.ipynb')
| -rw-r--r-- | MLLab9.ipynb | 1689 |
1 files changed, 1689 insertions, 0 deletions
diff --git a/MLLab9.ipynb b/MLLab9.ipynb new file mode 100644 index 0000000..6a63ceb --- /dev/null +++ b/MLLab9.ipynb @@ -0,0 +1,1689 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 90, + "metadata": {}, + "outputs": [], + "source": [ + "\"\"\"\n", + "Name: Anand Panchdari\n", + "Roll No.: C013\n", + "Aim: Implement K means clustering and analyse the effect of varying the number of clusters.\n", + "\"\"\"\n", + "\n", + "import pandas as pd\n", + "import numpy as np\n", + "from sklearn.cluster import KMeans\n", + "from sklearn.preprocessing import LabelEncoder\n", + "from sklearn.preprocessing import MinMaxScaler\n", + "import seaborn as sns\n", + "import matplotlib.pyplot as plt\n", + "\n", + "df = pd.read_csv(\"datasets/facebook_live_sellers_thailand.csv\")\n", + "df.drop(['Column1', 'Column2', 'Column3', 'Column4'], axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 91, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Shape is \n", + "(7050, 12)\n", + "\n", + "Info is \n", + "<class 'pandas.core.frame.DataFrame'>\n", + "RangeIndex: 7050 entries, 0 to 7049\n", + "Data columns (total 12 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 status_id 7050 non-null object\n", + " 1 status_type 7050 non-null object\n", + " 2 status_published 7050 non-null object\n", + " 3 num_reactions 7050 non-null int64 \n", + " 4 num_comments 7050 non-null int64 \n", + " 5 num_shares 7050 non-null int64 \n", + " 6 num_likes 7050 non-null int64 \n", + " 7 num_loves 7050 non-null int64 \n", + " 8 num_wows 7050 non-null int64 \n", + " 9 num_hahas 7050 non-null int64 \n", + " 10 num_sads 7050 non-null int64 \n", + " 11 num_angrys 7050 non-null int64 \n", + "dtypes: int64(9), object(3)\n", + "memory usage: 661.1+ KB\n", + "None\n", + "\n", + "Head is \n" + ] + }, + { + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + 
"columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "status_id", + "rawType": "object", + "type": "string" + }, + { + "name": "status_type", + "rawType": "object", + "type": "string" + }, + { + "name": "status_published", + "rawType": "object", + "type": "string" + }, + { + "name": "num_reactions", + "rawType": "int64", + "type": "integer" + }, + { + "name": "num_comments", + "rawType": "int64", + "type": "integer" + }, + { + "name": "num_shares", + "rawType": "int64", + "type": "integer" + }, + { + "name": "num_likes", + "rawType": "int64", + "type": "integer" + }, + { + "name": "num_loves", + "rawType": "int64", + "type": "integer" + }, + { + "name": "num_wows", + "rawType": "int64", + "type": "integer" + }, + { + "name": "num_hahas", + "rawType": "int64", + "type": "integer" + }, + { + "name": "num_sads", + "rawType": "int64", + "type": "integer" + }, + { + "name": "num_angrys", + "rawType": "int64", + "type": "integer" + } + ], + "conversionMethod": "pd.DataFrame", + "ref": "7f123535-4c30-4e56-8445-4c8449f02f2c", + "rows": [ + [ + "0", + "246675545449582_1649696485147474", + "video", + "4/22/2018 6:00", + "529", + "512", + "262", + "432", + "92", + "3", + "1", + "1", + "0" + ], + [ + "1", + "246675545449582_1649426988507757", + "photo", + "4/21/2018 22:45", + "150", + "0", + "0", + "150", + "0", + "0", + "0", + "0", + "0" + ], + [ + "2", + "246675545449582_1648730588577397", + "video", + "4/21/2018 6:17", + "227", + "236", + "57", + "204", + "21", + "1", + "1", + "0", + "0" + ], + [ + "3", + "246675545449582_1648576705259452", + "photo", + "4/21/2018 2:29", + "111", + "0", + "0", + "111", + "0", + "0", + "0", + "0", + "0" + ], + [ + "4", + "246675545449582_1645700502213739", + "photo", + "4/18/2018 3:22", + "213", + "0", + "0", + "204", + "9", + "0", + "0", + "0", + "0" + ] + ], + "shape": { + "columns": 12, + "rows": 5 + } + }, + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr 
th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>status_id</th>\n", + " <th>status_type</th>\n", + " <th>status_published</th>\n", + " <th>num_reactions</th>\n", + " <th>num_comments</th>\n", + " <th>num_shares</th>\n", + " <th>num_likes</th>\n", + " <th>num_loves</th>\n", + " <th>num_wows</th>\n", + " <th>num_hahas</th>\n", + " <th>num_sads</th>\n", + " <th>num_angrys</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>246675545449582_1649696485147474</td>\n", + " <td>video</td>\n", + " <td>4/22/2018 6:00</td>\n", + " <td>529</td>\n", + " <td>512</td>\n", + " <td>262</td>\n", + " <td>432</td>\n", + " <td>92</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>246675545449582_1649426988507757</td>\n", + " <td>photo</td>\n", + " <td>4/21/2018 22:45</td>\n", + " <td>150</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>150</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>246675545449582_1648730588577397</td>\n", + " <td>video</td>\n", + " <td>4/21/2018 6:17</td>\n", + " <td>227</td>\n", + " <td>236</td>\n", + " <td>57</td>\n", + " <td>204</td>\n", + " <td>21</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>246675545449582_1648576705259452</td>\n", + " <td>photo</td>\n", + " <td>4/21/2018 2:29</td>\n", + " <td>111</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>111</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " 
<td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>246675545449582_1645700502213739</td>\n", + " <td>photo</td>\n", + " <td>4/18/2018 3:22</td>\n", + " <td>213</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>204</td>\n", + " <td>9</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " status_id status_type status_published \\\n", + "0 246675545449582_1649696485147474 video 4/22/2018 6:00 \n", + "1 246675545449582_1649426988507757 photo 4/21/2018 22:45 \n", + "2 246675545449582_1648730588577397 video 4/21/2018 6:17 \n", + "3 246675545449582_1648576705259452 photo 4/21/2018 2:29 \n", + "4 246675545449582_1645700502213739 photo 4/18/2018 3:22 \n", + "\n", + " num_reactions num_comments num_shares num_likes num_loves num_wows \\\n", + "0 529 512 262 432 92 3 \n", + "1 150 0 0 150 0 0 \n", + "2 227 236 57 204 21 1 \n", + "3 111 0 0 111 0 0 \n", + "4 213 0 0 204 9 0 \n", + "\n", + " num_hahas num_sads num_angrys \n", + "0 1 1 0 \n", + "1 0 0 0 \n", + "2 1 0 0 \n", + "3 0 0 0 \n", + "4 0 0 0 " + ] + }, + "execution_count": 91, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "print(f\"Shape is \\n{df.shape}\")\n", + "print(f\"\\nInfo is \")\n", + "print(df.info())\n", + "print(f\"\\nHead is \")\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 92, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "object", + "type": "string" + }, + { + "name": "num_reactions", + "rawType": "float64", + "type": "float" + }, + { + "name": "num_comments", + "rawType": "float64", + "type": "float" + }, + { + "name": "num_shares", + "rawType": "float64", + "type": "float" + }, + { + "name": "num_likes", + "rawType": "float64", + "type": "float" + }, + { + 
"name": "num_loves", + "rawType": "float64", + "type": "float" + }, + { + "name": "num_wows", + "rawType": "float64", + "type": "float" + }, + { + "name": "num_hahas", + "rawType": "float64", + "type": "float" + }, + { + "name": "num_sads", + "rawType": "float64", + "type": "float" + }, + { + "name": "num_angrys", + "rawType": "float64", + "type": "float" + } + ], + "conversionMethod": "pd.DataFrame", + "ref": "33b60859-3895-45b7-a506-f90c76687896", + "rows": [ + [ + "count", + "7050.0", + "7050.0", + "7050.0", + "7050.0", + "7050.0", + "7050.0", + "7050.0", + "7050.0", + "7050.0" + ], + [ + "mean", + "230.11716312056737", + "224.3560283687943", + "40.022553191489365", + "215.0431205673759", + "12.728652482269503", + "1.2893617021276595", + "0.6964539007092199", + "0.24368794326241136", + "0.11319148936170213" + ], + [ + "std", + "462.6253091352333", + "889.6368195190058", + "131.59996549017612", + "449.47235705614156", + "39.97293010859566", + "8.719650380381506", + "3.9571834429528265", + "1.597155939511341", + "0.7268118906561141" + ], + [ + "min", + "0.0", + "0.0", + "0.0", + "0.0", + "0.0", + "0.0", + "0.0", + "0.0", + "0.0" + ], + [ + "25%", + "17.0", + "0.0", + "0.0", + "17.0", + "0.0", + "0.0", + "0.0", + "0.0", + "0.0" + ], + [ + "50%", + "59.5", + "4.0", + "0.0", + "58.0", + "0.0", + "0.0", + "0.0", + "0.0", + "0.0" + ], + [ + "75%", + "219.0", + "23.0", + "4.0", + "184.75", + "3.0", + "0.0", + "0.0", + "0.0", + "0.0" + ], + [ + "max", + "4710.0", + "20990.0", + "3424.0", + "4710.0", + "657.0", + "278.0", + "157.0", + "51.0", + "31.0" + ] + ], + "shape": { + "columns": 9, + "rows": 8 + } + }, + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " 
<thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>num_reactions</th>\n", + " <th>num_comments</th>\n", + " <th>num_shares</th>\n", + " <th>num_likes</th>\n", + " <th>num_loves</th>\n", + " <th>num_wows</th>\n", + " <th>num_hahas</th>\n", + " <th>num_sads</th>\n", + " <th>num_angrys</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>count</th>\n", + " <td>7050.000000</td>\n", + " <td>7050.000000</td>\n", + " <td>7050.000000</td>\n", + " <td>7050.000000</td>\n", + " <td>7050.000000</td>\n", + " <td>7050.000000</td>\n", + " <td>7050.000000</td>\n", + " <td>7050.000000</td>\n", + " <td>7050.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>mean</th>\n", + " <td>230.117163</td>\n", + " <td>224.356028</td>\n", + " <td>40.022553</td>\n", + " <td>215.043121</td>\n", + " <td>12.728652</td>\n", + " <td>1.289362</td>\n", + " <td>0.696454</td>\n", + " <td>0.243688</td>\n", + " <td>0.113191</td>\n", + " </tr>\n", + " <tr>\n", + " <th>std</th>\n", + " <td>462.625309</td>\n", + " <td>889.636820</td>\n", + " <td>131.599965</td>\n", + " <td>449.472357</td>\n", + " <td>39.972930</td>\n", + " <td>8.719650</td>\n", + " <td>3.957183</td>\n", + " <td>1.597156</td>\n", + " <td>0.726812</td>\n", + " </tr>\n", + " <tr>\n", + " <th>min</th>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>25%</th>\n", + " <td>17.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>17.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>50%</th>\n", + " <td>59.500000</td>\n", + " <td>4.000000</td>\n", + " <td>0.000000</td>\n", + " <td>58.000000</td>\n", + " <td>0.000000</td>\n", + " 
<td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>75%</th>\n", + " <td>219.000000</td>\n", + " <td>23.000000</td>\n", + " <td>4.000000</td>\n", + " <td>184.750000</td>\n", + " <td>3.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " </tr>\n", + " <tr>\n", + " <th>max</th>\n", + " <td>4710.000000</td>\n", + " <td>20990.000000</td>\n", + " <td>3424.000000</td>\n", + " <td>4710.000000</td>\n", + " <td>657.000000</td>\n", + " <td>278.000000</td>\n", + " <td>157.000000</td>\n", + " <td>51.000000</td>\n", + " <td>31.000000</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " num_reactions num_comments num_shares num_likes num_loves \\\n", + "count 7050.000000 7050.000000 7050.000000 7050.000000 7050.000000 \n", + "mean 230.117163 224.356028 40.022553 215.043121 12.728652 \n", + "std 462.625309 889.636820 131.599965 449.472357 39.972930 \n", + "min 0.000000 0.000000 0.000000 0.000000 0.000000 \n", + "25% 17.000000 0.000000 0.000000 17.000000 0.000000 \n", + "50% 59.500000 4.000000 0.000000 58.000000 0.000000 \n", + "75% 219.000000 23.000000 4.000000 184.750000 3.000000 \n", + "max 4710.000000 20990.000000 3424.000000 4710.000000 657.000000 \n", + "\n", + " num_wows num_hahas num_sads num_angrys \n", + "count 7050.000000 7050.000000 7050.000000 7050.000000 \n", + "mean 1.289362 0.696454 0.243688 0.113191 \n", + "std 8.719650 3.957183 1.597156 0.726812 \n", + "min 0.000000 0.000000 0.000000 0.000000 \n", + "25% 0.000000 0.000000 0.000000 0.000000 \n", + "50% 0.000000 0.000000 0.000000 0.000000 \n", + "75% 0.000000 0.000000 0.000000 0.000000 \n", + "max 278.000000 157.000000 51.000000 31.000000 " + ] + }, + "execution_count": 92, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 
93, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Status id 6997\n", + "Status published 6913\n" + ] + } + ], + "source": [ + "print(f\"Status id {len(df['status_id'].unique())}\")\n", + "print(f\"Status published {len(df['status_published'].unique())}\")\n", + "df.drop(['status_id', 'status_published'], axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 94, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "<class 'pandas.core.frame.DataFrame'>\n", + "RangeIndex: 7050 entries, 0 to 7049\n", + "Data columns (total 10 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 status_type 7050 non-null object\n", + " 1 num_reactions 7050 non-null int64 \n", + " 2 num_comments 7050 non-null int64 \n", + " 3 num_shares 7050 non-null int64 \n", + " 4 num_likes 7050 non-null int64 \n", + " 5 num_loves 7050 non-null int64 \n", + " 6 num_wows 7050 non-null int64 \n", + " 7 num_hahas 7050 non-null int64 \n", + " 8 num_sads 7050 non-null int64 \n", + " 9 num_angrys 7050 non-null int64 \n", + "dtypes: int64(9), object(1)\n", + "memory usage: 550.9+ KB\n" + ] + }, + { + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "status_type", + "rawType": "object", + "type": "string" + }, + { + "name": "num_reactions", + "rawType": "int64", + "type": "integer" + }, + { + "name": "num_comments", + "rawType": "int64", + "type": "integer" + }, + { + "name": "num_shares", + "rawType": "int64", + "type": "integer" + }, + { + "name": "num_likes", + "rawType": "int64", + "type": "integer" + }, + { + "name": "num_loves", + "rawType": "int64", + "type": "integer" + }, + { + "name": "num_wows", + "rawType": "int64", + "type": "integer" + }, + { + "name": "num_hahas", + "rawType": "int64", + "type": "integer" + 
}, + { + "name": "num_sads", + "rawType": "int64", + "type": "integer" + }, + { + "name": "num_angrys", + "rawType": "int64", + "type": "integer" + } + ], + "conversionMethod": "pd.DataFrame", + "ref": "2a10bfb4-6c9b-482f-aaf9-7a9f3e84fb4a", + "rows": [ + [ + "0", + "video", + "529", + "512", + "262", + "432", + "92", + "3", + "1", + "1", + "0" + ], + [ + "1", + "photo", + "150", + "0", + "0", + "150", + "0", + "0", + "0", + "0", + "0" + ], + [ + "2", + "video", + "227", + "236", + "57", + "204", + "21", + "1", + "1", + "0", + "0" + ], + [ + "3", + "photo", + "111", + "0", + "0", + "111", + "0", + "0", + "0", + "0", + "0" + ], + [ + "4", + "photo", + "213", + "0", + "0", + "204", + "9", + "0", + "0", + "0", + "0" + ] + ], + "shape": { + "columns": 10, + "rows": 5 + } + }, + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>status_type</th>\n", + " <th>num_reactions</th>\n", + " <th>num_comments</th>\n", + " <th>num_shares</th>\n", + " <th>num_likes</th>\n", + " <th>num_loves</th>\n", + " <th>num_wows</th>\n", + " <th>num_hahas</th>\n", + " <th>num_sads</th>\n", + " <th>num_angrys</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>video</td>\n", + " <td>529</td>\n", + " <td>512</td>\n", + " <td>262</td>\n", + " <td>432</td>\n", + " <td>92</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>photo</td>\n", + " <td>150</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>150</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " 
<td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>video</td>\n", + " <td>227</td>\n", + " <td>236</td>\n", + " <td>57</td>\n", + " <td>204</td>\n", + " <td>21</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>photo</td>\n", + " <td>111</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>111</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>photo</td>\n", + " <td>213</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>204</td>\n", + " <td>9</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " status_type num_reactions num_comments num_shares num_likes num_loves \\\n", + "0 video 529 512 262 432 92 \n", + "1 photo 150 0 0 150 0 \n", + "2 video 227 236 57 204 21 \n", + "3 photo 111 0 0 111 0 \n", + "4 photo 213 0 0 204 9 \n", + "\n", + " num_wows num_hahas num_sads num_angrys \n", + "0 3 1 1 0 \n", + "1 0 0 0 0 \n", + "2 1 1 0 0 \n", + "3 0 0 0 0 \n", + "4 0 0 0 0 " + ] + }, + "execution_count": 94, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.info()\n", + "\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 95, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "<class 'pandas.core.frame.DataFrame'>\n", + "RangeIndex: 7050 entries, 0 to 7049\n", + "Data columns (total 10 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 status_type 7050 non-null object\n", + " 1 num_reactions 7050 non-null int64 \n", + " 2 num_comments 7050 non-null int64 \n", + " 3 num_shares 7050 non-null int64 \n", + " 4 num_likes 7050 non-null int64 \n", + " 5 num_loves 7050 non-null int64 
\n", + " 6 num_wows 7050 non-null int64 \n", + " 7 num_hahas 7050 non-null int64 \n", + " 8 num_sads 7050 non-null int64 \n", + " 9 num_angrys 7050 non-null int64 \n", + "dtypes: int64(9), object(1)\n", + "memory usage: 550.9+ KB\n" + ] + } + ], + "source": [ + "X = df\n", + "y = df['status_type']\n", + "X.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 96, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "status_type", + "rawType": "int64", + "type": "integer" + }, + { + "name": "num_reactions", + "rawType": "int64", + "type": "integer" + }, + { + "name": "num_comments", + "rawType": "int64", + "type": "integer" + }, + { + "name": "num_shares", + "rawType": "int64", + "type": "integer" + }, + { + "name": "num_likes", + "rawType": "int64", + "type": "integer" + }, + { + "name": "num_loves", + "rawType": "int64", + "type": "integer" + }, + { + "name": "num_wows", + "rawType": "int64", + "type": "integer" + }, + { + "name": "num_hahas", + "rawType": "int64", + "type": "integer" + }, + { + "name": "num_sads", + "rawType": "int64", + "type": "integer" + }, + { + "name": "num_angrys", + "rawType": "int64", + "type": "integer" + } + ], + "conversionMethod": "pd.DataFrame", + "ref": "91580faa-92d9-40f7-8b70-aa5e336ed5a1", + "rows": [ + [ + "0", + "3", + "529", + "512", + "262", + "432", + "92", + "3", + "1", + "1", + "0" + ], + [ + "1", + "1", + "150", + "0", + "0", + "150", + "0", + "0", + "0", + "0", + "0" + ], + [ + "2", + "3", + "227", + "236", + "57", + "204", + "21", + "1", + "1", + "0", + "0" + ], + [ + "3", + "1", + "111", + "0", + "0", + "111", + "0", + "0", + "0", + "0", + "0" + ], + [ + "4", + "1", + "213", + "0", + "0", + "204", + "9", + "0", + "0", + "0", + "0" + ] + ], + "shape": { + "columns": 10, + "rows": 5 + } + }, + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe 
tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead th {\n", + " text-align: right;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr style=\"text-align: right;\">\n", + " <th></th>\n", + " <th>status_type</th>\n", + " <th>num_reactions</th>\n", + " <th>num_comments</th>\n", + " <th>num_shares</th>\n", + " <th>num_likes</th>\n", + " <th>num_loves</th>\n", + " <th>num_wows</th>\n", + " <th>num_hahas</th>\n", + " <th>num_sads</th>\n", + " <th>num_angrys</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>3</td>\n", + " <td>529</td>\n", + " <td>512</td>\n", + " <td>262</td>\n", + " <td>432</td>\n", + " <td>92</td>\n", + " <td>3</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>1</td>\n", + " <td>150</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>150</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>3</td>\n", + " <td>227</td>\n", + " <td>236</td>\n", + " <td>57</td>\n", + " <td>204</td>\n", + " <td>21</td>\n", + " <td>1</td>\n", + " <td>1</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>1</td>\n", + " <td>111</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>111</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>1</td>\n", + " <td>213</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>204</td>\n", + " <td>9</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " <td>0</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " status_type num_reactions 
num_comments num_shares num_likes num_loves \\\n", + "0 3 529 512 262 432 92 \n", + "1 1 150 0 0 150 0 \n", + "2 3 227 236 57 204 21 \n", + "3 1 111 0 0 111 0 \n", + "4 1 213 0 0 204 9 \n", + "\n", + " num_wows num_hahas num_sads num_angrys \n", + "0 3 1 1 0 \n", + "1 0 0 0 0 \n", + "2 1 1 0 0 \n", + "3 0 0 0 0 \n", + "4 0 0 0 0 " + ] + }, + "execution_count": 96, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "le = LabelEncoder()\n", + "X['status_type'] = le.fit_transform(X['status_type'])\n", + "y = le.transform(y)\n", + "X.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 97, + "metadata": {}, + "outputs": [ + { + "data": { + "application/vnd.microsoft.datawrangler.viewer.v0+json": { + "columns": [ + { + "name": "index", + "rawType": "int64", + "type": "integer" + }, + { + "name": "('status_type',)", + "rawType": "float64", + "type": "float" + }, + { + "name": "('num_reactions',)", + "rawType": "float64", + "type": "float" + }, + { + "name": "('num_comments',)", + "rawType": "float64", + "type": "float" + }, + { + "name": "('num_shares',)", + "rawType": "float64", + "type": "float" + }, + { + "name": "('num_likes',)", + "rawType": "float64", + "type": "float" + }, + { + "name": "('num_loves',)", + "rawType": "float64", + "type": "float" + }, + { + "name": "('num_wows',)", + "rawType": "float64", + "type": "float" + }, + { + "name": "('num_hahas',)", + "rawType": "float64", + "type": "float" + }, + { + "name": "('num_sads',)", + "rawType": "float64", + "type": "float" + }, + { + "name": "('num_angrys',)", + "rawType": "float64", + "type": "float" + } + ], + "conversionMethod": "pd.DataFrame", + "ref": "bd2a34d2-73a6-44b1-ac3e-9c2c07c39987", + "rows": [ + [ + "0", + "1.0", + "0.11231422505307856", + "0.024392567889471178", + "0.07651869158878503", + "0.09171974522292994", + "0.1400304414003044", + "0.01079136690647482", + "0.006369426751592357", + "0.0196078431372549", + "0.0" + ], + [ + "1", + "0.3333333333333333", + 
"0.03184713375796178", + "0.0", + "0.0", + "0.03184713375796178", + "0.0", + "0.0", + "0.0", + "0.0", + "0.0" + ], + [ + "2", + "1.0", + "0.048195329087048835", + "0.01124344926155312", + "0.01664719626168224", + "0.04331210191082803", + "0.0319634703196347", + "0.0035971223021582736", + "0.006369426751592357", + "0.0", + "0.0" + ], + [ + "3", + "0.3333333333333333", + "0.02356687898089172", + "0.0", + "0.0", + "0.02356687898089172", + "0.0", + "0.0", + "0.0", + "0.0", + "0.0" + ], + [ + "4", + "0.3333333333333333", + "0.045222929936305736", + "0.0", + "0.0", + "0.04331210191082803", + "0.0136986301369863", + "0.0", + "0.0", + "0.0", + "0.0" + ] + ], + "shape": { + "columns": 10, + "rows": 5 + } + }, + "text/html": [ + "<div>\n", + "<style scoped>\n", + " .dataframe tbody tr th:only-of-type {\n", + " vertical-align: middle;\n", + " }\n", + "\n", + " .dataframe tbody tr th {\n", + " vertical-align: top;\n", + " }\n", + "\n", + " .dataframe thead tr th {\n", + " text-align: left;\n", + " }\n", + "</style>\n", + "<table border=\"1\" class=\"dataframe\">\n", + " <thead>\n", + " <tr>\n", + " <th></th>\n", + " <th>status_type</th>\n", + " <th>num_reactions</th>\n", + " <th>num_comments</th>\n", + " <th>num_shares</th>\n", + " <th>num_likes</th>\n", + " <th>num_loves</th>\n", + " <th>num_wows</th>\n", + " <th>num_hahas</th>\n", + " <th>num_sads</th>\n", + " <th>num_angrys</th>\n", + " </tr>\n", + " </thead>\n", + " <tbody>\n", + " <tr>\n", + " <th>0</th>\n", + " <td>1.000000</td>\n", + " <td>0.112314</td>\n", + " <td>0.024393</td>\n", + " <td>0.076519</td>\n", + " <td>0.091720</td>\n", + " <td>0.140030</td>\n", + " <td>0.010791</td>\n", + " <td>0.006369</td>\n", + " <td>0.019608</td>\n", + " <td>0.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>1</th>\n", + " <td>0.333333</td>\n", + " <td>0.031847</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.031847</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " 
<td>0.000000</td>\n", + " <td>0.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>2</th>\n", + " <td>1.000000</td>\n", + " <td>0.048195</td>\n", + " <td>0.011243</td>\n", + " <td>0.016647</td>\n", + " <td>0.043312</td>\n", + " <td>0.031963</td>\n", + " <td>0.003597</td>\n", + " <td>0.006369</td>\n", + " <td>0.000000</td>\n", + " <td>0.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>3</th>\n", + " <td>0.333333</td>\n", + " <td>0.023567</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.023567</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.0</td>\n", + " </tr>\n", + " <tr>\n", + " <th>4</th>\n", + " <td>0.333333</td>\n", + " <td>0.045223</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.043312</td>\n", + " <td>0.013699</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.000000</td>\n", + " <td>0.0</td>\n", + " </tr>\n", + " </tbody>\n", + "</table>\n", + "</div>" + ], + "text/plain": [ + " status_type num_reactions num_comments num_shares num_likes num_loves \\\n", + "0 1.000000 0.112314 0.024393 0.076519 0.091720 0.140030 \n", + "1 0.333333 0.031847 0.000000 0.000000 0.031847 0.000000 \n", + "2 1.000000 0.048195 0.011243 0.016647 0.043312 0.031963 \n", + "3 0.333333 0.023567 0.000000 0.000000 0.023567 0.000000 \n", + "4 0.333333 0.045223 0.000000 0.000000 0.043312 0.013699 \n", + "\n", + " num_wows num_hahas num_sads num_angrys \n", + "0 0.010791 0.006369 0.019608 0.0 \n", + "1 0.000000 0.000000 0.000000 0.0 \n", + "2 0.003597 0.006369 0.000000 0.0 \n", + "3 0.000000 0.000000 0.000000 0.0 \n", + "4 0.000000 0.000000 0.000000 0.0 " + ] + }, + "execution_count": 97, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "column = X.columns\n", + "minmax = MinMaxScaler()\n", + "X = minmax.fit_transform(X)\n", + "X = pd.DataFrame(X, columns=[column])\n", + "X.head()" + ] + }, + { + "cell_type": "code", + 
"execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def doKMEAN(n, features=None, targets=None, random_state=0, align_clusters=False):\n",
+ "    \"\"\"Fit K-means with ``n`` clusters and score its labels against known classes.\n",
+ "\n",
+ "    Parameters\n",
+ "    ----------\n",
+ "    n : int\n",
+ "        Number of clusters to fit.\n",
+ "    features : array-like, optional\n",
+ "        Samples to cluster. Defaults to the notebook-level ``X``.\n",
+ "    targets : array-like of non-negative int, optional\n",
+ "        Encoded reference classes. Defaults to the notebook-level ``y``.\n",
+ "    random_state : int, default 0\n",
+ "        Seed forwarded to ``KMeans``.\n",
+ "    align_clusters : bool, default False\n",
+ "        If True, every cluster is relabelled with the most frequent reference\n",
+ "        class among its members before scoring. K-means cluster ids are\n",
+ "        arbitrary, so the default raw comparison only counts samples whose\n",
+ "        cluster id happens to equal their encoded class.\n",
+ "\n",
+ "    Returns\n",
+ "    -------\n",
+ "    tuple of (int, float)\n",
+ "        Number of samples whose (optionally aligned) cluster label equals the\n",
+ "        reference class, and that count divided by the number of samples.\n",
+ "    \"\"\"\n",
+ "    # Fall back to the notebook globals so existing doKMEAN(k) calls keep working.\n",
+ "    features = X if features is None else features\n",
+ "    targets = np.asarray(y if targets is None else targets)\n",
+ "    labels = KMeans(n_clusters=n, random_state=random_state).fit(features).labels_\n",
+ "    if align_clusters:\n",
+ "        aligned = np.empty_like(targets)\n",
+ "        for cluster in np.unique(labels):\n",
+ "            members = labels == cluster\n",
+ "            # Majority vote: the most common reference class inside this cluster.\n",
+ "            aligned[members] = np.bincount(targets[members]).argmax()\n",
+ "        labels = aligned\n",
+ "    # Vectorised count of matches; int() keeps the printed value a plain integer.\n",
+ "    correct_labels = int(np.count_nonzero(targets == labels))\n",
+ "    accuracy = correct_labels / targets.size\n",
+ "    return correct_labels, accuracy"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "For k=1, the results are\n",
+ "63 out of 7050 were correctly labeled\n",
+ "Accuracy score: 0.01\n",
+ "\n",
+ "\n",
+ "For k=2, the results are\n",
+ "4288 out of 7050 were correctly labeled\n",
+ "Accuracy score: 0.61\n",
+ "\n",
+ "\n",
+ "For k=3, the results are\n",
+ "4066 out of 7050 were correctly labeled\n",
+ "Accuracy score: 0.58\n",
+ "\n",
+ "\n",
+ "For k=4, the results are\n",
+ "4112 out of 7050 were correctly labeled\n",
+ "Accuracy score: 0.58\n",
+ "\n",
+ "\n",
+ "For k=5, the results are\n",
+ "4080 out of 7050 were correctly labeled\n",
+ "Accuracy score: 0.58\n",
+ "\n",
+ "\n",
+ "For k=6, the results are\n",
+ "4014 out of 7050 were correctly labeled\n",
+ "Accuracy score: 0.57\n",
+ "\n",
+ "\n",
+ "For k=7, the results are\n",
+ "3965 out of 7050 were correctly labeled\n",
+ "Accuracy score: 0.56\n",
+ "\n",
+ "\n",
+ "For k=8, the results are\n",
+ "4074 out of 7050 were correctly labeled\n",
+ "Accuracy score: 0.58\n",
+ "\n",
+ "\n",
+ "For k=9, the results are\n",
+ "4074 out of 7050 were correctly labeled\n",
+ "Accuracy score: 0.58\n",
+ "\n",
+ "\n",
+ "For k=10, the results are\n",
+ "4009 out of 7050 were correctly labeled\n",
+ "Accuracy score: 0.57\n",
+ "\n",
+ "\n",
+ "The best value of k in range 1 to 10 is 2, with a value of 0.61\n"
+ ]
+ }
+ ],
+ "source": [
+ "accs = {}\n",
+ "for i in range(1, 11):\n",
+ "    print(f\"For k={i}, the results are\")\n",
+ "    correct_labels, accuracy = doKMEAN(i)\n",
+ "    print(f\"{correct_labels} out of {y.size} were correctly labeled\")\n",
+ "    # Reuse the ratio returned by doKMEAN instead of recomputing it here.\n",
+ "    print(f'Accuracy score: {accuracy:.2f}')\n",
+ "    print('\\n')\n",
+ "    accs[i] = accuracy\n",
+ "# max() returns the first key with the highest score, i.e. the smallest k on ties.\n",
+ "bestk = max(accs, key=accs.get)\n",
+ "print(f\"The best value of k in range 1 to 10 is {bestk}, with a value of {accs[bestk]:.2f}\")"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "venv",
+ "language": "python",
+ "name": "venv"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.12.9"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+} |
