{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load and explore the dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.datasets import load_iris\n",
    "\n",
    "# Load the bundled Iris dataset and print its built-in description text\n",
    "iris = load_iris()\n",
    "print(iris.DESCR)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Inspect the array shapes plus the class and feature names\n",
    "print(f'iris.data.shape = {iris.data.shape}')\n",
    "print(f'iris.target.shape = {iris.target.shape}')\n",
    "print(f'iris.target_names = {iris.target_names}')\n",
    "print(f'iris.feature_names = {iris.feature_names}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create a Pandas dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "# Use the full option name: pandas treats the key as a pattern, and a short\n",
    "# key such as 'max_columns' raises OptionError as soon as it matches more\n",
    "# than one option (newer pandas releases also define styler.* options).\n",
    "pd.set_option('display.max_columns', 5)\n",
    "pd.set_option('display.width', None)\n",
    "\n",
    "iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)\n",
    "\n",
    "# Translate each integer target code into its species name in one\n",
    "# vectorized NumPy indexing step instead of a Python-level loop.\n",
    "iris_df['species'] = iris.target_names[iris.target]\n",
    "\n",
    "iris_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Full option name again; the bare key 'precision' is ambiguous in newer pandas\n",
    "pd.set_option('display.precision', 2)\n",
    "iris_df.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Summary of the categorical column (count, unique, top, freq)\n",
    "iris_df['species'].describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Visualize the dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "import seaborn as sns\n",
    "\n",
    "sns.set(font_scale=1.1)\n",
    "sns.set_style('whitegrid')\n",
    "\n",
    "# Pairwise plots of the first four (measurement) columns, colored by species\n",
    "grid = sns.pairplot(data=iris_df, vars=iris_df.columns[0:4], hue='species')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# Same pairplot without the species coloring\n",
    "grid = sns.pairplot(data=iris_df, vars=iris_df.columns[0:4])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create a k-means estimator and fit the model"
   ]
}, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.cluster import KMeans\n", "\n", "kmeans = KMeans(n_clusters=3, random_state=11) # find three clusters\n", "kmeans.fit(iris.data)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Compare the k-means labels to the Iris dataset’s target values" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(kmeans.labels_[0:50]) # setosa" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(kmeans.labels_[50:100]) # versicolor" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "print(kmeans.labels_[100:150]) # virginica" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Dimensionality reduction with Principal Component Analysis (PCA)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "from sklearn.decomposition import PCA\n", "\n", "pca = PCA(n_components=2, random_state=11) # reduce to two components\n", "pca.fit(iris.data)" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "reduced_iris = pca.transform(iris.data)\n", "reduced_iris.shape" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Visualize the reduced data" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "reduced_iris_df = pd.DataFrame(reduced_iris, \n", " columns=['Component 1', 'Component 2'])\n", "reduced_iris_df['species'] = iris_df.species" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import matplotlib.pyplot as plt\n", "\n", "axes = sns.scatterplot(data=reduced_iris_df, hue='species', legend='brief', \n", " x='Component 1', y='Component 2') \n", "\n", "iris_centers = pca.transform(kmeans.cluster_centers_)\n", "dots = 
plt.scatter(iris_centers[:,0], iris_centers[:,1], s=100, c='k')" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "##########################################################################\n", "# (C) Copyright 2019 by Deitel & Associates, Inc. and #\n", "# Pearson Education, Inc. All Rights Reserved. #\n", "# #\n", "# DISCLAIMER: The authors and publisher of this book have used their #\n", "# best efforts in preparing the book. These efforts include the #\n", "# development, research, and testing of the theories and programs #\n", "# to determine their effectiveness. The authors and publisher make #\n", "# no warranty of any kind, expressed or implied, with regard to these #\n", "# programs or to the documentation contained in these books. The authors #\n", "# and publisher shall not be liable in any event for incidental or #\n", "# consequential damages in connection with, or arising out of, the #\n", "# furnishing, performance, or use of these programs. #\n", "##########################################################################\n" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.7" } }, "nbformat": 4, "nbformat_minor": 4 }