{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Load and explore the dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.datasets import load_iris\n",
    "\n",
    "iris = load_iris()\n",
    "print(iris.DESCR)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(f'iris.data.shape    = {iris.data.shape}')\n",
    "print(f'iris.target.shape  = {iris.target.shape}')\n",
    "print(f'iris.target_names  = {iris.target_names}')\n",
    "print(f'iris.feature_names = {iris.feature_names}')"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create a Pandas dataframe"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "\n",
    "pd.set_option('max_columns', 5)\n",
    "pd.set_option('display.width', None)\n",
    "\n",
    "iris_df = pd.DataFrame(iris.data, columns=iris.feature_names)\n",
    "iris_df['species'] = [iris.target_names[i] for i in iris.target]\n",
    "\n",
    "iris_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "pd.set_option('precision', 2)\n",
    "iris_df.describe()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "iris_df['species'].describe()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Visualize the dataset"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "%matplotlib inline\n",
    "import seaborn as sns\n",
    "\n",
    "sns.set(font_scale=1.1)\n",
    "sns.set_style('whitegrid')\n",
    "\n",
    "grid = sns.pairplot(data=iris_df, vars=iris_df.columns[0:4], hue='species')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "grid = sns.pairplot(data=iris_df, vars=iris_df.columns[0:4])"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Create a k-means estimator and fit the model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.cluster import KMeans\n",
    "\n",
    "kmeans = KMeans(n_clusters=3, random_state=11)  # find three clusters\n",
    "kmeans.fit(iris.data)"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Compare the k-means labels to the Iris dataset’s target values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(kmeans.labels_[0:50])  # setosa"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(kmeans.labels_[50:100])  # versicolor"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(kmeans.labels_[100:150])  # virginica"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Dimensionality reduction with Principal Component Analysis (PCA)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.decomposition import PCA\n",
    "\n",
    "pca = PCA(n_components=2, random_state=11)  # reduce to two components\n",
    "pca.fit(iris.data)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "reduced_iris = pca.transform(iris.data)\n",
    "reduced_iris.shape"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "### Visualize the reduced data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "reduced_iris_df = pd.DataFrame(reduced_iris, \n",
    "                               columns=['Component 1', 'Component 2'])\n",
    "reduced_iris_df['species'] = iris_df.species"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt\n",
    "\n",
    "axes = sns.scatterplot(data=reduced_iris_df, hue='species', legend='brief', \n",
    "                       x='Component 1', y='Component 2') \n",
    "\n",
    "iris_centers = pca.transform(kmeans.cluster_centers_)\n",
    "dots = plt.scatter(iris_centers[:,0], iris_centers[:,1], s=100, c='k')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "##########################################################################\n",
    "# (C) Copyright 2019 by Deitel & Associates, Inc. and                    #\n",
    "# Pearson Education, Inc. All Rights Reserved.                           #\n",
    "#                                                                        #\n",
    "# DISCLAIMER: The authors and publisher of this book have used their     #\n",
    "# best efforts in preparing the book. These efforts include the          #\n",
    "# development, research, and testing of the theories and programs        #\n",
    "# to determine their effectiveness. The authors and publisher make       #\n",
    "# no warranty of any kind, expressed or implied, with regard to these    #\n",
    "# programs or to the documentation contained in these books. The authors #\n",
    "# and publisher shall not be liable in any event for incidental or       #\n",
    "# consequential damages in connection with, or arising out of, the       #\n",
    "# furnishing, performance, or use of these programs.                     #\n",
    "##########################################################################\n"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}