{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "ac7cbe88-834e-4787-b2d4-63209e60b5f5",
   "metadata": {},
   "source": [
    "# Clean dirty data with stored procedures"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "7e67bbbb-0190-44ed-a556-dd84f3b15385",
   "metadata": {},
   "outputs": [],
   "source": [
    "from data201 import db_connection, df_query\n",
    "\n",
    "conn = db_connection(config_file = 'MissingWeights.ini')\n",
    "cursor = conn.cursor()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "aabb921b-73dc-497b-a306-84d180a191cb",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_query(conn, 'SELECT * FROM dirty_weights')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "49a02cc0-13f3-4c67-82d5-b99b8bc31d6e",
   "metadata": {},
   "source": [
    "## The three procedures\n",
    "#### Perform a linear regression on the ***good*** student and book weight pairs from table `dirty_weights`. Return the slope `good_m` and the y intercept `good_b`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "f0851d25-7868-4ae6-8db9-380033743083",
   "metadata": {},
   "outputs": [],
   "source": [
    "cursor.execute('DROP PROCEDURE IF EXISTS regression_good_weights')\n",
    "\n",
    "cursor.execute(\n",
    "    \"\"\"\n",
    "    CREATE PROCEDURE regression_good_weights(OUT good_m DOUBLE, OUT good_b DOUBLE)\n",
    "    BEGIN\n",
    "        DECLARE n INT;\n",
    "        DECLARE sum_x, sum_y, sum_xx,  sum_xy DOUBLE;\n",
    "        DECLARE mean_x, mean_y DOUBLE;\n",
    "        \n",
    "        DECLARE numerator   DOUBLE;\n",
    "        DECLARE denominator DOUBLE;\n",
    "    \n",
    "        SELECT COUNT(*), SUM(student), SUM(books),\n",
    "               SUM(student*student), SUM(student*books),\n",
    "               AVG(student), AVG(books)           \n",
    "        INTO n, sum_x, sum_y, sum_xx, sum_xy, mean_x, mean_y\n",
    "        FROM dirty_weights\n",
    "        WHERE student > 0  -- only good student weights\n",
    "          AND books > 0;   -- only good book weights\n",
    "        \n",
    "        SET numerator   = sum_xy - (sum_x*sum_y)/n;\n",
    "        SET denominator = sum_xx - (sum_x*sum_x)/n;\n",
    "        \n",
    "        SET good_m = numerator/denominator;\n",
    "        SET good_b = mean_y - good_m*mean_x;\n",
    "    END\n",
    "    \"\"\"\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "530c239f-6ad1-4076-bc41-f982a46eb67a",
   "metadata": {},
   "source": [
    "#### Create table `cleaned_weights` from table `dirty_weights` by replacing missing student and book weights with regression estimates calculated with `good_m` and `good_b`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "771e933c-28b6-43d5-9997-438382ce1d5c",
   "metadata": {},
   "outputs": [],
   "source": [
    "cursor.execute('DROP PROCEDURE IF EXISTS create_cleaned_weights')\n",
    "\n",
    "cursor.execute(\n",
    "    \"\"\"\n",
    "    CREATE PROCEDURE create_cleaned_weights(IN good_m DOUBLE, IN good_b DOUBLE)\n",
    "    BEGIN\n",
    "    \tDROP TABLE IF EXISTS cleaned_weights;\n",
    "    \tCREATE TABLE cleaned_weights LIKE dirty_weights;\n",
    "        \n",
    "        INSERT INTO cleaned_weights\n",
    "    \tSELECT IF (dw.student < 0, \n",
    "    \t\t\t\t  (dw.books - @good_b)/@good_m,  -- replace missing book weight\n",
    "    \t\t\t\t  dw.student) AS student,\n",
    "    \t\t   IF (dw.books < 0,\n",
    "    \t\t\t\t  @good_m*dw.student + @good_b,  -- replace missing student weight\n",
    "    \t\t\t\t  dw.books) AS books\n",
    "    \tFROM dirty_weights dw;\n",
    "    END\n",
    "    \"\"\"\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "ffd229d8-5e08-4267-86d9-f8f9becfb4c4",
   "metadata": {},
   "source": [
    "#### Perform a linear regression on the ***cleaned*** student and book weight pairs from table `cleaned_weights`. Return the slope `cleaned_m` and the y intercept `cleaned_b`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ec453be6-d143-474c-aa12-87ba59e90a7c",
   "metadata": {},
   "outputs": [],
   "source": [
    "cursor.execute('DROP PROCEDURE IF EXISTS regression_cleaned_weights')\n",
    "\n",
    "cursor.execute(\n",
    "    \"\"\"\n",
    "    CREATE PROCEDURE regression_cleaned_weights(OUT cleaned_m DOUBLE, OUT cleaned_b DOUBLE)\n",
    "    BEGIN\n",
    "        DECLARE n INT;\n",
    "        DECLARE sum_x, sum_y, sum_xx,  sum_xy DOUBLE;\n",
    "        DECLARE mean_x, mean_y DOUBLE;\n",
    "        \n",
    "        DECLARE numerator   DOUBLE;\n",
    "        DECLARE denominator DOUBLE;\n",
    "    \n",
    "        SELECT COUNT(*), SUM(student), SUM(books),\n",
    "               SUM(student*student), SUM(student*books),\n",
    "               AVG(student), AVG(books)           \n",
    "        INTO n, sum_x, sum_y, sum_xx, sum_xy, mean_x, mean_y\n",
    "        FROM cleaned_weights;\n",
    "        \n",
    "        SET numerator   = sum_xy - (sum_x*sum_y)/n;\n",
    "        SET denominator = sum_xx - (sum_x*sum_x)/n;\n",
    "        \n",
    "        SET cleaned_m = numerator/denominator;\n",
    "        SET cleaned_b = mean_y - cleaned_m*mean_x;\n",
    "    END\n",
    "    \"\"\"\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0602a2c5-c551-4341-ae57-a12b2642851e",
   "metadata": {},
   "source": [
    "## Clean the dirty database table\n",
    "#### Perform a regression on the ***good*** student and book weights from table `dirty_weights`. Receive values `@good_m` and `@good_b` upon return."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "8e1ea8a5-8dde-402c-82ff-8a19d9292008",
   "metadata": {},
   "outputs": [],
   "source": [
    "cursor.execute('CALL regression_good_weights(@good_m, @good_b)')\n",
    "\n",
    "df_query(conn, 'SELECT @good_m, @good_b')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c49383c7-5254-4d29-96fd-1ccf9a005b72",
   "metadata": {},
   "source": [
    "#### Create table `cleaned_weights` from table `dirty_weights`, passing values `@good_m` and `@good_b` received above."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d6c96431-5d0f-4e47-ac6e-20fb7c84c744",
   "metadata": {},
   "outputs": [],
   "source": [
    "cursor.execute('CALL create_cleaned_weights(@good_m, @good_b)')\n",
    "\n",
    "df_query(conn, 'SELECT * FROM cleaned_weights')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b81c6802-e360-4e84-829b-517410d553a6",
   "metadata": {},
   "source": [
    "#### Perform a regression on the ***cleaned*** student and book weights from table `cleaned_weights`. Receive values `@cleaned_m` and `@cleaned_b` upon return."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "d7dbf082-8ebd-45a2-8cfc-c692ffba74f8",
   "metadata": {},
   "outputs": [],
   "source": [
    "cursor.execute('CALL regression_cleaned_weights(@cleaned_m, @cleaned_b)')\n",
    "\n",
    "df_query(conn, 'SELECT @cleaned_m, @cleaned_b')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7e780923-8b92-44ac-bff8-2710b6d39e47",
   "metadata": {},
   "source": [
    "## Calculate estimates\n",
    "#### Perform regression with `@cleaned_m` and `@cleaned_b`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a26c0021-a271-40ec-8701-bb9bbef16657",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_query(conn,\n",
    "    \"\"\"\n",
    "    SELECT FORMAT(@cleaned_m*72  + @cleaned_b, 6) AS est_book_weight_72,\n",
    "           FORMAT(@cleaned_m*108 + @cleaned_b, 6) AS est_book_weight_108,\n",
    "           FORMAT(@cleaned_m*150 + @cleaned_b, 6) AS est_book_weight_150\n",
    "    \"\"\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dfb635e7-ebca-4b7c-a728-c49b1d441038",
   "metadata": {},
   "outputs": [],
   "source": [
    "cursor.close()\n",
    "conn.close()"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}