{ "cells": [ { "cell_type": "markdown", "id": "ac7cbe88-834e-4787-b2d4-63209e60b5f5", "metadata": {}, "source": [ "# Clean dirty data with stored procedures" ] }, { "cell_type": "code", "execution_count": null, "id": "7e67bbbb-0190-44ed-a556-dd84f3b15385", "metadata": {}, "outputs": [], "source": [ "from data201 import db_connection, df_query\n", "\n", "conn = db_connection(config_file = 'MissingWeights.ini')\n", "cursor = conn.cursor()" ] }, { "cell_type": "code", "execution_count": null, "id": "aabb921b-73dc-497b-a306-84d180a191cb", "metadata": {}, "outputs": [], "source": [ "df_query(conn, 'SELECT * FROM dirty_weights')" ] }, { "cell_type": "markdown", "id": "49a02cc0-13f3-4c67-82d5-b99b8bc31d6e", "metadata": {}, "source": [ "## The three procedures\n", "#### Perform a linear regression on the ***good*** student and book weight pairs from table `dirty_weights`. Return the slope `good_m` and the y intercept `good_b`." ] }, { "cell_type": "code", "execution_count": null, "id": "f0851d25-7868-4ae6-8db9-380033743083", "metadata": {}, "outputs": [], "source": [ "cursor.execute('DROP PROCEDURE IF EXISTS regression_good_weights')\n", "\n", "cursor.execute(\n", " \"\"\"\n", " CREATE PROCEDURE regression_good_weights(OUT good_m DOUBLE, OUT good_b DOUBLE)\n", " BEGIN\n", " DECLARE n INT;\n", " DECLARE sum_x, sum_y, sum_xx, sum_xy DOUBLE;\n", " DECLARE mean_x, mean_y DOUBLE;\n", " \n", " DECLARE numerator DOUBLE;\n", " DECLARE denominator DOUBLE;\n", " \n", " SELECT COUNT(*), SUM(student), SUM(books),\n", " SUM(student*student), SUM(student*books),\n", " AVG(student), AVG(books) \n", " INTO n, sum_x, sum_y, sum_xx, sum_xy, mean_x, mean_y\n", " FROM dirty_weights\n", " WHERE student > 0 -- only good student weights\n", " AND books > 0; -- only good book weights\n", " \n", " SET numerator = sum_xy - (sum_x*sum_y)/n;\n", " SET denominator = sum_xx - (sum_x*sum_x)/n;\n", " \n", " SET good_m = numerator/denominator;\n", " SET good_b = mean_y - good_m*mean_x;\n", " END\n", " \"\"\"\n", ")" ] }, { "cell_type": "markdown", "id": "530c239f-6ad1-4076-bc41-f982a46eb67a", "metadata": {}, "source": [ "#### Create table `cleaned_weights` from table `dirty_weights` by replacing missing student and book weights with regression estimates calculated with `good_m` and `good_b`." ] }, { "cell_type": "code", "execution_count": null, "id": "771e933c-28b6-43d5-9997-438382ce1d5c", "metadata": {}, "outputs": [], "source": [ "cursor.execute('DROP PROCEDURE IF EXISTS create_cleaned_weights')\n", "\n", "cursor.execute(\n", " \"\"\"\n", " CREATE PROCEDURE create_cleaned_weights(IN good_m DOUBLE, IN good_b DOUBLE)\n", " BEGIN\n", " \tDROP TABLE IF EXISTS cleaned_weights;\n", " \tCREATE TABLE cleaned_weights LIKE dirty_weights;\n", " \n", " INSERT INTO cleaned_weights\n", " \tSELECT IF (dw.student < 0, \n", " \t\t\t\t (dw.books - @good_b)/@good_m, -- replace missing book weight\n", " \t\t\t\t dw.student) AS student,\n", " \t\t IF (dw.books < 0,\n", " \t\t\t\t @good_m*dw.student + @good_b, -- replace missing student weight\n", " \t\t\t\t dw.books) AS books\n", " \tFROM dirty_weights dw;\n", " END\n", " \"\"\"\n", ")" ] }, { "cell_type": "markdown", "id": "ffd229d8-5e08-4267-86d9-f8f9becfb4c4", "metadata": {}, "source": [ "#### Perform a linear regression on the ***cleaned*** student and book weight pairs from table `cleaned_weights`. Return the slope `cleaned_m` and the y intercept `cleaned_b`." ] }, { "cell_type": "code", "execution_count": null, "id": "ec453be6-d143-474c-aa12-87ba59e90a7c", "metadata": {}, "outputs": [], "source": [ "cursor.execute('DROP PROCEDURE IF EXISTS regression_cleaned_weights')\n", "\n", "cursor.execute(\n", " \"\"\"\n", " CREATE PROCEDURE regression_cleaned_weights(OUT cleaned_m DOUBLE, OUT cleaned_b DOUBLE)\n", " BEGIN\n", " DECLARE n INT;\n", " DECLARE sum_x, sum_y, sum_xx, sum_xy DOUBLE;\n", " DECLARE mean_x, mean_y DOUBLE;\n", " \n", " DECLARE numerator DOUBLE;\n", " DECLARE denominator DOUBLE;\n", " \n", " SELECT COUNT(*), SUM(student), SUM(books),\n", " SUM(student*student), SUM(student*books),\n", " AVG(student), AVG(books) \n", " INTO n, sum_x, sum_y, sum_xx, sum_xy, mean_x, mean_y\n", " FROM cleaned_weights;\n", " \n", " SET numerator = sum_xy - (sum_x*sum_y)/n;\n", " SET denominator = sum_xx - (sum_x*sum_x)/n;\n", " \n", " SET cleaned_m = numerator/denominator;\n", " SET cleaned_b = mean_y - cleaned_m*mean_x;\n", " END\n", " \"\"\"\n", ")" ] }, { "cell_type": "markdown", "id": "0602a2c5-c551-4341-ae57-a12b2642851e", "metadata": {}, "source": [ "## Clean the dirty database table\n", "#### Perform a regression on the ***good*** student and book weights from table `dirty_weights`. Receive values `@good_m` and `@good_b` upon return." ] }, { "cell_type": "code", "execution_count": null, "id": "8e1ea8a5-8dde-402c-82ff-8a19d9292008", "metadata": {}, "outputs": [], "source": [ "cursor.execute('CALL regression_good_weights(@good_m, @good_b)')\n", "\n", "df_query(conn, 'SELECT @good_m, @good_b')" ] }, { "cell_type": "markdown", "id": "c49383c7-5254-4d29-96fd-1ccf9a005b72", "metadata": {}, "source": [ "#### Create table `cleaned_weights` from table `dirty_weights`, passing values `@good_m` and `@good_b` received above." ] }, { "cell_type": "code", "execution_count": null, "id": "d6c96431-5d0f-4e47-ac6e-20fb7c84c744", "metadata": {}, "outputs": [], "source": [ "cursor.execute('CALL create_cleaned_weights(@good_m, @good_b)')\n", "\n", "df_query(conn, 'SELECT * FROM cleaned_weights')" ] }, { "cell_type": "markdown", "id": "b81c6802-e360-4e84-829b-517410d553a6", "metadata": {}, "source": [ "#### Perform a regression on the ***cleaned*** student and book weights from table `cleaned_weights`. Receive values `@cleaned_m` and `@cleaned_b` upon return." ] }, { "cell_type": "code", "execution_count": null, "id": "d7dbf082-8ebd-45a2-8cfc-c692ffba74f8", "metadata": {}, "outputs": [], "source": [ "cursor.execute('CALL regression_cleaned_weights(@cleaned_m, @cleaned_b)')\n", "\n", "df_query(conn, 'SELECT @cleaned_m, @cleaned_b')" ] }, { "cell_type": "markdown", "id": "7e780923-8b92-44ac-bff8-2710b6d39e47", "metadata": {}, "source": [ "## Calculate estimates\n", "#### Perform regression with `@cleaned_m` and `@cleaned_b`." ] }, { "cell_type": "code", "execution_count": null, "id": "a26c0021-a271-40ec-8701-bb9bbef16657", "metadata": {}, "outputs": [], "source": [ "df_query(conn,\n", " \"\"\"\n", " SELECT FORMAT(@cleaned_m*72 + @cleaned_b, 6) AS est_book_weight_72,\n", " FORMAT(@cleaned_m*108 + @cleaned_b, 6) AS est_book_weight_108,\n", " FORMAT(@cleaned_m*150 + @cleaned_b, 6) AS est_book_weight_150\n", " \"\"\"\n", ")" ] }, { "cell_type": "code", "execution_count": null, "id": "dfb635e7-ebca-4b7c-a728-c49b1d441038", "metadata": {}, "outputs": [], "source": [ "cursor.close()\n", "conn.close()" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.4" } }, "nbformat": 4, "nbformat_minor": 5 }