{
 "cells": [
  {
   "cell_type": "markdown",
   "id": "edc4d861-043c-40d2-871f-4489c175939e",
   "metadata": {},
   "source": [
    "# Compute medians in SQL"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "24d1152f-7651-46c9-8b84-7e8fcdb3bda7",
   "metadata": {},
   "outputs": [],
   "source": [
    "from data201 import db_connection, df_query"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ed9b6835-a7ca-48a5-a56c-112405e9cea7",
   "metadata": {},
   "outputs": [],
   "source": [
    "conn = db_connection(config_file = 'Median.ini')\n",
    "cursor = conn.cursor()"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1b776cc3-d76d-46e4-a2c7-74975d9432e0",
   "metadata": {},
   "source": [
    "## SQL: Odd count of values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "05835f99-3137-4c09-abc2-55dff85a3a1c",
   "metadata": {},
   "outputs": [],
   "source": [
    "cursor.execute('DROP TABLE IF EXISTS test_values')\n",
    "\n",
    "sql = \"\"\"\n",
    "    CREATE TABLE test_values\n",
    "    (\n",
    "        id INT NOT NULL AUTO_INCREMENT,\n",
    "        value INT NOT NULL,\n",
    "        PRIMARY KEY(id)\n",
    "    )\n",
    "    \"\"\"\n",
    "\n",
    "cursor.execute(sql)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "55d35ab6-8a28-4b37-a3b6-b4d1632afdfc",
   "metadata": {},
   "outputs": [],
   "source": [
    "sql = ( \"\"\"\n",
    "        INSERT INTO test_values(value)\n",
    "        VALUES (%s)\n",
    "        \"\"\"\n",
    "      )\n",
    "\n",
    "original_list_odd = [ 15, 6, 20, 13, 8, 2, 6 ]\n",
    "values = list(zip(original_list_odd))\n",
    "\n",
    "cursor.executemany(sql, values)\n",
    "conn.commit()\n",
    "\n",
    "df_query(conn, 'SELECT * FROM test_values')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "edd55eab-bc37-40da-9652-880763be25b7",
   "metadata": {},
   "source": [
    "#### Use window functions to count the number of values and to assign an index to each row after sorting."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "62947a2f-3a1a-4e4a-bd5f-26d15bfff369",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_query(conn,\n",
    "    \"\"\"\n",
    "    SELECT COUNT(*) OVER () AS row_count,\n",
    "           id,\n",
    "           ROW_NUMBER() OVER (ORDER BY value) AS row_index,\n",
    "           value\n",
    "    FROM test_values\n",
    "    \"\"\"\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a3516777-95bb-4bf4-8b7e-4e60fbc9a79c",
   "metadata": {},
   "source": [
    "#### Make the above a CTE. Then the median is the value with the middle row index."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "95b01679-e833-4c41-afd9-f91ce7651381",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_query(conn,\n",
    "    \"\"\"\n",
    "    WITH indexed_rows AS\n",
    "    (\n",
    "    SELECT COUNT(*) OVER () AS row_count,\n",
    "           id,\n",
    "           ROW_NUMBER() OVER (ORDER BY value) AS row_index,\n",
    "           value\n",
    "    FROM test_values\n",
    "    )\n",
    "    SELECT value AS median_value\n",
    "    FROM indexed_rows\n",
    "    WHERE row_index = FLOOR((row_count + 1)/2)\n",
    "    \"\"\"\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "c542f83a-b822-4fab-87cb-1d0cdf49606d",
   "metadata": {},
   "source": [
    "## SQL: Even count of values"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "9e311e1e-b44b-4216-a006-6c9f36c27418",
   "metadata": {},
   "outputs": [],
   "source": [
    "cursor.execute(\n",
    "    \"\"\"\n",
    "    INSERT INTO test_values(value)\n",
    "    VALUE (25)\n",
    "    \"\"\"\n",
    ")\n",
    "\n",
    "conn.commit()\n",
    "df_query(conn, 'SELECT * FROM test_values')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5d7326e2-d7c4-460f-be2c-96d08a708377",
   "metadata": {},
   "source": [
    "#### Use window functions to count the number of values and to assign an index to each row after sorting."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ee5a464e-c2cc-4fa4-bad2-693211e64898",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_query(conn,\n",
    "    \"\"\"\n",
    "    SELECT COUNT(*) OVER () AS row_count,\n",
    "           id,\n",
    "           ROW_NUMBER() OVER (ORDER BY value) AS row_index,\n",
    "           value\n",
    "    FROM test_values\n",
    "    \"\"\"\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f0cad009-7504-481c-81fe-397894152701",
   "metadata": {},
   "source": [
    "#### Make the above a CTE. Then the median is the average of the two value with the two middle row indexes."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5b1624ab-2329-44a5-9177-fb2ae9354f53",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_query(conn,\n",
    "    \"\"\"\n",
    "    WITH indexed_rows AS\n",
    "    (\n",
    "        SELECT COUNT(*) OVER () AS row_count,\n",
    "               id,\n",
    "               ROW_NUMBER() OVER (ORDER BY value) AS row_index,\n",
    "               value\n",
    "        FROM test_values\n",
    "    )\n",
    "    SELECT AVG(value) AS median_value\n",
    "    FROM indexed_rows\n",
    "    WHERE row_index IN (FLOOR((row_count + 1)/2), CEIL((row_count + 1)/2))\n",
    "    \"\"\"\n",
    ")"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "1930dbe2-b9cd-4a5a-892c-24749a95010e",
   "metadata": {},
   "source": [
    "## View to calculate the median"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "824b1b23-a13c-4e35-a065-b41418fdeed9",
   "metadata": {},
   "outputs": [],
   "source": [
    "cursor.execute('DROP VIEW IF EXISTS median_view')\n",
    "\n",
    "cursor.execute(\n",
    "    \"\"\"\n",
    "    CREATE VIEW median_view AS\n",
    "    WITH indexed_rows AS\n",
    "    (\n",
    "        SELECT COUNT(*) OVER () AS row_count,\n",
    "               id,\n",
    "               ROW_NUMBER() OVER (ORDER BY value) AS row_index,\n",
    "               value\n",
    "        FROM test_values\n",
    "    )\n",
    "    SELECT AVG(value) AS median_value\n",
    "    FROM indexed_rows\n",
    "    WHERE row_index IN (FLOOR((row_count + 1)/2), CEIL((row_count + 1)/2))\n",
    "    \"\"\"\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "795fa5a9-826f-4b0e-a005-973bfa8fb18f",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_query(conn, 'SELECT * FROM median_view')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "88127f3d-5ecf-4852-93ed-409f29d4ab8b",
   "metadata": {},
   "source": [
    "## Any number of values\n",
    "#### Will the view also work for an odd number of values?"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "c5af31a8-022b-41ac-8740-ca59b359b433",
   "metadata": {},
   "outputs": [],
   "source": [
    "cursor.execute(\n",
    "    \"\"\"\n",
    "    DELETE FROM test_values\n",
    "    WHERE value = 25\n",
    "    \"\"\"\n",
    ")\n",
    "\n",
    "conn.commit()\n",
    "df_query(conn, 'SELECT * FROM test_values')"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "bd79fddf-9995-4bc1-860b-0951a26a9202",
   "metadata": {},
   "source": [
    "#### Yes, if the number of values is odd, row indexes `FLOOR((row_count + 1)/2)` and `CEIL((row_count + 1)/2)` are equal, and so we'll end up taking the average of two copies of the same median value."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "df8ebdf6-e637-45a4-9f7a-29283ce15438",
   "metadata": {},
   "outputs": [],
   "source": [
    "df_query(conn, 'SELECT * FROM median_view')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "2d63ec93-98a4-4d02-af67-993f901be157",
   "metadata": {},
   "outputs": [],
   "source": [
    "cursor.close()\n",
    "conn.close()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "45dbcb9f-d33e-498f-b601-4a1a6ee03822",
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.4"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
