{ "cells": [ { "cell_type": "markdown", "id": "1951683d-1075-43d4-901d-0b4bf94d80c5", "metadata": {}, "source": [ "# ETL a CSV File with Python + SQL" ] }, { "cell_type": "code", "execution_count": null, "id": "4678ac3c-f28a-4ca9-a8b4-a15f1eccf4df", "metadata": {}, "outputs": [], "source": [ "import csv\n", "from DATA225utils import make_connection" ] }, { "cell_type": "code", "execution_count": null, "id": "2089ae36-ede1-4473-a491-a0db47eca0de", "metadata": {}, "outputs": [], "source": [ "conn = make_connection(config_file = 'titanic.ini')\n", "cursor = conn.cursor()" ] }, { "cell_type": "markdown", "id": "d3ed4446-8bdd-402c-a73f-a18603113283", "metadata": {}, "source": [ "### A cleaned table of all the passengers: `all_passengers_1`" ] }, { "cell_type": "code", "execution_count": null, "id": "6afaf34f-29d8-4685-9369-4004f56416f6", "metadata": { "tags": [] }, "outputs": [], "source": [ "cursor.execute('DROP TABLE IF EXISTS all_passengers_1')\n", "\n", "sql = ( \"\"\"\n", " CREATE TABLE all_passengers_1\n", " (\n", " name varchar(32),\n", " survived varchar(3),\n", " sex varchar(6),\n", " age double,\n", " class int,\n", " PRIMARY KEY(name)\n", " )\n", " \"\"\"\n", " )\n", "\n", "cursor.execute(sql);" ] }, { "cell_type": "markdown", "id": "2aedb6d6-4106-4b61-bfb0-e16448b9f18a", "metadata": {}, "source": [ "### A cleaned table of the male passengers who survived: `male_survivors`" ] }, { "cell_type": "code", "execution_count": null, "id": "6d5d9bec-8358-4097-a247-d4bf3fc8065e", "metadata": {}, "outputs": [], "source": [ "cursor.execute('DROP TABLE IF EXISTS male_survivors')\n", "\n", "sql = ( \"\"\"\n", " CREATE TABLE male_survivors\n", " (\n", " name varchar(32),\n", " age double,\n", " class int,\n", " PRIMARY KEY(name)\n", " )\n", " \"\"\"\n", " )\n", "\n", "cursor.execute(sql);" ] }, { "cell_type": "markdown", "id": "ee8f24e3-4282-4657-a940-292883168bff", "metadata": {}, "source": [ "### A function to clean (transform) each row:\n", "- age 'NA' ==> 0\n", "- klass '1st' ==> 1\n", "- klass '2nd' ==> 2\n", "- klass '3rd' ==> 3" ] }, { "cell_type": "code", "execution_count": null, "id": "644a03a5-0ebf-4681-aa5c-8febb8437cd1", "metadata": {}, "outputs": [], "source": [ "def transform(row):\n", " if row[3] == 'NA':\n", " row[3] = 0\n", "\n", " klass = row[4]\n", "\n", " if klass == '1st':\n", " row[4] = 1\n", " elif klass == '2nd':\n", " row[4] = 2\n", " elif klass == '3rd':\n", " row[4] = 3\n", " else:\n", " row[4] = 0" ] }, { "cell_type": "markdown", "id": "720786ef-18a3-49c5-adf5-c982a889432e", "metadata": {}, "source": [ "### SQL to load the two tables." ] }, { "cell_type": "code", "execution_count": null, "id": "53deb924-30b6-4ac7-86f8-f91f76ecfa2a", "metadata": {}, "outputs": [], "source": [ "sql_all = ( \"\"\"\n", " INSERT INTO all_passengers_1\n", " VALUES (%s, %s, %s, %s, %s)\n", " \"\"\"\n", " )\n", "\n", "sql_male = ( \"\"\"\n", " INSERT INTO male_survivors\n", " VALUES (%s, %s, %s)\n", " \"\"\"\n", " )" ] }, { "cell_type": "markdown", "id": "d9a87f4e-cca2-48af-b094-bd85274a70b4", "metadata": {}, "source": [ "### Extract, transform, and load (ETL)." ] }, { "cell_type": "code", "execution_count": null, "id": "924b1854-2a2d-4197-b870-389f2b35a478", "metadata": {}, "outputs": [], "source": [ "first = True\n", "i = 0\n", "\n", "with open('TitanicSurvival.csv', newline='') as csv_file:\n", " data = csv.reader(csv_file, delimiter=',', quotechar='\"')\n", " \n", " for row in data:\n", " if not first:\n", " transform(row)\n", " cursor.execute(sql_all, row)\n", " \n", " if (row[1] == 'yes') and (row[2] == 'male'):\n", " row.remove('yes')\n", " row.remove('male')\n", "\n", " cursor.execute(sql_male, row)\n", " \n", " first = False\n", " \n", "conn.commit()" ] }, { "cell_type": "code", "execution_count": null, "id": "e3409cfd-7fa0-446e-af80-542624709e5f", "metadata": {}, "outputs": [], "source": [ "cursor.close()\n", "conn.close()" ] }, { "cell_type": "code", "execution_count": null, "id": "d0e47d8e-199c-4a02-8645-7e149479ca2c", "metadata": {}, "outputs": [], "source": [ "# Copyright (c) 2023 by Ronald Mak" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.13" } }, "nbformat": 4, "nbformat_minor": 5 }