{ "cells": [ { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "oymwaD_KjPdq", "slideshow": { "slide_type": "slide" } }, "source": [ "[![AnalyticsDojo](https://github.com/rpi-techfundamentals/spring2019-materials/blob/master/fig/final-logo.png?raw=1)](http://introml.analyticsdojo.com)\n", "

Introduction to Python - Pivottable

\n", "

introml.analyticsdojo.com

\n", "\n", "\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# More Pivottables" ] }, { "cell_type": "code", "execution_count": 1, "metadata": { "colab": {}, "colab_type": "code", "id": "GYjmsxq-jgSE" }, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "--2019-09-13 15:30:05-- https://raw.githubusercontent.com/rpi-techfundamentals/spring2019-materials/master/input/train.csv\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.128.133, 151.101.192.133, 151.101.0.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.128.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 61194 (60K) [text/plain]\n", "Saving to: ‘train.csv’\n", "\n", "train.csv 100%[===================>] 59.76K --.-KB/s in 0.05s \n", "\n", "2019-09-13 15:30:05 (1.07 MB/s) - ‘train.csv’ saved [61194/61194]\n", "\n", "--2019-09-13 15:30:05-- https://raw.githubusercontent.com/rpi-techfundamentals/spring2019-materials/master/input/test.csv\n", "Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.192.133, 151.101.0.133, 151.101.64.133, ...\n", "Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.192.133|:443... connected.\n", "HTTP request sent, awaiting response... 200 OK\n", "Length: 28629 (28K) [text/plain]\n", "Saving to: ‘test.csv’\n", "\n", "test.csv 100%[===================>] 27.96K --.-KB/s in 0.03s \n", "\n", "2019-09-13 15:30:05 (1012 KB/s) - ‘test.csv’ saved [28629/28629]\n", "\n" ] } ], "source": [ "!wget https://raw.githubusercontent.com/rpi-techfundamentals/spring2019-materials/master/input/train.csv\n", "!wget https://raw.githubusercontent.com/rpi-techfundamentals/spring2019-materials/master/input/test.csv" ] }, { "cell_type": "code", "execution_count": 3, "metadata": { "colab": {}, "colab_type": "code", "id": "gniYcP9MjPdz" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
PassengerIdSurvivedPclassNameSexAgeSibSpParchTicketFareCabinEmbarked
0103Braund, Mr. Owen Harrismale22.010A/5 211717.2500NaNS
1211Cumings, Mrs. John Bradley (Florence Briggs Th...female38.010PC 1759971.2833C85C
2313Heikkinen, Miss. Lainafemale26.000STON/O2. 31012827.9250NaNS
3411Futrelle, Mrs. Jacques Heath (Lily May Peel)female35.01011380353.1000C123S
4503Allen, Mr. William Henrymale35.0003734508.0500NaNS
.......................................
88688702Montvila, Rev. Juozasmale27.00021153613.0000NaNS
88788811Graham, Miss. Margaret Edithfemale19.00011205330.0000B42S
88888903Johnston, Miss. Catherine Helen \"Carrie\"femaleNaN12W./C. 660723.4500NaNS
88989011Behr, Mr. Karl Howellmale26.00011136930.0000C148C
89089103Dooley, Mr. Patrickmale32.0003703767.7500NaNQ
\n", "

891 rows × 12 columns

\n", "
" ], "text/plain": [ " PassengerId Survived Pclass \\\n", "0 1 0 3 \n", "1 2 1 1 \n", "2 3 1 3 \n", "3 4 1 1 \n", "4 5 0 3 \n", ".. ... ... ... \n", "886 887 0 2 \n", "887 888 1 1 \n", "888 889 0 3 \n", "889 890 1 1 \n", "890 891 0 3 \n", "\n", " Name Sex Age SibSp \\\n", "0 Braund, Mr. Owen Harris male 22.0 1 \n", "1 Cumings, Mrs. John Bradley (Florence Briggs Th... female 38.0 1 \n", "2 Heikkinen, Miss. Laina female 26.0 0 \n", "3 Futrelle, Mrs. Jacques Heath (Lily May Peel) female 35.0 1 \n", "4 Allen, Mr. William Henry male 35.0 0 \n", ".. ... ... ... ... \n", "886 Montvila, Rev. Juozas male 27.0 0 \n", "887 Graham, Miss. Margaret Edith female 19.0 0 \n", "888 Johnston, Miss. Catherine Helen \"Carrie\" female NaN 1 \n", "889 Behr, Mr. Karl Howell male 26.0 0 \n", "890 Dooley, Mr. Patrick male 32.0 0 \n", "\n", " Parch Ticket Fare Cabin Embarked \n", "0 0 A/5 21171 7.2500 NaN S \n", "1 0 PC 17599 71.2833 C85 C \n", "2 0 STON/O2. 3101282 7.9250 NaN S \n", "3 0 113803 53.1000 C123 S \n", "4 0 373450 8.0500 NaN S \n", ".. ... ... ... ... ... \n", "886 0 211536 13.0000 NaN S \n", "887 0 112053 30.0000 B42 S \n", "888 2 W./C. 6607 23.4500 NaN S \n", "889 0 111369 30.0000 C148 C \n", "890 0 370376 7.7500 NaN Q \n", "\n", "[891 rows x 12 columns]" ] }, "execution_count": 3, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import numpy as np \n", "import pandas as pd \n", "\n", "# Input data files are available in the \"../input/\" directory.\n", "# Let's input them into a Pandas DataFrame\n", "train = pd.read_csv(\"train.csv\")\n", "test = pd.read_csv(\"test.csv\")\n", "train" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "leJfxOrWjPee" }, "source": [ "### Pivot Tables\n", "- A pivot table is a data summarization tool.\n", "- It can be used to that sum, sort, averge, count, over a pandas dataframe. \n", "- Download and open data in excel to appreciate the ways that you can use Pivot Tables. " ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "#Load it and create a pivot table.\n", "from google.colab import files\n", "files.download('train.csv')" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "pd.pivot_table?" ] }, { "cell_type": "code", "execution_count": 9, "metadata": { "colab": {}, "colab_type": "code", "id": "fgKWHO4ZjPe4" }, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
countsummean
SurvivedSurvivedSurvived
SexPclass
female194910.968085
276700.921053
3144720.500000
male1122450.368852
2108170.157407
3347470.135447
\n", "
" ], "text/plain": [ " count sum mean\n", " Survived Survived Survived\n", "Sex Pclass \n", "female 1 94 91 0.968085\n", " 2 76 70 0.921053\n", " 3 144 72 0.500000\n", "male 1 122 45 0.368852\n", " 2 108 17 0.157407\n", " 3 347 47 0.135447" ] }, "execution_count": 9, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pd.pivot_table(train,index=[\"Sex\",\"Pclass\"],values=[\"Survived\"],aggfunc=['count','sum','mean',])" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "The above " ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "8eBCKaqbjPe9" }, "outputs": [], "source": [ "#What does this tell us? \n", "train.groupby(['Sex','Pclass']).Survived.mean()" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "nhA-9djqjPfB" }, "outputs": [], "source": [ "#What does this tell us? Here it doesn't look so clear. We could separate by set age ranges.\n", "train.groupby(['Sex','Age']).Survived.mean()" ] }, { "cell_type": "markdown", "metadata": { "colab_type": "text", "id": "XbLK22eHjPfH" }, "source": [ "### Combining Multiple \n", "- *Splitting* the data into groups based on some criteria\n", "- *Applying* a function to each group independently\n", "- *Combining* the results into a data structure" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "rwDPc2iHjPfI" }, "outputs": [], "source": [ "s = train.groupby(['Sex','Pclass'], as_index=False).Survived.sum()\n", "s['PerSurv'] = train.groupby(['Sex','Pclass'], as_index=False).Survived.mean().Survived\n", "s['PerSurv']=s['PerSurv']*100\n", "s['Count'] = train.groupby(['Sex','Pclass'], as_index=False).Survived.count().Survived\n", "survived =s.Survived\n", "s" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "7GNupOCgjPfS" }, "outputs": [], "source": [ "#What does this tell us? \n", "spmean=train.groupby(['Sex','Pclass']).Survived.mean()\n", "spcount=train.groupby(['Sex','Pclass']).Survived.sum()\n", "spsum=train.groupby(['Sex','Pclass']).Survived.count()\n" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "JxXzcP37jPfW" }, "outputs": [], "source": [ "spmean" ] }, { "cell_type": "code", "execution_count": null, "metadata": { "colab": {}, "colab_type": "code", "id": "AsqV4wmTjPfa" }, "outputs": [], "source": [] } ], "metadata": { "colab": { "name": "04-intro-python-groupby.ipynb", "provenance": [], "version": "0.3.2" }, "kernelspec": { "display_name": "Python 3", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.7.6" } }, "nbformat": 4, "nbformat_minor": 4 }