diff --git a/labs/2019/lab6_feature_generation_sol.ipynb b/labs/2019/lab6_feature_generation_sol.ipynb new file mode 100644 index 0000000..1c0b981 --- /dev/null +++ b/labs/2019/lab6_feature_generation_sol.ipynb @@ -0,0 +1,1088 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Lab 6: Feature Generation\n", + "\n", + "\n", + "In this lab we'll get some hands-on experience with generating features.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Goals for this lab\n", + "\n", + "- Understand the types of features that are usually used in ML projects\n", + "- Practice generating those features\n", + "- Explore the effect of different types of features on model performance\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "import numpy as np\n", + "import pandas as pd\n", + "from sklearn.ensemble import BaggingClassifier,AdaBoostClassifier\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn.metrics import accuracy_score as accuracy\n", + "from sklearn.metrics import roc_curve, auc\n", + "import graphviz # If you don't have this, install via pip/conda\n", + "from sklearn.metrics import f1_score, precision_recall_curve\n", + "from sklearn.linear_model import LogisticRegression\n", + "%matplotlib inline\n", + "\n", + "# exercise: what additional modules should you import?" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data\n", + "We'll use the data from donorschoose that we used in Assignment 3." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Change this to wherever you're storing your data\n", + "datafile = \"../data/projects_2012_2013.csv\"\n", + "# parse_dates converts both date columns to datetimes at load time\n", + "df = pd.read_csv(datafile, parse_dates=['date_posted', 'datefullyfunded'])" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
projectidteacher_acctidschoolidschool_ncesidschool_latitudeschool_longitudeschool_cityschool_stateschool_metroschool_district...secondary_focus_subjectsecondary_focus_arearesource_typepoverty_levelgrade_leveltotal_price_including_optional_supportstudents_reachedeligible_double_your_impact_matchdate_posteddatefullyfunded
000001ccc0e81598c4bd86bacb94d7acb96963218e74e10c3764a5cfb153e6fea9f3f9f2c2da7edda5648ccd10554ed8c1.709930e+1141.807654-87.673257ChicagoILurbanPershing Elem Network...Visual ArtsMusic & The ArtsSupplieshighest povertyGrades PreK-21498.6131.0f2013-04-142013-05-02
10000fa3aa8f6649abab23615b546016d2a578595fe351e7fce057e048c409b183432ed3d4466fac2f2ead83ab354e3336.409801e+1034.296596-119.296596VenturaCAurbanVentura Unif School District...Literature & WritingLiteracy & LanguageBookshighest povertyGrades 3-5282.4728.0t2012-04-072012-04-18
2000134f07d4b30140d63262c871748ff26bd60377bdbffb53a644a16c5308e82dc8dcb501c3b2bb0b10e9c6ee2cd8afd6.227100e+1034.078625-118.257834Los AngelesCAurbanLos Angeles Unif Sch Dist...Social SciencesHistory & CivicsTechnologyhigh povertyGrades 3-51012.3856.0f2012-01-302012-04-15
30001f2d0b3827bba67cdbeaa248b832d15d900805d9d716c051c671827109f458bea7e8c6e4279fca6276128db89292e3.600090e+1140.687286-73.988217BrooklynNYurbanNew York City Dept Of Ed...NaNNaNBookshigh povertyGrades PreK-2175.3323.0f2012-10-112012-12-05
40004536db996ba697ca72c9e058bfe69400f8b82bb0143f6a40b217a517fe311fbdefab6fe41e12c55886c610c1107533.606870e+1140.793018-73.205635Central IslipNYsuburbanCentral Islip Union Free SD...Literature & WritingLiteracy & LanguageTechnologyhigh povertyGrades PreK-23591.11150.0f2013-01-082013-03-25
\n", + "

5 rows × 26 columns

\n", + "
" + ], + "text/plain": [ + " projectid teacher_acctid \\\n", + "0 00001ccc0e81598c4bd86bacb94d7acb 96963218e74e10c3764a5cfb153e6fea \n", + "1 0000fa3aa8f6649abab23615b546016d 2a578595fe351e7fce057e048c409b18 \n", + "2 000134f07d4b30140d63262c871748ff 26bd60377bdbffb53a644a16c5308e82 \n", + "3 0001f2d0b3827bba67cdbeaa248b832d 15d900805d9d716c051c671827109f45 \n", + "4 0004536db996ba697ca72c9e058bfe69 400f8b82bb0143f6a40b217a517fe311 \n", + "\n", + " schoolid school_ncesid school_latitude \\\n", + "0 9f3f9f2c2da7edda5648ccd10554ed8c 1.709930e+11 41.807654 \n", + "1 3432ed3d4466fac2f2ead83ab354e333 6.409801e+10 34.296596 \n", + "2 dc8dcb501c3b2bb0b10e9c6ee2cd8afd 6.227100e+10 34.078625 \n", + "3 8bea7e8c6e4279fca6276128db89292e 3.600090e+11 40.687286 \n", + "4 fbdefab6fe41e12c55886c610c110753 3.606870e+11 40.793018 \n", + "\n", + " school_longitude school_city school_state school_metro \\\n", + "0 -87.673257 Chicago IL urban \n", + "1 -119.296596 Ventura CA urban \n", + "2 -118.257834 Los Angeles CA urban \n", + "3 -73.988217 Brooklyn NY urban \n", + "4 -73.205635 Central Islip NY suburban \n", + "\n", + " school_district ... secondary_focus_subject \\\n", + "0 Pershing Elem Network ... Visual Arts \n", + "1 Ventura Unif School District ... Literature & Writing \n", + "2 Los Angeles Unif Sch Dist ... Social Sciences \n", + "3 New York City Dept Of Ed ... NaN \n", + "4 Central Islip Union Free SD ... 
Literature & Writing \n", + "\n", + " secondary_focus_area resource_type poverty_level grade_level \\\n", + "0 Music & The Arts Supplies highest poverty Grades PreK-2 \n", + "1 Literacy & Language Books highest poverty Grades 3-5 \n", + "2 History & Civics Technology high poverty Grades 3-5 \n", + "3 NaN Books high poverty Grades PreK-2 \n", + "4 Literacy & Language Technology high poverty Grades PreK-2 \n", + "\n", + " total_price_including_optional_support students_reached \\\n", + "0 1498.61 31.0 \n", + "1 282.47 28.0 \n", + "2 1012.38 56.0 \n", + "3 175.33 23.0 \n", + "4 3591.11 150.0 \n", + "\n", + " eligible_double_your_impact_match date_posted datefullyfunded \n", + "0 f 2013-04-14 2013-05-02 \n", + "1 t 2012-04-07 2012-04-18 \n", + "2 f 2012-01-30 2012-04-15 \n", + "3 f 2012-10-11 2012-12-05 \n", + "4 f 2013-01-08 2013-03-25 \n", + "\n", + "[5 rows x 26 columns]" + ] + }, + "execution_count": 6, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 1. Create label/outcome\n", + "same as in the homework - predict if a project on donorschoose will not get fully funded within 60 days of posting." + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Label = 1 when the project was NOT fully funded within 60 days of posting, else 0.\n", + "df['duration'] = df.datefullyfunded - df.date_posted\n", + "# A missing datefullyfunded gives a NaT duration, and NaT > Timedelta evaluates to False,\n", + "# which would mislabel never-funded projects as funded in time. Treat a missing\n", + "# duration as a positive (not funded within 60 days) instead.\n", + "not_funded_in_time = df['duration'].isnull() | (df['duration'] > pd.Timedelta('60 days'))\n", + "df['label'] = not_funded_in_time.astype(int)" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
projectidteacher_acctidschoolidschool_ncesidschool_latitudeschool_longitudeschool_cityschool_stateschool_metroschool_district...resource_typepoverty_levelgrade_leveltotal_price_including_optional_supportstudents_reachedeligible_double_your_impact_matchdate_posteddatefullyfundeddurationlabel
000001ccc0e81598c4bd86bacb94d7acb96963218e74e10c3764a5cfb153e6fea9f3f9f2c2da7edda5648ccd10554ed8c1.709930e+1141.807654-87.673257ChicagoILurbanPershing Elem Network...Supplieshighest povertyGrades PreK-21498.6131.0f2013-04-142013-05-0218 days0
10000fa3aa8f6649abab23615b546016d2a578595fe351e7fce057e048c409b183432ed3d4466fac2f2ead83ab354e3336.409801e+1034.296596-119.296596VenturaCAurbanVentura Unif School District...Bookshighest povertyGrades 3-5282.4728.0t2012-04-072012-04-1811 days0
2000134f07d4b30140d63262c871748ff26bd60377bdbffb53a644a16c5308e82dc8dcb501c3b2bb0b10e9c6ee2cd8afd6.227100e+1034.078625-118.257834Los AngelesCAurbanLos Angeles Unif Sch Dist...Technologyhigh povertyGrades 3-51012.3856.0f2012-01-302012-04-1576 days1
30001f2d0b3827bba67cdbeaa248b832d15d900805d9d716c051c671827109f458bea7e8c6e4279fca6276128db89292e3.600090e+1140.687286-73.988217BrooklynNYurbanNew York City Dept Of Ed...Bookshigh povertyGrades PreK-2175.3323.0f2012-10-112012-12-0555 days0
40004536db996ba697ca72c9e058bfe69400f8b82bb0143f6a40b217a517fe311fbdefab6fe41e12c55886c610c1107533.606870e+1140.793018-73.205635Central IslipNYsuburbanCentral Islip Union Free SD...Technologyhigh povertyGrades PreK-23591.11150.0f2013-01-082013-03-2576 days1
\n", + "

5 rows × 28 columns

\n", + "
" + ], + "text/plain": [ + " projectid teacher_acctid \\\n", + "0 00001ccc0e81598c4bd86bacb94d7acb 96963218e74e10c3764a5cfb153e6fea \n", + "1 0000fa3aa8f6649abab23615b546016d 2a578595fe351e7fce057e048c409b18 \n", + "2 000134f07d4b30140d63262c871748ff 26bd60377bdbffb53a644a16c5308e82 \n", + "3 0001f2d0b3827bba67cdbeaa248b832d 15d900805d9d716c051c671827109f45 \n", + "4 0004536db996ba697ca72c9e058bfe69 400f8b82bb0143f6a40b217a517fe311 \n", + "\n", + " schoolid school_ncesid school_latitude \\\n", + "0 9f3f9f2c2da7edda5648ccd10554ed8c 1.709930e+11 41.807654 \n", + "1 3432ed3d4466fac2f2ead83ab354e333 6.409801e+10 34.296596 \n", + "2 dc8dcb501c3b2bb0b10e9c6ee2cd8afd 6.227100e+10 34.078625 \n", + "3 8bea7e8c6e4279fca6276128db89292e 3.600090e+11 40.687286 \n", + "4 fbdefab6fe41e12c55886c610c110753 3.606870e+11 40.793018 \n", + "\n", + " school_longitude school_city school_state school_metro \\\n", + "0 -87.673257 Chicago IL urban \n", + "1 -119.296596 Ventura CA urban \n", + "2 -118.257834 Los Angeles CA urban \n", + "3 -73.988217 Brooklyn NY urban \n", + "4 -73.205635 Central Islip NY suburban \n", + "\n", + " school_district ... resource_type poverty_level \\\n", + "0 Pershing Elem Network ... Supplies highest poverty \n", + "1 Ventura Unif School District ... Books highest poverty \n", + "2 Los Angeles Unif Sch Dist ... Technology high poverty \n", + "3 New York City Dept Of Ed ... Books high poverty \n", + "4 Central Islip Union Free SD ... 
Technology high poverty \n", + "\n", + " grade_level total_price_including_optional_support students_reached \\\n", + "0 Grades PreK-2 1498.61 31.0 \n", + "1 Grades 3-5 282.47 28.0 \n", + "2 Grades 3-5 1012.38 56.0 \n", + "3 Grades PreK-2 175.33 23.0 \n", + "4 Grades PreK-2 3591.11 150.0 \n", + "\n", + " eligible_double_your_impact_match date_posted datefullyfunded duration label \n", + "0 f 2013-04-14 2013-05-02 18 days 0 \n", + "1 t 2012-04-07 2012-04-18 11 days 0 \n", + "2 f 2012-01-30 2012-04-15 76 days 1 \n", + "3 f 2012-10-11 2012-12-05 55 days 0 \n", + "4 f 2013-01-08 2013-03-25 76 days 1 \n", + "\n", + "[5 rows x 28 columns]" + ] + }, + "execution_count": 41, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 52, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Timestamp('2013-04-14 00:00:00')" + ] + }, + "execution_count": 52, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.date_posted[0]" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 2. Feature Generation\n", + "We'll do this in a few iterations and run some models between each iteration to see how the performance changes.\n", + " - Models: Let's take a few simple models to run - logistic regression (L2) and Random Forests (n_estimators = 1000)\n", + " - Training and Test Sets: For now, create one six month test set and use data before that as training set (same as in the homework)\n", + " - Metrics: Try AUCROC, Precision at 10% and 20%\n", + " \n", + "Feature Generation iterations:\n", + "\n", + "The main thing to remember here is that the features you generate are being generated as of the \"date_posted\" and can only use information up to that date.\n", + "\n", + "1. select existing columns that already exist in the raw data and prep them to run with sklearn models. 
This should be very similar to what you did in assignment 3. You'll create dummy variables from categorical variables.\n", + "\n", + "2. Could discretizing some of the variables help? Try discretizing \"total_price_including_optional_support\" and \"students_reached\" \n", + "\n", + "3. Aggregation:\n", + " - let's try simple aggregations such as number and percentage (2 different features) of projects that got fully funded in the last x days for several values of x (let's say 10, 30, 60)\n", + " - you can extend the previous features to spatial aggregations by limiting that to the same city/state/school as the project you are generating features for.\n", + " - you can use the lat long to generate the same features for projects within some distance y\n", + " \n" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "# feature generation code\n", + "# str_columns: object-dtype columns with at most 51 distinct values (NaN counts as a value)\n", + "str_columns = []\n", + "for column in df.columns:\n", + "    if df[column].dtype == 'O' and df[column].nunique(dropna=False) <= 51:\n", + "        str_columns.append(column)\n", + "# float_columns: numeric columns that will be discretized\n", + "float_columns = ['total_price_including_optional_support', 'students_reached']" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['school_state', 'school_metro', 'school_charter', 'school_magnet', 'teacher_prefix', 'primary_focus_subject', 'primary_focus_area', 'secondary_focus_subject', 'secondary_focus_area', 'resource_type', 'poverty_level', 'grade_level', 'eligible_double_your_impact_match']\n", + "['total_price_including_optional_support', 'students_reached']\n" + ] + } + ], + "source": [ + "print(str_columns)\n", + "print(float_columns)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The code above builds two lists of columns: str_columns (generated automatically) for dummy variables and float_columns (listed by hand) for discretized variables. Generating the list automatically is more reliable than handwriting column names. 
The str_columns has the additional restriction by setting an upper bound on the number of distinct values." + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# Generate dummy variables from str_columns.\n", + "# dummy_na=True adds an explicit missing-value indicator per column;\n", + "# drop_first=True drops one reference level per column.\n", + "features = pd.get_dummies(df[str_columns], dummy_na=True, columns=str_columns, drop_first=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [], + "source": [ + "# Generate discretized variables from float_columns.\n", + "# Quantile bins (qcut) replace equal-width bins (cut): with equal-width bins every row shown\n", + "# by features.head() landed in the lowest bin, so the feature carried almost no signal.\n", + "# labels=False yields ordinal integer codes (NaN where the input is missing), which sklearn\n", + "# estimators can consume; string labels such as 'low' cannot be fitted directly.\n", + "# duplicates='drop' merges bins whose quantile edges coincide instead of raising.\n", + "for column in float_columns:\n", + "    features[column] = pd.qcut(df[column], q=5, labels=False, duplicates='drop')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Now we generate an aggregation feature. We will use the number of projects that got funded in the last 10 days as an example." + ] + }, + { + "cell_type": "code", + "execution_count": 54, + "metadata": {}, + "outputs": [], + "source": [ + "# For every distinct posting date, count the projects whose datefullyfunded falls in the\n", + "# 10 days strictly before that date, so the feature only uses information from before posting.\n", + "# Missing posting dates are skipped because NaT cannot be formatted as a dictionary key.\n", + "date_posted_list = pd.to_datetime(df.date_posted.dropna().unique())\n", + "zero_days = pd.Timedelta('0 days')\n", + "window = pd.Timedelta('10 days')\n", + "# dictionary keyed by 'YYYYMMDD' -> number of projects funded in the window before that day\n", + "num_projects_funded_dict = {}\n", + "for date_posted in date_posted_list:\n", + "    since = date_posted - df.datefullyfunded\n", + "    num_projects_funded_dict[date_posted.strftime(\"%Y%m%d\")] = np.sum((since > zero_days) & (since <= window))\n" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": {}, + "outputs": [], + "source": [ + "# Build the per-row aggregation feature as a float array aligned with df's row order.\n", + "# A vectorized key lookup replaces the per-row df.iloc loop, which built a Series for every row.\n", + "# Rows with a missing posting date get NaN.\n", + "aggr_list = df.date_posted.dt.strftime(\"%Y%m%d\").map(num_projects_funded_dict).astype(float).values" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Append the newly created aggregation feature to the feature dataframe" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + 
"metadata": {}, + "outputs": [], + "source": [ + "# aggr_list holds one count per df row, in row order; the array is assigned positionally as a new feature column\n", + "features['num_projects_funded_within10day'] = aggr_list" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
school_state_ALschool_state_ARschool_state_AZschool_state_CAschool_state_COschool_state_CTschool_state_DCschool_state_DEschool_state_FLschool_state_GA...poverty_level_nangrade_level_Grades 6-8grade_level_Grades 9-12grade_level_Grades PreK-2grade_level_naneligible_double_your_impact_match_teligible_double_your_impact_match_nantotal_price_including_optional_supportstudents_reachednum_projects_funded_within10day
00000000000...0001000lowlow1281.0
10001000000...0000010lowlow1435.0
20001000000...0000000lowlow568.0
30000000000...0001000lowlow1973.0
40000000000...0001000lowlow1688.0
\n", + "

5 rows × 149 columns

\n", + "
" + ], + "text/plain": [ + " school_state_AL school_state_AR school_state_AZ school_state_CA \\\n", + "0 0 0 0 0 \n", + "1 0 0 0 1 \n", + "2 0 0 0 1 \n", + "3 0 0 0 0 \n", + "4 0 0 0 0 \n", + "\n", + " school_state_CO school_state_CT school_state_DC school_state_DE \\\n", + "0 0 0 0 0 \n", + "1 0 0 0 0 \n", + "2 0 0 0 0 \n", + "3 0 0 0 0 \n", + "4 0 0 0 0 \n", + "\n", + " school_state_FL school_state_GA ... \\\n", + "0 0 0 ... \n", + "1 0 0 ... \n", + "2 0 0 ... \n", + "3 0 0 ... \n", + "4 0 0 ... \n", + "\n", + " poverty_level_nan grade_level_Grades 6-8 grade_level_Grades 9-12 \\\n", + "0 0 0 0 \n", + "1 0 0 0 \n", + "2 0 0 0 \n", + "3 0 0 0 \n", + "4 0 0 0 \n", + "\n", + " grade_level_Grades PreK-2 grade_level_nan \\\n", + "0 1 0 \n", + "1 0 0 \n", + "2 0 0 \n", + "3 1 0 \n", + "4 1 0 \n", + "\n", + " eligible_double_your_impact_match_t eligible_double_your_impact_match_nan \\\n", + "0 0 0 \n", + "1 1 0 \n", + "2 0 0 \n", + "3 0 0 \n", + "4 0 0 \n", + "\n", + " total_price_including_optional_support students_reached \\\n", + "0 low low \n", + "1 low low \n", + "2 low low \n", + "3 low low \n", + "4 low low \n", + "\n", + " num_projects_funded_within10day \n", + "0 1281.0 \n", + "1 1435.0 \n", + "2 568.0 \n", + "3 1973.0 \n", + "4 1688.0 \n", + "\n", + "[5 rows x 149 columns]" + ] + }, + "execution_count": 61, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "features.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## 3. 
Let's test models now" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# one list of column names per feature group (fill these in)\n", + "raw_features = []\n", + "discretized_features = []\n", + "simple_aggregate_features = []\n", + "spatial_aggregate_features = []\n", + "\n", + "# now select which one(s) you want to test models with\n", + "selected_feature_groups = []\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Create one (temporal) train and test split \n" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [], + "source": [ + "# Temporal split: projects posted on or before the threshold train the model,\n", + "# projects posted after it form the test set.\n", + "# NOTE(review): the label looks 60 days past date_posted, so training rows posted in the\n", + "# last 60 days before the threshold depend on events after it - consider leaving a gap.\n", + "split_threshold = pd.Timestamp('2013-06-30')\n", + "train_filter = df['date_posted'] <= split_threshold\n", + "test_filter = df['date_posted'] > split_threshold\n", + "train_x = features.loc[train_filter]\n", + "train_y = df.loc[train_filter, 'label']\n", + "test_x = features.loc[test_filter]\n", + "test_y = df.loc[test_filter, 'label']" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Imputation\n", + "Impute features that may be missing (separately on train and test set to avoid leakage). Each feature may be missing for a different reason so fill them appropriately (and generate missing flags as separate variables when necessary - remember what we talked about in class about this)\n", + "."
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# code" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Train and Test models\n", + "- Build model(s) using the selected feature groups\n", + "- test model(s)\n", + "- evaluate\n", + "\n", + "You should do this for different subsets of feature groups above to get an idea of what the performance impact is" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "# code" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Add more features\n", + "Can you think of other features (especially aggregate ones) that will be helpful?\n", + " - avg amount for fully funded projects in the last x days within y distance (or same geographical area)?\n", + " - difference between what this project is asking for and the feature above?\n", + " - ...\n", + " \n", + "Now create a new feature group and see how well do the models do with the additional feature(s)?" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 2", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.14" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +}