diff --git a/labs/2019/lab6_feature_generation_sol.ipynb b/labs/2019/lab6_feature_generation_sol.ipynb
new file mode 100644
index 0000000..1c0b981
--- /dev/null
+++ b/labs/2019/lab6_feature_generation_sol.ipynb
@@ -0,0 +1,1088 @@
+{
+ "cells": [
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "# Lab 6: Feature Generation\n",
+    "\n",
+    "\n",
+    "In this lab we'll get some hands-on experience with generating features.\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Goals for this lab\n",
+    "\n",
+    "- Understand the types of features that are usually used in ML projects\n",
+    "- Practice generating those features\n",
+    "- Explore the effect of different types of features on model performance\n",
+    "\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 1,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "import matplotlib.pyplot as plt\n",
+    "import numpy as np\n",
+    "import pandas as pd\n",
+    "from sklearn.ensemble import BaggingClassifier,AdaBoostClassifier\n",
+    "from sklearn.model_selection import train_test_split\n",
+    "from sklearn.metrics import accuracy_score as accuracy\n",
+    "from sklearn.metrics import roc_curve, auc\n",
+    "import graphviz # If you don't have this, install via pip/conda\n",
+    "from sklearn.metrics import f1_score, precision_recall_curve\n",
+    "from sklearn.linear_model import LogisticRegression\n",
+    "%matplotlib inline\n",
+    "\n",
+    "# exercise: what additional modules should you import?"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## Data\n",
+    "We'll use the data from donorschoose that we used in Assignment 3."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 5,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Change this to wherever you're storing your data\n",
+    "datafile = \"../data/projects_2012_2013.csv\"\n",
+    "df = pd.read_csv(datafile, parse_dates=['date_posted', 'datefullyfunded'])"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 6,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style>\n",
+       "    .dataframe thead tr:only-child th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: left;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>projectid</th>\n",
+       "      <th>teacher_acctid</th>\n",
+       "      <th>schoolid</th>\n",
+       "      <th>school_ncesid</th>\n",
+       "      <th>school_latitude</th>\n",
+       "      <th>school_longitude</th>\n",
+       "      <th>school_city</th>\n",
+       "      <th>school_state</th>\n",
+       "      <th>school_metro</th>\n",
+       "      <th>school_district</th>\n",
+       "      <th>...</th>\n",
+       "      <th>secondary_focus_subject</th>\n",
+       "      <th>secondary_focus_area</th>\n",
+       "      <th>resource_type</th>\n",
+       "      <th>poverty_level</th>\n",
+       "      <th>grade_level</th>\n",
+       "      <th>total_price_including_optional_support</th>\n",
+       "      <th>students_reached</th>\n",
+       "      <th>eligible_double_your_impact_match</th>\n",
+       "      <th>date_posted</th>\n",
+       "      <th>datefullyfunded</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>00001ccc0e81598c4bd86bacb94d7acb</td>\n",
+       "      <td>96963218e74e10c3764a5cfb153e6fea</td>\n",
+       "      <td>9f3f9f2c2da7edda5648ccd10554ed8c</td>\n",
+       "      <td>1.709930e+11</td>\n",
+       "      <td>41.807654</td>\n",
+       "      <td>-87.673257</td>\n",
+       "      <td>Chicago</td>\n",
+       "      <td>IL</td>\n",
+       "      <td>urban</td>\n",
+       "      <td>Pershing Elem Network</td>\n",
+       "      <td>...</td>\n",
+       "      <td>Visual Arts</td>\n",
+       "      <td>Music &amp; The Arts</td>\n",
+       "      <td>Supplies</td>\n",
+       "      <td>highest poverty</td>\n",
+       "      <td>Grades PreK-2</td>\n",
+       "      <td>1498.61</td>\n",
+       "      <td>31.0</td>\n",
+       "      <td>f</td>\n",
+       "      <td>2013-04-14</td>\n",
+       "      <td>2013-05-02</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>0000fa3aa8f6649abab23615b546016d</td>\n",
+       "      <td>2a578595fe351e7fce057e048c409b18</td>\n",
+       "      <td>3432ed3d4466fac2f2ead83ab354e333</td>\n",
+       "      <td>6.409801e+10</td>\n",
+       "      <td>34.296596</td>\n",
+       "      <td>-119.296596</td>\n",
+       "      <td>Ventura</td>\n",
+       "      <td>CA</td>\n",
+       "      <td>urban</td>\n",
+       "      <td>Ventura Unif School District</td>\n",
+       "      <td>...</td>\n",
+       "      <td>Literature &amp; Writing</td>\n",
+       "      <td>Literacy &amp; Language</td>\n",
+       "      <td>Books</td>\n",
+       "      <td>highest poverty</td>\n",
+       "      <td>Grades 3-5</td>\n",
+       "      <td>282.47</td>\n",
+       "      <td>28.0</td>\n",
+       "      <td>t</td>\n",
+       "      <td>2012-04-07</td>\n",
+       "      <td>2012-04-18</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>000134f07d4b30140d63262c871748ff</td>\n",
+       "      <td>26bd60377bdbffb53a644a16c5308e82</td>\n",
+       "      <td>dc8dcb501c3b2bb0b10e9c6ee2cd8afd</td>\n",
+       "      <td>6.227100e+10</td>\n",
+       "      <td>34.078625</td>\n",
+       "      <td>-118.257834</td>\n",
+       "      <td>Los Angeles</td>\n",
+       "      <td>CA</td>\n",
+       "      <td>urban</td>\n",
+       "      <td>Los Angeles Unif Sch Dist</td>\n",
+       "      <td>...</td>\n",
+       "      <td>Social Sciences</td>\n",
+       "      <td>History &amp; Civics</td>\n",
+       "      <td>Technology</td>\n",
+       "      <td>high poverty</td>\n",
+       "      <td>Grades 3-5</td>\n",
+       "      <td>1012.38</td>\n",
+       "      <td>56.0</td>\n",
+       "      <td>f</td>\n",
+       "      <td>2012-01-30</td>\n",
+       "      <td>2012-04-15</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>0001f2d0b3827bba67cdbeaa248b832d</td>\n",
+       "      <td>15d900805d9d716c051c671827109f45</td>\n",
+       "      <td>8bea7e8c6e4279fca6276128db89292e</td>\n",
+       "      <td>3.600090e+11</td>\n",
+       "      <td>40.687286</td>\n",
+       "      <td>-73.988217</td>\n",
+       "      <td>Brooklyn</td>\n",
+       "      <td>NY</td>\n",
+       "      <td>urban</td>\n",
+       "      <td>New York City Dept Of Ed</td>\n",
+       "      <td>...</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>NaN</td>\n",
+       "      <td>Books</td>\n",
+       "      <td>high poverty</td>\n",
+       "      <td>Grades PreK-2</td>\n",
+       "      <td>175.33</td>\n",
+       "      <td>23.0</td>\n",
+       "      <td>f</td>\n",
+       "      <td>2012-10-11</td>\n",
+       "      <td>2012-12-05</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>0004536db996ba697ca72c9e058bfe69</td>\n",
+       "      <td>400f8b82bb0143f6a40b217a517fe311</td>\n",
+       "      <td>fbdefab6fe41e12c55886c610c110753</td>\n",
+       "      <td>3.606870e+11</td>\n",
+       "      <td>40.793018</td>\n",
+       "      <td>-73.205635</td>\n",
+       "      <td>Central Islip</td>\n",
+       "      <td>NY</td>\n",
+       "      <td>suburban</td>\n",
+       "      <td>Central Islip Union Free SD</td>\n",
+       "      <td>...</td>\n",
+       "      <td>Literature &amp; Writing</td>\n",
+       "      <td>Literacy &amp; Language</td>\n",
+       "      <td>Technology</td>\n",
+       "      <td>high poverty</td>\n",
+       "      <td>Grades PreK-2</td>\n",
+       "      <td>3591.11</td>\n",
+       "      <td>150.0</td>\n",
+       "      <td>f</td>\n",
+       "      <td>2013-01-08</td>\n",
+       "      <td>2013-03-25</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>5 rows × 26 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                          projectid                    teacher_acctid  \\\n",
+       "0  00001ccc0e81598c4bd86bacb94d7acb  96963218e74e10c3764a5cfb153e6fea   \n",
+       "1  0000fa3aa8f6649abab23615b546016d  2a578595fe351e7fce057e048c409b18   \n",
+       "2  000134f07d4b30140d63262c871748ff  26bd60377bdbffb53a644a16c5308e82   \n",
+       "3  0001f2d0b3827bba67cdbeaa248b832d  15d900805d9d716c051c671827109f45   \n",
+       "4  0004536db996ba697ca72c9e058bfe69  400f8b82bb0143f6a40b217a517fe311   \n",
+       "\n",
+       "                           schoolid  school_ncesid  school_latitude  \\\n",
+       "0  9f3f9f2c2da7edda5648ccd10554ed8c   1.709930e+11        41.807654   \n",
+       "1  3432ed3d4466fac2f2ead83ab354e333   6.409801e+10        34.296596   \n",
+       "2  dc8dcb501c3b2bb0b10e9c6ee2cd8afd   6.227100e+10        34.078625   \n",
+       "3  8bea7e8c6e4279fca6276128db89292e   3.600090e+11        40.687286   \n",
+       "4  fbdefab6fe41e12c55886c610c110753   3.606870e+11        40.793018   \n",
+       "\n",
+       "   school_longitude    school_city school_state school_metro  \\\n",
+       "0        -87.673257        Chicago           IL        urban   \n",
+       "1       -119.296596        Ventura           CA        urban   \n",
+       "2       -118.257834    Los Angeles           CA        urban   \n",
+       "3        -73.988217       Brooklyn           NY        urban   \n",
+       "4        -73.205635  Central Islip           NY     suburban   \n",
+       "\n",
+       "                school_district       ...       secondary_focus_subject  \\\n",
+       "0         Pershing Elem Network       ...                   Visual Arts   \n",
+       "1  Ventura Unif School District       ...          Literature & Writing   \n",
+       "2     Los Angeles Unif Sch Dist       ...               Social Sciences   \n",
+       "3      New York City Dept Of Ed       ...                           NaN   \n",
+       "4   Central Islip Union Free SD       ...          Literature & Writing   \n",
+       "\n",
+       "  secondary_focus_area resource_type    poverty_level    grade_level  \\\n",
+       "0     Music & The Arts      Supplies  highest poverty  Grades PreK-2   \n",
+       "1  Literacy & Language         Books  highest poverty     Grades 3-5   \n",
+       "2     History & Civics    Technology     high poverty     Grades 3-5   \n",
+       "3                  NaN         Books     high poverty  Grades PreK-2   \n",
+       "4  Literacy & Language    Technology     high poverty  Grades PreK-2   \n",
+       "\n",
+       "  total_price_including_optional_support students_reached  \\\n",
+       "0                                1498.61             31.0   \n",
+       "1                                 282.47             28.0   \n",
+       "2                                1012.38             56.0   \n",
+       "3                                 175.33             23.0   \n",
+       "4                                3591.11            150.0   \n",
+       "\n",
+       "  eligible_double_your_impact_match date_posted datefullyfunded  \n",
+       "0                                 f  2013-04-14      2013-05-02  \n",
+       "1                                 t  2012-04-07      2012-04-18  \n",
+       "2                                 f  2012-01-30      2012-04-15  \n",
+       "3                                 f  2012-10-11      2012-12-05  \n",
+       "4                                 f  2013-01-08      2013-03-25  \n",
+       "\n",
+       "[5 rows x 26 columns]"
+      ]
+     },
+     "execution_count": 6,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 1. Create label/outcome\n",
+    "Same as in the homework: predict whether a project on DonorsChoose will not get fully funded within 60 days of posting (label = 1 means funding took more than 60 days)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 39,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# code\n",
+    "df['duration'] = df.datefullyfunded - df.date_posted\n",
+    "df['label'] =  np.where(df['duration']>pd.Timedelta('60 days'), 1, 0)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 41,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style>\n",
+       "    .dataframe thead tr:only-child th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: left;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>projectid</th>\n",
+       "      <th>teacher_acctid</th>\n",
+       "      <th>schoolid</th>\n",
+       "      <th>school_ncesid</th>\n",
+       "      <th>school_latitude</th>\n",
+       "      <th>school_longitude</th>\n",
+       "      <th>school_city</th>\n",
+       "      <th>school_state</th>\n",
+       "      <th>school_metro</th>\n",
+       "      <th>school_district</th>\n",
+       "      <th>...</th>\n",
+       "      <th>resource_type</th>\n",
+       "      <th>poverty_level</th>\n",
+       "      <th>grade_level</th>\n",
+       "      <th>total_price_including_optional_support</th>\n",
+       "      <th>students_reached</th>\n",
+       "      <th>eligible_double_your_impact_match</th>\n",
+       "      <th>date_posted</th>\n",
+       "      <th>datefullyfunded</th>\n",
+       "      <th>duration</th>\n",
+       "      <th>label</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>00001ccc0e81598c4bd86bacb94d7acb</td>\n",
+       "      <td>96963218e74e10c3764a5cfb153e6fea</td>\n",
+       "      <td>9f3f9f2c2da7edda5648ccd10554ed8c</td>\n",
+       "      <td>1.709930e+11</td>\n",
+       "      <td>41.807654</td>\n",
+       "      <td>-87.673257</td>\n",
+       "      <td>Chicago</td>\n",
+       "      <td>IL</td>\n",
+       "      <td>urban</td>\n",
+       "      <td>Pershing Elem Network</td>\n",
+       "      <td>...</td>\n",
+       "      <td>Supplies</td>\n",
+       "      <td>highest poverty</td>\n",
+       "      <td>Grades PreK-2</td>\n",
+       "      <td>1498.61</td>\n",
+       "      <td>31.0</td>\n",
+       "      <td>f</td>\n",
+       "      <td>2013-04-14</td>\n",
+       "      <td>2013-05-02</td>\n",
+       "      <td>18 days</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>0000fa3aa8f6649abab23615b546016d</td>\n",
+       "      <td>2a578595fe351e7fce057e048c409b18</td>\n",
+       "      <td>3432ed3d4466fac2f2ead83ab354e333</td>\n",
+       "      <td>6.409801e+10</td>\n",
+       "      <td>34.296596</td>\n",
+       "      <td>-119.296596</td>\n",
+       "      <td>Ventura</td>\n",
+       "      <td>CA</td>\n",
+       "      <td>urban</td>\n",
+       "      <td>Ventura Unif School District</td>\n",
+       "      <td>...</td>\n",
+       "      <td>Books</td>\n",
+       "      <td>highest poverty</td>\n",
+       "      <td>Grades 3-5</td>\n",
+       "      <td>282.47</td>\n",
+       "      <td>28.0</td>\n",
+       "      <td>t</td>\n",
+       "      <td>2012-04-07</td>\n",
+       "      <td>2012-04-18</td>\n",
+       "      <td>11 days</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>000134f07d4b30140d63262c871748ff</td>\n",
+       "      <td>26bd60377bdbffb53a644a16c5308e82</td>\n",
+       "      <td>dc8dcb501c3b2bb0b10e9c6ee2cd8afd</td>\n",
+       "      <td>6.227100e+10</td>\n",
+       "      <td>34.078625</td>\n",
+       "      <td>-118.257834</td>\n",
+       "      <td>Los Angeles</td>\n",
+       "      <td>CA</td>\n",
+       "      <td>urban</td>\n",
+       "      <td>Los Angeles Unif Sch Dist</td>\n",
+       "      <td>...</td>\n",
+       "      <td>Technology</td>\n",
+       "      <td>high poverty</td>\n",
+       "      <td>Grades 3-5</td>\n",
+       "      <td>1012.38</td>\n",
+       "      <td>56.0</td>\n",
+       "      <td>f</td>\n",
+       "      <td>2012-01-30</td>\n",
+       "      <td>2012-04-15</td>\n",
+       "      <td>76 days</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>0001f2d0b3827bba67cdbeaa248b832d</td>\n",
+       "      <td>15d900805d9d716c051c671827109f45</td>\n",
+       "      <td>8bea7e8c6e4279fca6276128db89292e</td>\n",
+       "      <td>3.600090e+11</td>\n",
+       "      <td>40.687286</td>\n",
+       "      <td>-73.988217</td>\n",
+       "      <td>Brooklyn</td>\n",
+       "      <td>NY</td>\n",
+       "      <td>urban</td>\n",
+       "      <td>New York City Dept Of Ed</td>\n",
+       "      <td>...</td>\n",
+       "      <td>Books</td>\n",
+       "      <td>high poverty</td>\n",
+       "      <td>Grades PreK-2</td>\n",
+       "      <td>175.33</td>\n",
+       "      <td>23.0</td>\n",
+       "      <td>f</td>\n",
+       "      <td>2012-10-11</td>\n",
+       "      <td>2012-12-05</td>\n",
+       "      <td>55 days</td>\n",
+       "      <td>0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>0004536db996ba697ca72c9e058bfe69</td>\n",
+       "      <td>400f8b82bb0143f6a40b217a517fe311</td>\n",
+       "      <td>fbdefab6fe41e12c55886c610c110753</td>\n",
+       "      <td>3.606870e+11</td>\n",
+       "      <td>40.793018</td>\n",
+       "      <td>-73.205635</td>\n",
+       "      <td>Central Islip</td>\n",
+       "      <td>NY</td>\n",
+       "      <td>suburban</td>\n",
+       "      <td>Central Islip Union Free SD</td>\n",
+       "      <td>...</td>\n",
+       "      <td>Technology</td>\n",
+       "      <td>high poverty</td>\n",
+       "      <td>Grades PreK-2</td>\n",
+       "      <td>3591.11</td>\n",
+       "      <td>150.0</td>\n",
+       "      <td>f</td>\n",
+       "      <td>2013-01-08</td>\n",
+       "      <td>2013-03-25</td>\n",
+       "      <td>76 days</td>\n",
+       "      <td>1</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>5 rows × 28 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "                          projectid                    teacher_acctid  \\\n",
+       "0  00001ccc0e81598c4bd86bacb94d7acb  96963218e74e10c3764a5cfb153e6fea   \n",
+       "1  0000fa3aa8f6649abab23615b546016d  2a578595fe351e7fce057e048c409b18   \n",
+       "2  000134f07d4b30140d63262c871748ff  26bd60377bdbffb53a644a16c5308e82   \n",
+       "3  0001f2d0b3827bba67cdbeaa248b832d  15d900805d9d716c051c671827109f45   \n",
+       "4  0004536db996ba697ca72c9e058bfe69  400f8b82bb0143f6a40b217a517fe311   \n",
+       "\n",
+       "                           schoolid  school_ncesid  school_latitude  \\\n",
+       "0  9f3f9f2c2da7edda5648ccd10554ed8c   1.709930e+11        41.807654   \n",
+       "1  3432ed3d4466fac2f2ead83ab354e333   6.409801e+10        34.296596   \n",
+       "2  dc8dcb501c3b2bb0b10e9c6ee2cd8afd   6.227100e+10        34.078625   \n",
+       "3  8bea7e8c6e4279fca6276128db89292e   3.600090e+11        40.687286   \n",
+       "4  fbdefab6fe41e12c55886c610c110753   3.606870e+11        40.793018   \n",
+       "\n",
+       "   school_longitude    school_city school_state school_metro  \\\n",
+       "0        -87.673257        Chicago           IL        urban   \n",
+       "1       -119.296596        Ventura           CA        urban   \n",
+       "2       -118.257834    Los Angeles           CA        urban   \n",
+       "3        -73.988217       Brooklyn           NY        urban   \n",
+       "4        -73.205635  Central Islip           NY     suburban   \n",
+       "\n",
+       "                school_district  ...  resource_type    poverty_level  \\\n",
+       "0         Pershing Elem Network  ...       Supplies  highest poverty   \n",
+       "1  Ventura Unif School District  ...          Books  highest poverty   \n",
+       "2     Los Angeles Unif Sch Dist  ...     Technology     high poverty   \n",
+       "3      New York City Dept Of Ed  ...          Books     high poverty   \n",
+       "4   Central Islip Union Free SD  ...     Technology     high poverty   \n",
+       "\n",
+       "     grade_level total_price_including_optional_support students_reached  \\\n",
+       "0  Grades PreK-2                                1498.61             31.0   \n",
+       "1     Grades 3-5                                 282.47             28.0   \n",
+       "2     Grades 3-5                                1012.38             56.0   \n",
+       "3  Grades PreK-2                                 175.33             23.0   \n",
+       "4  Grades PreK-2                                3591.11            150.0   \n",
+       "\n",
+       "  eligible_double_your_impact_match date_posted datefullyfunded duration label  \n",
+       "0                                 f  2013-04-14      2013-05-02  18 days     0  \n",
+       "1                                 t  2012-04-07      2012-04-18  11 days     0  \n",
+       "2                                 f  2012-01-30      2012-04-15  76 days     1  \n",
+       "3                                 f  2012-10-11      2012-12-05  55 days     0  \n",
+       "4                                 f  2013-01-08      2013-03-25  76 days     1  \n",
+       "\n",
+       "[5 rows x 28 columns]"
+      ]
+     },
+     "execution_count": 41,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.head()"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 52,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/plain": [
+       "Timestamp('2013-04-14 00:00:00')"
+      ]
+     },
+     "execution_count": 52,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "df.date_posted[0]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 2. Feature Generation\n",
+    "We'll do this in a few iterations and run some models between each iteration to see how the performance changes.\n",
+    " - Models: Let's take a few simple models to run - logistic regression (L2) and Random Forests (n_estimators = 1000)\n",
+    " - Training and Test Sets: For now, create one six month test set and use data before that as training set (same as in the homework)\n",
+    " - Metrics: Try AUCROC, Precision at 10% and 20%\n",
+    " \n",
+    "Feature Generation iterations:\n",
+    "\n",
+    "The main thing to remember here is that the features you generate are being generated as of each project's \"date_posted\" and can only use information available up to that date.\n",
+    "\n",
+    "1. select existing columns that already exist in the raw data and prep them to run with sklearn models. This should be very similar to what you did in assignment 3. You'll create dummy variables from categorical variables.\n",
+    "\n",
+    "2. Could discretizing some of the variables help? Try discretizing \"total_price_including_optional_support\" (the total amount) and \"students_reached\".\n",
+    "\n",
+    "3. Aggregation:\n",
+    " - let's try simple aggregations such as number and percentage (2 different features) of projects that got fully funded in the last x days for several values of x (let's say 10, 30, 60)\n",
+    " - you can extend the previous features to spatial aggregations by limiting that to the same city/state/school as the project you are generating features for.\n",
+    " - you can use the lat long to generate the same features for projects within some distance y\n",
+    " \n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 34,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# feature generation code\n",
+    "str_columns = [column for column in df.columns if (df[column].dtype=='O') and (len(df[column].unique())<=51)]\n",
+    "float_columns = ['total_price_including_optional_support', 'students_reached']"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 35,
+   "metadata": {},
+   "outputs": [
+    {
+     "name": "stdout",
+     "output_type": "stream",
+     "text": [
+      "['school_state', 'school_metro', 'school_charter', 'school_magnet', 'teacher_prefix', 'primary_focus_subject', 'primary_focus_area', 'secondary_focus_subject', 'secondary_focus_area', 'resource_type', 'poverty_level', 'grade_level', 'eligible_double_your_impact_match']\n",
+      "['total_price_including_optional_support', 'students_reached']\n"
+     ]
+    }
+   ],
+   "source": [
+    "print(str_columns)\n",
+    "print(float_columns)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "The code above builds two lists of columns: `str_columns` for dummy variables and `float_columns` for discretized variables. `str_columns` is selected automatically from the object-dtype columns, which is more reliable than typing column names by hand, and it is additionally restricted to columns with at most 51 distinct values. `float_columns` is listed explicitly."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 36,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# Generate dummy variables from str_columns\n",
+    "features = pd.get_dummies(df[str_columns], dummy_na=True, columns=str_columns, drop_first=True)"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 37,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# Generate discretized variables from float_columns\n",
+    "for column in float_columns:\n",
+    "    features[column] = pd.cut(df[column], bins=5, labels=['low', 'medium low', 'medium', 'medium high', 'high'])"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Now we generate an aggregation feature. We will use the number of projects that got fully funded in the last 10 days as an example."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 54,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "date_posted_list = pd.to_datetime(df.date_posted.unique())\n",
+    "num_projects_funded_dict = {}\n",
+    "# use a dictionary to store, for each posting date, the number of projects fully funded in the 10 days before that date\n",
+    "for date_posted in date_posted_list:\n",
+    "    since = date_posted - df.datefullyfunded\n",
+    "    num_projects_funded_dict[date_posted.strftime(\"%Y%m%d\")] = np.sum((since>pd.Timedelta('0 days')) & (since<=pd.Timedelta('10 days')))\n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 57,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# create an array holding the aggregation feature value for every row of df\n",
+    "aggr_list = np.zeros(len(df))\n",
+    "for i in range(len(df)):\n",
+    "    date = df.iloc[i].date_posted\n",
+    "    aggr_list[i] = num_projects_funded_dict[date.strftime(\"%Y%m%d\")]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "Append the newly created aggregation feature to the feature dataframe"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 59,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "features['num_projects_funded_within10day'] = aggr_list"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 61,
+   "metadata": {},
+   "outputs": [
+    {
+     "data": {
+      "text/html": [
+       "<div>\n",
+       "<style>\n",
+       "    .dataframe thead tr:only-child th {\n",
+       "        text-align: right;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe thead th {\n",
+       "        text-align: left;\n",
+       "    }\n",
+       "\n",
+       "    .dataframe tbody tr th {\n",
+       "        vertical-align: top;\n",
+       "    }\n",
+       "</style>\n",
+       "<table border=\"1\" class=\"dataframe\">\n",
+       "  <thead>\n",
+       "    <tr style=\"text-align: right;\">\n",
+       "      <th></th>\n",
+       "      <th>school_state_AL</th>\n",
+       "      <th>school_state_AR</th>\n",
+       "      <th>school_state_AZ</th>\n",
+       "      <th>school_state_CA</th>\n",
+       "      <th>school_state_CO</th>\n",
+       "      <th>school_state_CT</th>\n",
+       "      <th>school_state_DC</th>\n",
+       "      <th>school_state_DE</th>\n",
+       "      <th>school_state_FL</th>\n",
+       "      <th>school_state_GA</th>\n",
+       "      <th>...</th>\n",
+       "      <th>poverty_level_nan</th>\n",
+       "      <th>grade_level_Grades 6-8</th>\n",
+       "      <th>grade_level_Grades 9-12</th>\n",
+       "      <th>grade_level_Grades PreK-2</th>\n",
+       "      <th>grade_level_nan</th>\n",
+       "      <th>eligible_double_your_impact_match_t</th>\n",
+       "      <th>eligible_double_your_impact_match_nan</th>\n",
+       "      <th>total_price_including_optional_support</th>\n",
+       "      <th>students_reached</th>\n",
+       "      <th>num_projects_funded_within10day</th>\n",
+       "    </tr>\n",
+       "  </thead>\n",
+       "  <tbody>\n",
+       "    <tr>\n",
+       "      <th>0</th>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>low</td>\n",
+       "      <td>low</td>\n",
+       "      <td>1281.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>1</th>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>low</td>\n",
+       "      <td>low</td>\n",
+       "      <td>1435.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>2</th>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>low</td>\n",
+       "      <td>low</td>\n",
+       "      <td>568.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>3</th>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>low</td>\n",
+       "      <td>low</td>\n",
+       "      <td>1973.0</td>\n",
+       "    </tr>\n",
+       "    <tr>\n",
+       "      <th>4</th>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>...</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>1</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>0</td>\n",
+       "      <td>low</td>\n",
+       "      <td>low</td>\n",
+       "      <td>1688.0</td>\n",
+       "    </tr>\n",
+       "  </tbody>\n",
+       "</table>\n",
+       "<p>5 rows × 149 columns</p>\n",
+       "</div>"
+      ],
+      "text/plain": [
+       "   school_state_AL  school_state_AR  school_state_AZ  school_state_CA  \\\n",
+       "0                0                0                0                0   \n",
+       "1                0                0                0                1   \n",
+       "2                0                0                0                1   \n",
+       "3                0                0                0                0   \n",
+       "4                0                0                0                0   \n",
+       "\n",
+       "   school_state_CO  school_state_CT  school_state_DC  school_state_DE  \\\n",
+       "0                0                0                0                0   \n",
+       "1                0                0                0                0   \n",
+       "2                0                0                0                0   \n",
+       "3                0                0                0                0   \n",
+       "4                0                0                0                0   \n",
+       "\n",
+       "   school_state_FL  school_state_GA               ...                 \\\n",
+       "0                0                0               ...                  \n",
+       "1                0                0               ...                  \n",
+       "2                0                0               ...                  \n",
+       "3                0                0               ...                  \n",
+       "4                0                0               ...                  \n",
+       "\n",
+       "   poverty_level_nan  grade_level_Grades 6-8  grade_level_Grades 9-12  \\\n",
+       "0                  0                       0                        0   \n",
+       "1                  0                       0                        0   \n",
+       "2                  0                       0                        0   \n",
+       "3                  0                       0                        0   \n",
+       "4                  0                       0                        0   \n",
+       "\n",
+       "   grade_level_Grades PreK-2  grade_level_nan  \\\n",
+       "0                          1                0   \n",
+       "1                          0                0   \n",
+       "2                          0                0   \n",
+       "3                          1                0   \n",
+       "4                          1                0   \n",
+       "\n",
+       "   eligible_double_your_impact_match_t  eligible_double_your_impact_match_nan  \\\n",
+       "0                                    0                                      0   \n",
+       "1                                    1                                      0   \n",
+       "2                                    0                                      0   \n",
+       "3                                    0                                      0   \n",
+       "4                                    0                                      0   \n",
+       "\n",
+       "   total_price_including_optional_support  students_reached  \\\n",
+       "0                                     low               low   \n",
+       "1                                     low               low   \n",
+       "2                                     low               low   \n",
+       "3                                     low               low   \n",
+       "4                                     low               low   \n",
+       "\n",
+       "   num_projects_funded_within10day  \n",
+       "0                           1281.0  \n",
+       "1                           1435.0  \n",
+       "2                            568.0  \n",
+       "3                           1973.0  \n",
+       "4                           1688.0  \n",
+       "\n",
+       "[5 rows x 149 columns]"
+      ]
+     },
+     "execution_count": 61,
+     "metadata": {},
+     "output_type": "execute_result"
+    }
+   ],
+   "source": [
+    "features.head()"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## 3. Let's test models now"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "raw_features = []\n",
+    "discretized_features = []\n",
+    "simple_aggregate_features = []\n",
+    "spatial_aggregate_features = []\n",
+    "\n",
+    "# now select which one(s) you want to test models with\n",
+    "selected_feature_groups  = []\n"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Create one (temporal) train and test split \n"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": 45,
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "# split train and test data based on a date threshold in mid 2013\n",
+    "split_threshold = pd.Timestamp(2013,6,30)\n",
+    "train_filter = (df.date_posted <= split_threshold)\n",
+    "test_filter = (df.date_posted > split_threshold)\n",
+    "train_x, train_y = features[train_filter], df.label[train_filter]\n",
+    "test_x, test_y = features[test_filter], df.label[test_filter]"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Imputation\n",
+    "Impute features that may be missing (separately on train and test set to avoid leakage). Each feature may be missing for a different reason, so fill them appropriately (and generate missing flags as separate variables when necessary - remember what we talked about in class about this)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# code"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Train and Test models\n",
+    "- Build model(s) using the selected feature groups\n",
+    "- Test model(s)\n",
+    "- Evaluate\n",
+    "\n",
+    "You should do this for different subsets of the feature groups above to get an idea of how each group affects performance."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": [
+    "# code"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Add more features\n",
+    "Can you think of other features (especially aggregate ones) that will be helpful?\n",
+    "  - avg amount for fully funded projects in the last x days within y distance (or same geographical area)?\n",
+    "  - difference between what this project is asking for and the feature above?\n",
+    "  - ...\n",
+    "  \n",
+    "Now create a new feature group and see how well the models do with the additional feature(s)."
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "collapsed": true
+   },
+   "outputs": [],
+   "source": []
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "Python 2",
+   "language": "python",
+   "name": "python2"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 2
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython2",
+   "version": "2.7.14"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 2
+}

	projectid	teacher_acctid	schoolid	school_ncesid	school_latitude	school_longitude	school_city	school_state	school_metro	school_district	...	secondary_focus_subject	secondary_focus_area	resource_type	poverty_level	grade_level	total_price_including_optional_support	students_reached	eligible_double_your_impact_match	date_posted	datefullyfunded
0	00001ccc0e81598c4bd86bacb94d7acb	96963218e74e10c3764a5cfb153e6fea	9f3f9f2c2da7edda5648ccd10554ed8c	1.709930e+11	41.807654	-87.673257	Chicago	IL	urban	Pershing Elem Network	...	Visual Arts	Music & The Arts	Supplies	highest poverty	Grades PreK-2	1498.61	31.0	f	2013-04-14	2013-05-02
1	0000fa3aa8f6649abab23615b546016d	2a578595fe351e7fce057e048c409b18	3432ed3d4466fac2f2ead83ab354e333	6.409801e+10	34.296596	-119.296596	Ventura	CA	urban	Ventura Unif School District	...	Literature & Writing	Literacy & Language	Books	highest poverty	Grades 3-5	282.47	28.0	t	2012-04-07	2012-04-18
2	000134f07d4b30140d63262c871748ff	26bd60377bdbffb53a644a16c5308e82	dc8dcb501c3b2bb0b10e9c6ee2cd8afd	6.227100e+10	34.078625	-118.257834	Los Angeles	CA	urban	Los Angeles Unif Sch Dist	...	Social Sciences	History & Civics	Technology	high poverty	Grades 3-5	1012.38	56.0	f	2012-01-30	2012-04-15
3	0001f2d0b3827bba67cdbeaa248b832d	15d900805d9d716c051c671827109f45	8bea7e8c6e4279fca6276128db89292e	3.600090e+11	40.687286	-73.988217	Brooklyn	NY	urban	New York City Dept Of Ed	...	NaN	NaN	Books	high poverty	Grades PreK-2	175.33	23.0	f	2012-10-11	2012-12-05
4	0004536db996ba697ca72c9e058bfe69	400f8b82bb0143f6a40b217a517fe311	fbdefab6fe41e12c55886c610c110753	3.606870e+11	40.793018	-73.205635	Central Islip	NY	suburban	Central Islip Union Free SD	...	Literature & Writing	Literacy & Language	Technology	high poverty	Grades PreK-2	3591.11	150.0	f	2013-01-08	2013-03-25
	school_state_CA	...	grade_level_Grades PreK-2	eligible_double_your_impact_match_t	total_price_including_optional_support	students_reached	num_projects_funded_within10day
0	0	...	1	0	low	low	1281.0
1	1	...	0	1	low	low	1435.0
2	1	...	0	0	low	low	568.0
3	0	...	1	0	low	low	1973.0
4	0	...	1	0	low	low	1688.0