springboard-curriculum · JLindsey96 · Jun 18, 2024
diff --git a/Notebooks/02_data_wrangling.ipynb b/Notebooks/02_data_wrangling.ipynb
@@ -126,9 +126,9 @@
    "source": [
     "#Code task 1#\n",
     "#Import pandas, matplotlib.pyplot, and seaborn in the correct lines below\n",
-    "import ___ as pd\n",
-    "import ___ as plt\n",
-    "import ___ as sns\n",
+    "import pandas as pd\n",
+    "import matplotlib.pyplot as plt\n",
+    "import seaborn as sns\n",
     "import os\n",
     "\n",
     "from library.sb_utils import save_file\n"
@@ -185,7 +185,7 @@
    "source": [
     "#Code task 2#\n",
     "#Call the info method on ski_data to see a summary of the data\n",
-    "ski_data.___"
+    "ski_data.info()"
    ]
   },
   {
@@ -212,7 +212,7 @@
    "source": [
     "#Code task 3#\n",
     "#Call the head method on ski_data to print the first several rows of the data\n",
-    "ski_data.___"
+    "ski_data.head()"
    ]
   },
   {
@@ -253,7 +253,7 @@
     "#Filter the ski_data dataframe to display just the row for our resort with the name 'Big Mountain Resort'\n",
     "#Hint: you will find that the transpose of the row will give a nicer output. DataFrame's do have a\n",
     "#transpose method, but you can access this conveniently with the `T` property.\n",
-    "ski_data[ski_data.Name == ___].___"
+    "ski_data[ski_data.Name == 'Big Mountain Resort'].transpose()"
    ]
   },
   {
@@ -288,9 +288,9 @@
     "#ski_data as well as the percentages (using `.mean()` instead of `.sum()`).\n",
     "#Order them (increasing or decreasing) using sort_values\n",
     "#Call `pd.concat` to present these in a single table (DataFrame) with the helpful column names 'count' and '%'\n",
-    "missing = ___([ski_data.___.___, 100 * ski_data.___.___], axis=1)\n",
-    "missing.columns=[___, ___]\n",
-    "missing.___(by=___)"
+    "missing = pd.concat([ski_data.isnull().sum(), 100 * ski_data.isnull().mean()], axis=1)\n",
+    "missing.columns=['count', '%']\n",
+    "missing.sort_values(by='count')"
    ]
   },
   {
@@ -322,7 +322,7 @@
    "source": [
     "#Code task 6#\n",
     "#Use ski_data's `select_dtypes` method to select columns of dtype 'object'\n",
-    "ski_data.___(___)"
+    "ski_data.select_dtypes('object')"
    ]
   },
   {
@@ -350,7 +350,7 @@
    "source": [
     "#Code task 7#\n",
     "#Use pandas' Series method `value_counts` to find any duplicated resort names\n",
-    "ski_data['Name'].___.head()"
+    "ski_data['Name'].value_counts().head()"
    ]
   },
   {
@@ -375,7 +375,7 @@
    "source": [
     "#Code task 8#\n",
     "#Concatenate the string columns 'Name' and 'Region' and count the values again (as above)\n",
-    "(ski_data[___] + ', ' + ski_data[___]).___.head()"
+    "(ski_data['Name'] + ', ' + ski_data['Region']).value_counts().head()"
    ]
   },
   {
@@ -386,7 +386,7 @@
    "source": [
     "#Code task 9#\n",
     "#Concatenate 'Name' and 'state' and count the values again (as above)\n",
-    "(ski_data[___] + ', ' + ski_data[___]).___.head()"
+    "(ski_data['Name'] + ', ' + ski_data['state']).value_counts().head()"
    ]
   },
   {
@@ -577,7 +577,7 @@
    "source": [
     "#Code task 10#\n",
     "#Calculate the number of times Region does not equal state\n",
-    "(ski_data.Region ___ ski_data.state).___"
+    "(ski_data.Region != ski_data.state).sum()"
    ]
   },
   {
@@ -661,8 +661,8 @@
     "#Code task 11#\n",
     "#Filter the ski_data dataframe for rows where 'Region' and 'state' are different,\n",
     "#group that by 'state' and perform `value_counts` on the 'Region'\n",
-    "(ski_data[ski_data.___ ___ ski_data.___]\n",
-    " .groupby(___)[___]\n",
+    "(ski_data[ski_data.Region != ski_data.state]\n",
+    " .groupby('state')['Region']\n",
     " .value_counts())"
    ]
   },
@@ -689,7 +689,7 @@
     "#Code task 12#\n",
     "#Select the 'Region' and 'state' columns from ski_data and use the `nunique` method to calculate\n",
     "#the number of unique values in each\n",
-    "ski_data[[___, ___]].___"
+    "ski_data[['Region', 'state']].nunique()"
    ]
   },
   {
@@ -721,21 +721,21 @@
    "source": [
     "#Code task 13#\n",
     "#Create two subplots on 1 row and 2 columns with a figsize of (12, 8)\n",
-    "fig, ax = plt.subplots(___, ___, figsize=(___))\n",
+    "fig, ax = plt.subplots(1, 2, figsize=(12, 8))\n",
     "#Specify a horizontal barplot ('barh') as kind of plot (kind=)\n",
-    "ski_data.Region.value_counts().plot(kind=___, ax=ax[0])\n",
+    "ski_data.Region.value_counts().plot(kind='barh', ax=ax[0])\n",
     "#Give the plot a helpful title of 'Region'\n",
-    "ax[0].set_title(___)\n",
+    "ax[0].set_title('Region')\n",
     "#Label the xaxis 'Count'\n",
-    "ax[0].set_xlabel(___)\n",
+    "ax[0].set_xlabel('Count')\n",
     "#Specify a horizontal barplot ('barh') as kind of plot (kind=)\n",
-    "ski_data.state.value_counts().plot(kind=___, ax=ax[1])\n",
+    "ski_data.state.value_counts().plot(kind='barh', ax=ax[1])\n",
     "#Give the plot a helpful title of 'state'\n",
-    "ax[1].set_title(___)\n",
+    "ax[1].set_title('state')\n",
     "#Label the xaxis 'Count'\n",
-    "ax[1].set_xlabel(___)\n",
+    "ax[1].set_xlabel('Count')\n",
     "#Give the subplots a little \"breathing room\" with a wspace of 0.5\n",
-    "plt.subplots_adjust(wspace=___);\n",
+    "plt.subplots_adjust(wspace=0.5);\n",
     "#You're encouraged to explore a few different figure sizes, orientations, and spacing here\n",
     "# as the importance of easy-to-read and informative figures is frequently understated\n",
     "# and you will find the ability to tweak figures invaluable later on"
@@ -778,7 +778,7 @@
     "#Code task 14#\n",
     "# Calculate average weekday and weekend price by state and sort by the average of the two\n",
     "# Hint: use the pattern dataframe.groupby(<grouping variable>)[<list of columns>].mean()\n",
-    "state_price_means = ski_data.___(___)[[___, ___]].mean()\n",
+    "state_price_means = ski_data.groupby('state')[['AdultWeekday', 'AdultWeekend']].mean()\n",
     "state_price_means.head()"
    ]
   },
@@ -849,11 +849,11 @@
     "#gather the ticket prices from the 'Adultweekday' and 'AdultWeekend' columns using the `value_vars` argument,\n",
     "#call the resultant price column 'Price' via the `value_name` argument,\n",
     "#name the weekday/weekend indicator column 'Ticket' via the `var_name` argument\n",
-    "ticket_prices = pd.melt(ski_data[[___, ___, ___]], \n",
-    "                        id_vars=___, \n",
-    "                        var_name=___, \n",
-    "                        value_vars=[___, ___], \n",
-    "                        value_name=___)"
+    "ticket_prices = pd.melt(ski_data[['state', 'AdultWeekday', 'AdultWeekend']], \n",
+    "                        id_vars='state', \n",
+    "                        var_name='Ticket', \n",
+    "                        value_vars=['AdultWeekday', 'AdultWeekend'], \n",
+    "                        value_name='Price')"
    ]
   },
   {
@@ -958,7 +958,7 @@
     "#with 'state' on the x-axis, 'Price' as the y-value, and a hue that indicates 'Ticket'\n",
     "#This will use boxplot's x, y, hue, and data arguments.\n",
     "plt.subplots(figsize=(12, 8))\n",
-    "sns.boxplot(x=___, y=___, hue=___, data=ticket_prices)\n",
+    "sns.boxplot(x='state', y='Price', hue='Ticket', data=ticket_prices)\n",
     "plt.xticks(rotation='vertical')\n",
     "plt.ylabel('Price ($)')\n",
     "plt.xlabel('State');"
@@ -1020,7 +1020,7 @@
     "#Call ski_data's `describe` method for a statistical summary of the numerical columns\n",
     "#Hint: there are fewer summary stat columns than features, so displaying the transpose\n",
     "#will be useful again\n",
-    "ski_data.___.___"
+    "ski_data.describe().transpose()"
    ]
   },
   {
@@ -1086,8 +1086,8 @@
     "#Try passing it an argument figsize=(15,10)\n",
     "#Try calling plt.subplots_adjust() with an argument hspace=0.5 to adjust the spacing\n",
     "#It's important you create legible and easy-to-read plots\n",
-    "ski_data.___(___)\n",
-    "#plt.subplots_adjust(hspace=___);\n",
+    "ski_data.hist(figsize=(15,10))\n",
+    "plt.subplots_adjust(hspace=0.5);\n",
     "#Hint: notice how the terminating ';' \"swallows\" some messy output and leads to a tidier notebook"
    ]
   },
@@ -1120,7 +1120,7 @@
    "source": [
     "#Code task 19#\n",
     "#Filter the 'SkiableTerrain_ac' column to print the values greater than 10000\n",
-    "ski_data.___[ski_data.___ > ___]"
+    "ski_data.SkiableTerrain_ac[ski_data.SkiableTerrain_ac > 10000]"
    ]
   },
   {
@@ -1139,7 +1139,7 @@
     "#Code task 20#\n",
     "#Now you know there's only one, print the whole row to investigate all values, including seeing the resort name\n",
     "#Hint: don't forget the transpose will be helpful here\n",
-    "ski_data[ski_data.___ > ___].___"
+    "ski_data[ski_data.SkiableTerrain_ac > 10000].transpose()"
    ]
   },
   {
@@ -1185,7 +1185,7 @@
    "source": [
     "#Code task 21#\n",
     "#Use the .loc accessor to print the 'SkiableTerrain_ac' value only for this resort\n",
-    "ski_data.___[39, 'SkiableTerrain_ac']"
+    "ski_data.loc[39, 'SkiableTerrain_ac']"
    ]
   },
   {
@@ -1196,7 +1196,7 @@
    "source": [
     "#Code task 22#\n",
     "#Use the .loc accessor again to modify this value with the correct value of 1819\n",
-    "ski_data.___[39, 'SkiableTerrain_ac'] = ___"
+    "ski_data.loc[39, 'SkiableTerrain_ac'] = 1819"
    ]
   },
   {
@@ -1207,7 +1207,7 @@
    "source": [
     "#Code task 23#\n",
     "#Use the .loc accessor a final time to verify that the value has been modified\n",
-    "ski_data.___[39, 'SkiableTerrain_ac']"
+    "ski_data.loc[39, 'SkiableTerrain_ac']"
    ]
   },
   {
@@ -1559,7 +1559,7 @@
    "source": [
     "#Code task 24#\n",
     "#Drop the 'fastEight' column from ski_data. Use inplace=True\n",
-    "ski_data.drop(columns=___, inplace=___)"
+    "ski_data.drop(columns='fastEight', inplace=True)"
    ]
   },
   {
@@ -1577,7 +1577,7 @@
    "source": [
     "#Code task 25#\n",
     "#Filter the 'yearsOpen' column for values greater than 100\n",
-    "ski_data.___[ski_data.___ > ___]"
+    "ski_data.yearsOpen[ski_data.yearsOpen > 100]"
    ]
   },
   {
@@ -1603,7 +1603,7 @@
     "#Code task 26#\n",
     "#Call the hist method on 'yearsOpen' after filtering for values under 1000\n",
     "#Pass the argument bins=30 to hist(), but feel free to explore other values\n",
-    "ski_data.___[ski_data.___ < ___].hist(___)\n",
+    "ski_data.yearsOpen[ski_data.yearsOpen < 1000].hist(bins=30)\n",
     "plt.xlabel('Years open')\n",
     "plt.ylabel('Count')\n",
     "plt.title('Distribution of years open excluding 2019');"
@@ -1730,13 +1730,13 @@
     "#respectively\n",
     "#Finally, add a call to the reset_index() method (we recommend you experiment with and without this to see\n",
     "#what it does)\n",
-    "state_summary = ski_data.groupby('state').agg(\n",
+    "state_summary = ski_data.groupby('state').agg(\n",
     "    resorts_per_state=pd.NamedAgg(column='Name', aggfunc='size'), #could pick any column here\n",
     "    state_total_skiable_area_ac=pd.NamedAgg(column='SkiableTerrain_ac', aggfunc='sum'),\n",
+    "    state_total_days_open=pd.NamedAgg(column='daysOpenLastYear', aggfunc='sum'),\n",
+    "    state_total_terrain_parks=pd.NamedAgg(column='TerrainParks', aggfunc='sum'),\n",
+    "    state_total_nightskiing_ac=pd.NamedAgg(column='NightSkiing_ac', aggfunc='sum')\n",
+    ").reset_index()\n",
     "state_summary.head()"
    ]
   },
@@ -1856,7 +1856,7 @@
     "#Code task 29#\n",
     "#Use pandas' `read_html` method to read the table from the URL below\n",
     "states_url = 'https://simple.wikipedia.org/w/index.php?title=List_of_U.S._states&oldid=7168473'\n",
-    "usa_states = pd.___(___)"
+    "usa_states = pd.read_html(states_url)"
    ]
   },
   {
@@ -2088,7 +2088,7 @@
     "#Code task 30#\n",
     "#Use the iloc accessor to get the pandas Series for column number 4 from `usa_states`\n",
     "#It should be a column of dates\n",
-    "established = usa_sates.___[:, 4]"
+    "established = usa_states.iloc[:, 4]"
    ]
   },
   {
@@ -2178,8 +2178,8 @@
     "#Now use the iloc accessor again to extract columns 0, 5, and 6 and the dataframe's `copy()` method\n",
     "#Set the names of these extracted columns to 'state', 'state_population', and 'state_area_sq_miles',\n",
     "#respectively.\n",
-    "usa_states_sub = usa_states.___[:, [___]].copy()\n",
-    "usa_states_sub.columns = [___]\n",
+    "usa_states_sub = usa_states.iloc[:, [0, 5, 6]].copy()\n",
+    "usa_states_sub.columns = ['state', 'state_population', 'state_area_sq_miles']\n",
     "usa_states_sub.head()"
    ]
   },
@@ -2199,7 +2199,7 @@
     "#Code task 32#\n",
     "#Find the states in `state_summary` that are not in `usa_states_sub`\n",
     "#Hint: set(list1) - set(list2) is an easy way to get items in list1 that are not in list2\n",
-    "missing_states = ___(state_summary.state) - ___(usa_states_sub.state)\n",
+    "missing_states = set(state_summary.state) - set(usa_states_sub.state)\n",
     "missing_states"
    ]
   },
@@ -2262,7 +2262,7 @@
     "#value='' #empty string as replacement\n",
     "#regex=True #we used a regex in our `to_replace` argument\n",
     "#inplace=True #Do this \"in place\"\n",
-    "usa_states_sub.state.___(to_replace=___, value=__, regex=___, inplace=___)\n",
+    "usa_states_sub.state.replace(to_replace=r'\\[.*\\]', value='', regex=True, inplace=True)\n",
     "usa_states_sub.state[usa_states_sub.state.str.contains('Massachusetts|Pennsylvania|Rhode Island|Virginia')]"
    ]
   },
@@ -2275,7 +2275,7 @@
     "#Code task 34#\n",
     "#And now verify none of our states are missing by checking that there are no states in\n",
     "#state_summary that are not in usa_states_sub (as earlier using `set()`)\n",
-    "missing_states = ___(state_summary.state) - ___(usa_states_sub.state)\n",
+    "missing_states = set(state_summary.state) - set(usa_states_sub.state)\n",
     "missing_states"
    ]
   },
@@ -2295,7 +2295,7 @@
     "#Code task 35#\n",
     "#Use 'state_summary's `merge()` method to combine our new data in 'usa_states_sub'\n",
     "#specify the arguments how='left' and on='state'\n",
-    "state_summary = state_summary.___(usa_states_sub, ___=___, ___=___)\n",
+    "state_summary = state_summary.merge(usa_states_sub, how='left', on='state')\n",
     "state_summary.head()"
    ]
   },
@@ -2329,7 +2329,7 @@
     "#Code task 36#\n",
     "#Use ski_data's `plot()` method to create a scatterplot (kind='scatter') with 'AdultWeekday' on the x-axis and\n",
     "#'AdultWeekend' on the y-axis\n",
-    "ski_data.___(x=___, y=___, kind=___);"
+    "ski_data.plot(x='AdultWeekday', y='AdultWeekend', kind='scatter');"
    ]
   },
   {
@@ -2347,7 +2347,7 @@
    "source": [
     "#Code task 37#\n",
     "#Use the loc accessor on ski_data to print the 'AdultWeekend' and 'AdultWeekday' columns for Montana only\n",
-    "ski_data.___[ski_data.state == ___, [___, ___]]"
+    "ski_data.loc[ski_data.state == 'Montana', ['AdultWeekend', 'AdultWeekday']]"
    ]
   },
   {