Skip to content

Update 02_data_wrangling.ipynb//JTLindsey #94

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
116 changes: 58 additions & 58 deletions Notebooks/02_data_wrangling.ipynb
Original file line number Diff line number Diff line change
Expand Up @@ -126,9 +126,9 @@
"source": [
"#Code task 1#\n",
"#Import pandas, matplotlib.pyplot, and seaborn in the correct lines below\n",
"import ___ as pd\n",
"import ___ as plt\n",
"import ___ as sns\n",
"import pandas as pd\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"import os\n",
"\n",
"from library.sb_utils import save_file\n"
Expand Down Expand Up @@ -185,7 +185,7 @@
"source": [
"#Code task 2#\n",
"#Call the info method on ski_data to see a summary of the data\n",
"ski_data.___"
"ski_data.info()"
]
},
{
Expand All @@ -212,7 +212,7 @@
"source": [
"#Code task 3#\n",
"#Call the head method on ski_data to print the first several rows of the data\n",
"ski_data.___"
"ski_data.head()"
]
},
{
Expand Down Expand Up @@ -253,7 +253,7 @@
"#Filter the ski_data dataframe to display just the row for our resort with the name 'Big Mountain Resort'\n",
"#Hint: you will find that the transpose of the row will give a nicer output. DataFrame's do have a\n",
"#transpose method, but you can access this conveniently with the `T` property.\n",
"ski_data[ski_data.Name == ___].___"
"ski_data[ski_data.Name == 'Big Mountain Resort'].transpose()"
]
},
{
Expand Down Expand Up @@ -288,9 +288,9 @@
"#ski_data as well as the percentages (using `.mean()` instead of `.sum()`).\n",
"#Order them (increasing or decreasing) using sort_values\n",
"#Call `pd.concat` to present these in a single table (DataFrame) with the helpful column names 'count' and '%'\n",
"missing = ___([ski_data.___.___, 100 * ski_data.___.___], axis=1)\n",
"missing.columns=[___, ___]\n",
"missing.___(by=___)"
"#isnull() flags missing cells; .sum() counts them per column and .mean() gives the fraction\n",
"missing = pd.concat([ski_data.isnull().sum(), 100 * ski_data.isnull().mean()], axis=1)\n",
"missing.columns=['count', '%']\n",
"missing.sort_values(by='count')"
]
},
{
Expand Down Expand Up @@ -322,7 +322,7 @@
"source": [
"#Code task 6#\n",
"#Use ski_data's `select_dtypes` method to select columns of dtype 'object'\n",
"ski_data.___(___)"
"ski_data.select_dtypes('object')"
]
},
{
Expand Down Expand Up @@ -350,7 +350,7 @@
"source": [
"#Code task 7#\n",
"#Use pandas' Series method `value_counts` to find any duplicated resort names\n",
"ski_data['Name'].___.head()"
"ski_data['Name'].value_counts().head()"
]
},
{
Expand All @@ -375,7 +375,7 @@
"source": [
"#Code task 8#\n",
"#Concatenate the string columns 'Name' and 'Region' and count the values again (as above)\n",
"(ski_data[___] + ', ' + ski_data[___]).___.head()"
"(ski_data['Name'] + ', ' + ski_data['Region']).value_counts().head()"
]
},
{
Expand All @@ -386,7 +386,7 @@
"source": [
"#Code task 9#\n",
"#Concatenate 'Name' and 'state' and count the values again (as above)\n",
"(ski_data[___] + ', ' + ski_data[___]).___.head()"
"(ski_data['Name'] + ', ' + ski_data['state']).value_counts().head()"
]
},
{
Expand Down Expand Up @@ -577,7 +577,7 @@
"source": [
"#Code task 10#\n",
"#Calculate the number of times Region does not equal state\n",
"(ski_data.Region ___ ski_data.state).___"
"(ski_data.Region != ski_data.state).sum()"
]
},
{
Expand Down Expand Up @@ -661,8 +661,8 @@
"#Code task 11#\n",
"#Filter the ski_data dataframe for rows where 'Region' and 'state' are different,\n",
"#group that by 'state' and perform `value_counts` on the 'Region'\n",
"(ski_data[ski_data.___ ___ ski_data.___]\n",
" .groupby(___)[___]\n",
"(ski_data[ski_data.Region != ski_data.state]\n",
" .groupby('state')['Region']\n",
" .value_counts())"
]
},
Expand All @@ -689,7 +689,7 @@
"#Code task 12#\n",
"#Select the 'Region' and 'state' columns from ski_data and use the `nunique` method to calculate\n",
"#the number of unique values in each\n",
"ski_data[[___, ___]].___"
"ski_data[['Region', 'state']].nunique()"
]
},
{
Expand Down Expand Up @@ -721,21 +721,21 @@
"source": [
"#Code task 13#\n",
"#Create two subplots on 1 row and 2 columns with a figsize of (12, 8)\n",
"fig, ax = plt.subplots(___, ___, figsize=(___))\n",
"fig, ax = plt.subplots(1, 2, figsize=(12, 8))\n",
"#Specify a horizontal barplot ('barh') as kind of plot (kind=)\n",
"ski_data.Region.value_counts().plot(kind=___, ax=ax[0])\n",
"ski_data.Region.value_counts().plot(kind='barh', ax=ax[0])\n",
"#Give the plot a helpful title of 'Region'\n",
"ax[0].set_title(___)\n",
"ax[0].set_title('Region')\n",
"#Label the xaxis 'Count'\n",
"ax[0].set_xlabel(___)\n",
"ax[0].set_xlabel('Count')\n",
"#Specify a horizontal barplot ('barh') as kind of plot (kind=)\n",
"ski_data.state.value_counts().plot(kind=___, ax=ax[1])\n",
"ski_data.state.value_counts().plot(kind='barh', ax=ax[1])\n",
"#Give the plot a helpful title of 'state'\n",
"ax[1].set_title(___)\n",
"ax[1].set_title('state')\n",
"#Label the xaxis 'Count'\n",
"ax[1].set_xlabel(___)\n",
"ax[1].set_xlabel('Count')\n",
"#Give the subplots a little \"breathing room\" with a wspace of 0.5\n",
"plt.subplots_adjust(wspace=___);\n",
"plt.subplots_adjust(wspace=0.5);\n",
"#You're encouraged to explore a few different figure sizes, orientations, and spacing here\n",
"# as the importance of easy-to-read and informative figures is frequently understated\n",
"# and you will find the ability to tweak figures invaluable later on"
Expand Down Expand Up @@ -778,7 +778,7 @@
"#Code task 14#\n",
"# Calculate average weekday and weekend price by state and sort by the average of the two\n",
"# Hint: use the pattern dataframe.groupby(<grouping variable>)[<list of columns>].mean()\n",
"state_price_means = ski_data.___(___)[[___, ___]].mean()\n",
"state_price_means = ski_data.groupby('state')[['AdultWeekday', 'AdultWeekend']].mean()\n",
"state_price_means.head()"
]
},
Expand Down Expand Up @@ -849,11 +849,11 @@
"#gather the ticket prices from the 'Adultweekday' and 'AdultWeekend' columns using the `value_vars` argument,\n",
"#call the resultant price column 'Price' via the `value_name` argument,\n",
"#name the weekday/weekend indicator column 'Ticket' via the `var_name` argument\n",
"ticket_prices = pd.melt(ski_data[[___, ___, ___]], \n",
" id_vars=___, \n",
" var_name=___, \n",
" value_vars=[___, ___], \n",
" value_name=___)"
"ticket_prices = pd.melt(ski_data[['state', 'AdultWeekday', 'AdultWeekend']], \n",
"                        id_vars='state', \n",
"                        var_name='Ticket', \n",
"                        value_vars=['AdultWeekday', 'AdultWeekend'], \n",
"                        value_name='Price')"
]
},
{
Expand Down Expand Up @@ -958,7 +958,7 @@
"#with 'state' on the x-axis, 'Price' as the y-value, and a hue that indicates 'Ticket'\n",
"#This will use boxplot's x, y, hue, and data arguments.\n",
"plt.subplots(figsize=(12, 8))\n",
"sns.boxplot(x=___, y=___, hue=___, data=ticket_prices)\n",
"sns.boxplot(x='state', y='Price', hue='Ticket', data=ticket_prices)\n",
"plt.xticks(rotation='vertical')\n",
"plt.ylabel('Price ($)')\n",
"plt.xlabel('State');"
Expand Down Expand Up @@ -1020,7 +1020,7 @@
"#Call ski_data's `describe` method for a statistical summary of the numerical columns\n",
"#Hint: there are fewer summary stat columns than features, so displaying the transpose\n",
"#will be useful again\n",
"ski_data.___.___"
"ski_data.describe().transpose()"
]
},
{
Expand Down Expand Up @@ -1086,8 +1086,8 @@
"#Try passing it an argument figsize=(15,10)\n",
"#Try calling plt.subplots_adjust() with an argument hspace=0.5 to adjust the spacing\n",
"#It's important you create legible and easy-to-read plots\n",
"ski_data.___(___)\n",
"#plt.subplots_adjust(hspace=___);\n",
"ski_data.hist(figsize=(15,10))\n",
"#plt.subplots_adjust(hspace=0.5);\n",
"#Hint: notice how the terminating ';' \"swallows\" some messy output and leads to a tidier notebook"
]
},
Expand Down Expand Up @@ -1120,7 +1120,7 @@
"source": [
"#Code task 19#\n",
"#Filter the 'SkiableTerrain_ac' column to print the values greater than 10000\n",
"ski_data.___[ski_data.___ > ___]"
"ski_data.SkiableTerrain_ac[ski_data.SkiableTerrain_ac > 10000]"
]
},
{
Expand All @@ -1139,7 +1139,7 @@
"#Code task 20#\n",
"#Now you know there's only one, print the whole row to investigate all values, including seeing the resort name\n",
"#Hint: don't forget the transpose will be helpful here\n",
"ski_data[ski_data.___ > ___].___"
"ski_data[ski_data.SkiableTerrain_ac > 10000].transpose()"
]
},
{
Expand Down Expand Up @@ -1185,7 +1185,7 @@
"source": [
"#Code task 21#\n",
"#Use the .loc accessor to print the 'SkiableTerrain_ac' value only for this resort\n",
"ski_data.___[39, 'SkiableTerrain_ac']"
"ski_data.loc[39, 'SkiableTerrain_ac']"
]
},
{
Expand All @@ -1196,7 +1196,7 @@
"source": [
"#Code task 22#\n",
"#Use the .loc accessor again to modify this value with the correct value of 1819\n",
"ski_data.___[39, 'SkiableTerrain_ac'] = ___"
"ski_data.loc[39, 'SkiableTerrain_ac'] = 1819"
]
},
{
Expand All @@ -1207,7 +1207,7 @@
"source": [
"#Code task 23#\n",
"#Use the .loc accessor a final time to verify that the value has been modified\n",
"ski_data.___[39, 'SkiableTerrain_ac']"
"ski_data.loc[39, 'SkiableTerrain_ac']"
]
},
{
Expand Down Expand Up @@ -1559,7 +1559,7 @@
"source": [
"#Code task 24#\n",
"#Drop the 'fastEight' column from ski_data. Use inplace=True\n",
"ski_data.drop(columns=___, inplace=___)"
"ski_data.drop(columns='fastEight', inplace=True)"
]
},
{
Expand All @@ -1577,7 +1577,7 @@
"source": [
"#Code task 25#\n",
"#Filter the 'yearsOpen' column for values greater than 100\n",
"ski_data.___[ski_data.___ > ___]"
"ski_data.yearsOpen[ski_data.yearsOpen > 100]"
]
},
{
Expand All @@ -1603,7 +1603,7 @@
"#Code task 26#\n",
"#Call the hist method on 'yearsOpen' after filtering for values under 1000\n",
"#Pass the argument bins=30 to hist(), but feel free to explore other values\n",
"ski_data.___[ski_data.___ < ___].hist(___)\n",
"ski_data.yearsOpen[ski_data.yearsOpen < 1000].hist(bins=30)\n",
"plt.xlabel('Years open')\n",
"plt.ylabel('Count')\n",
"plt.title('Distribution of years open excluding 2019');"
Expand Down Expand Up @@ -1730,13 +1730,13 @@
"#respectively\n",
"#Finally, add a call to the reset_index() method (we recommend you experiment with and without this to see\n",
"#what it does)\n",
"state_summary = ski_data.groupby('state').agg(\n",
"    resorts_per_state=pd.NamedAgg(column='Name', aggfunc='size'), #could pick any column here\n",
"    state_total_skiable_area_ac=pd.NamedAgg(column='SkiableTerrain_ac', aggfunc='sum'),\n",
"    #NOTE(review): the three source column names below are not visible in this diff hunk;\n",
"    #they are assumed from the usual version of this exercise -- confirm against ski_data.columns\n",
"    state_total_days_open=pd.NamedAgg(column='daysOpenLastYear', aggfunc='sum'),\n",
"    state_total_terrain_parks=pd.NamedAgg(column='TerrainParks', aggfunc='sum'),\n",
"    state_total_nightskiing_ac=pd.NamedAgg(column='NightSkiing_ac', aggfunc='sum')\n",
").reset_index()\n",
"state_summary.head()"
]
},
Expand Down Expand Up @@ -1856,7 +1856,7 @@
"#Code task 29#\n",
"#Use pandas' `read_html` method to read the table from the URL below\n",
"states_url = 'https://simple.wikipedia.org/w/index.php?title=List_of_U.S._states&oldid=7168473'\n",
"usa_states = pd.___(___)"
"usa_states = pd.read_html(states_url)"
]
},
{
Expand Down Expand Up @@ -2088,7 +2088,7 @@
"#Code task 30#\n",
"#Use the iloc accessor to get the pandas Series for column number 4 from `usa_states`\n",
"#It should be a column of dates\n",
"established = usa_sates.___[:, 4]"
"established = usa_states.iloc[:, 4]"
]
},
{
Expand Down Expand Up @@ -2178,8 +2178,8 @@
"#Now use the iloc accessor again to extract columns 0, 5, and 6 and the dataframe's `copy()` method\n",
"#Set the names of these extracted columns to 'state', 'state_population', and 'state_area_sq_miles',\n",
"#respectively.\n",
"usa_states_sub = usa_states.___[:, [___]].copy()\n",
"usa_states_sub.columns = [___]\n",
"usa_states_sub = usa_states.iloc[:, [0, 5, 6]].copy()\n",
"usa_states_sub.columns = ['state', 'state_population', 'state_area_sq_miles']\n",
"usa_states_sub.head()"
]
},
Expand All @@ -2199,7 +2199,7 @@
"#Code task 32#\n",
"#Find the states in `state_summary` that are not in `usa_states_sub`\n",
"#Hint: set(list1) - set(list2) is an easy way to get items in list1 that are not in list2\n",
"missing_states = ___(state_summary.state) - ___(usa_states_sub.state)\n",
"missing_states = set(state_summary.state) - set(usa_states_sub.state)\n",
"missing_states"
]
},
Expand Down Expand Up @@ -2262,7 +2262,7 @@
"#value='' #empty string as replacement\n",
"#regex=True #we used a regex in our `to_replace` argument\n",
"#inplace=True #Do this \"in place\"\n",
"usa_states_sub.state.___(to_replace=___, value=__, regex=___, inplace=___)\n",
"usa_states_sub.state.replace(to_replace=r'\\[.*\\]', value='', regex=True, inplace=True)\n",
"usa_states_sub.state[usa_states_sub.state.str.contains('Massachusetts|Pennsylvania|Rhode Island|Virginia')]"
]
},
Expand All @@ -2275,7 +2275,7 @@
"#Code task 34#\n",
"#And now verify none of our states are missing by checking that there are no states in\n",
"#state_summary that are not in usa_states_sub (as earlier using `set()`)\n",
"missing_states = ___(state_summary.state) - ___(usa_states_sub.state)\n",
"missing_states = set(state_summary.state) - set(usa_states_sub.state)\n",
"missing_states"
]
},
Expand All @@ -2295,7 +2295,7 @@
"#Code task 35#\n",
"#Use 'state_summary's `merge()` method to combine our new data in 'usa_states_sub'\n",
"#specify the arguments how='left' and on='state'\n",
"state_summary = state_summary.___(usa_states_sub, ___=___, ___=___)\n",
"state_summary = state_summary.merge(usa_states_sub, how='left', on='state')\n",
"state_summary.head()"
]
},
Expand Down Expand Up @@ -2329,7 +2329,7 @@
"#Code task 36#\n",
"#Use ski_data's `plot()` method to create a scatterplot (kind='scatter') with 'AdultWeekday' on the x-axis and\n",
"#'AdultWeekend' on the y-axis\n",
"ski_data.___(x=___, y=___, kind=___);"
"ski_data.plot(x='AdultWeekday', y='AdultWeekend', kind='scatter');"
]
},
{
Expand All @@ -2347,7 +2347,7 @@
"source": [
"#Code task 37#\n",
"#Use the loc accessor on ski_data to print the 'AdultWeekend' and 'AdultWeekday' columns for Montana only\n",
"ski_data.___[ski_data.state == ___, [___, ___]]"
"ski_data.loc[ski_data.state == 'Montana', ['AdultWeekend', 'AdultWeekday']]"
]
},
{
Expand Down