diff --git a/student.ipynb b/student.ipynb index d3bb34af..f8cc5c88 100644 --- a/student.ipynb +++ b/student.ipynb @@ -6,27 +6,5209 @@ "source": [ "## Final Project Submission\n", "\n", - "Please fill out:\n", - "* Student name: \n", - "* Student pace: self paced / part time / full time\n", - "* Scheduled project review date/time: \n", - "* Instructor name: \n", - "* Blog post URL:\n" + "* Student names:\n", + "\n", + "Jeremiah Waiguru\n", + "\n", + "Olive Muloma\n", + "\n", + "Troye Gilbert\n", + "\n", + "Josephine Maro\n", + "\n", + "\n", + "* Student pace: FULL TIME HYBRID\n", + "* Scheduled project review date/time: N/A\n", + "* Instructor name: MARYANN MWIKALI\n", + "* Blog post URL: N/A\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Data\n", + "We have been provided with a dataset with house sale prices in King County, Washington State, USA from 2014 to 2015 to use for this project.\n", + "\n", + "A dataset has been provided and can be found in the kc_house_data.csv file in this repository.\n", + "\n", + "The column names and descriptions as provided can be found in the column_names.md file in this repository.\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# import the necessary libraries\n", + "import pandas as pd\n", + "import numpy as np\n", + "import matplotlib as plt\n", + "import matplotlib.pyplot as plt\n", + "import seaborn as sns\n", + "import statsmodels.formula.api as sfm\n", + "import statsmodels.api as sm\n", + "import scipy.stats as stats\n", + "%matplotlib inline\n", + "\n", + "import warnings\n", + "warnings.filterwarnings('ignore')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### loading dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
datepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontviewconditiongradesqft_abovesqft_basementyr_builtyr_renovatedzipcodelatlongsqft_living15sqft_lot15
id
712930052010/13/2014221900.031.00118056501.0NaNNONEAverage7 Average11800.019550.09817847.5112-122.25713405650
641410019212/9/2014538000.032.25257072422.0NONONEAverage7 Average2170400.019511991.09812547.7210-122.31916907639
56315004002/25/2015180000.021.00770100001.0NONONEAverage6 Low Average7700.01933NaN9802847.7379-122.23327208062
248720087512/9/2014604000.043.00196050001.0NONONEVery Good7 Average1050910.019650.09813647.5208-122.39313605000
19544005102/18/2015510000.032.00168080801.0NONONEAverage8 Good16800.019870.09807447.6168-122.04518007503
\n", + "
" + ], + "text/plain": [ + " date price bedrooms bathrooms sqft_living sqft_lot \\\n", + "id \n", + "7129300520 10/13/2014 221900.0 3 1.00 1180 5650 \n", + "6414100192 12/9/2014 538000.0 3 2.25 2570 7242 \n", + "5631500400 2/25/2015 180000.0 2 1.00 770 10000 \n", + "2487200875 12/9/2014 604000.0 4 3.00 1960 5000 \n", + "1954400510 2/18/2015 510000.0 3 2.00 1680 8080 \n", + "\n", + " floors waterfront view condition grade sqft_above \\\n", + "id \n", + "7129300520 1.0 NaN NONE Average 7 Average 1180 \n", + "6414100192 2.0 NO NONE Average 7 Average 2170 \n", + "5631500400 1.0 NO NONE Average 6 Low Average 770 \n", + "2487200875 1.0 NO NONE Very Good 7 Average 1050 \n", + "1954400510 1.0 NO NONE Average 8 Good 1680 \n", + "\n", + " sqft_basement yr_built yr_renovated zipcode lat long \\\n", + "id \n", + "7129300520 0.0 1955 0.0 98178 47.5112 -122.257 \n", + "6414100192 400.0 1951 1991.0 98125 47.7210 -122.319 \n", + "5631500400 0.0 1933 NaN 98028 47.7379 -122.233 \n", + "2487200875 910.0 1965 0.0 98136 47.5208 -122.393 \n", + "1954400510 0.0 1987 0.0 98074 47.6168 -122.045 \n", + "\n", + " sqft_living15 sqft_lot15 \n", + "id \n", + "7129300520 1340 5650 \n", + "6414100192 1690 7639 \n", + "5631500400 2720 8062 \n", + "2487200875 1360 5000 \n", + "1954400510 1800 7503 " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# loading the data and previewing the dataframe\n", + "df = pd.read_csv('data/kc_house_data.csv', index_col=0)\n", + "df.head()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Preparation\n", + "In this section, we shall be preparing the data for further processing and modelling\n", + "\n", + "### Investigate data types" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(21597, 20)" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + 
"source": [ + "# shape of our data\n", + "df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pricebedroomsbathroomssqft_livingsqft_lotfloorssqft_aboveyr_builtyr_renovatedzipcodelatlongsqft_living15sqft_lot15
count2.159700e+0421597.00000021597.00000021597.0000002.159700e+0421597.00000021597.00000021597.00000017755.00000021597.00000021597.00000021597.00000021597.00000021597.000000
mean5.402966e+053.3732002.1158262080.3218501.509941e+041.4940961788.5968421970.99967683.63677898077.95184547.560093-122.2139821986.62031812758.283512
std3.673681e+050.9262990.768984918.1061254.141264e+040.539683827.75976129.375234399.94641453.5130720.1385520.140724685.23047227274.441950
min7.800000e+041.0000000.500000370.0000005.200000e+021.000000370.0000001900.0000000.00000098001.00000047.155900-122.519000399.000000651.000000
25%3.220000e+053.0000001.7500001430.0000005.040000e+031.0000001190.0000001951.0000000.00000098033.00000047.471100-122.3280001490.0000005100.000000
50%4.500000e+053.0000002.2500001910.0000007.618000e+031.5000001560.0000001975.0000000.00000098065.00000047.571800-122.2310001840.0000007620.000000
75%6.450000e+054.0000002.5000002550.0000001.068500e+042.0000002210.0000001997.0000000.00000098118.00000047.678000-122.1250002360.00000010083.000000
max7.700000e+0633.0000008.00000013540.0000001.651359e+063.5000009410.0000002015.0000002015.00000098199.00000047.777600-121.3150006210.000000871200.000000
\n", + "
" + ], + "text/plain": [ + " price bedrooms bathrooms sqft_living sqft_lot \\\n", + "count 2.159700e+04 21597.000000 21597.000000 21597.000000 2.159700e+04 \n", + "mean 5.402966e+05 3.373200 2.115826 2080.321850 1.509941e+04 \n", + "std 3.673681e+05 0.926299 0.768984 918.106125 4.141264e+04 \n", + "min 7.800000e+04 1.000000 0.500000 370.000000 5.200000e+02 \n", + "25% 3.220000e+05 3.000000 1.750000 1430.000000 5.040000e+03 \n", + "50% 4.500000e+05 3.000000 2.250000 1910.000000 7.618000e+03 \n", + "75% 6.450000e+05 4.000000 2.500000 2550.000000 1.068500e+04 \n", + "max 7.700000e+06 33.000000 8.000000 13540.000000 1.651359e+06 \n", + "\n", + " floors sqft_above yr_built yr_renovated zipcode \\\n", + "count 21597.000000 21597.000000 21597.000000 17755.000000 21597.000000 \n", + "mean 1.494096 1788.596842 1970.999676 83.636778 98077.951845 \n", + "std 0.539683 827.759761 29.375234 399.946414 53.513072 \n", + "min 1.000000 370.000000 1900.000000 0.000000 98001.000000 \n", + "25% 1.000000 1190.000000 1951.000000 0.000000 98033.000000 \n", + "50% 1.500000 1560.000000 1975.000000 0.000000 98065.000000 \n", + "75% 2.000000 2210.000000 1997.000000 0.000000 98118.000000 \n", + "max 3.500000 9410.000000 2015.000000 2015.000000 98199.000000 \n", + "\n", + " lat long sqft_living15 sqft_lot15 \n", + "count 21597.000000 21597.000000 21597.000000 21597.000000 \n", + "mean 47.560093 -122.213982 1986.620318 12758.283512 \n", + "std 0.138552 0.140724 685.230472 27274.441950 \n", + "min 47.155900 -122.519000 399.000000 651.000000 \n", + "25% 47.471100 -122.328000 1490.000000 5100.000000 \n", + "50% 47.571800 -122.231000 1840.000000 7620.000000 \n", + "75% 47.678000 -122.125000 2360.000000 10083.000000 \n", + "max 47.777600 -121.315000 6210.000000 871200.000000 " + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Describing the data\n", + "df.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + 
"metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Int64Index: 21597 entries, 7129300520 to 1523300157\n", + "Data columns (total 20 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 date 21597 non-null object \n", + " 1 price 21597 non-null float64\n", + " 2 bedrooms 21597 non-null int64 \n", + " 3 bathrooms 21597 non-null float64\n", + " 4 sqft_living 21597 non-null int64 \n", + " 5 sqft_lot 21597 non-null int64 \n", + " 6 floors 21597 non-null float64\n", + " 7 waterfront 19221 non-null object \n", + " 8 view 21534 non-null object \n", + " 9 condition 21597 non-null object \n", + " 10 grade 21597 non-null object \n", + " 11 sqft_above 21597 non-null int64 \n", + " 12 sqft_basement 21597 non-null object \n", + " 13 yr_built 21597 non-null int64 \n", + " 14 yr_renovated 17755 non-null float64\n", + " 15 zipcode 21597 non-null int64 \n", + " 16 lat 21597 non-null float64\n", + " 17 long 21597 non-null float64\n", + " 18 sqft_living15 21597 non-null int64 \n", + " 19 sqft_lot15 21597 non-null int64 \n", + "dtypes: float64(6), int64(8), object(6)\n", + "memory usage: 3.5+ MB\n" + ] + } + ], + "source": [ + "# summary of the data\n", + "df.info()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The dataset contains 21,597 entries and 20 columns\n", + "\n", + "Some columns like 'waterfront', 'view', 'yr_renovated' have missing values\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### loading the column.md dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "with open('data/column_names.md', 'r') as file:\n", + " md_lines = file.readlines()\n", + "\n", + "df1 = pd.DataFrame({'Text': md_lines})\n", + "\n", + "pd.set_option('display.max_colwidth',None)\n", + "\n", + "# df1" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + 
"#### cleaning the column_names.md dataset" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "df1[['Column name', 'Descriptions']] = df1['Text'].str.split('-', n=1,expand=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### drop the original 'Text' column" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "df1.drop(columns=['Text'], inplace=True)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### remove rows where the 'Descriptions' column is None" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Column nameDescriptions
1* `id`Unique identifier for a house\\n
2* `date`Date house was sold\\n
3* `price`Sale price (prediction target)\\n
4* `bedrooms`Number of bedrooms\\n
5* `bathrooms`Number of bathrooms\\n
6* `sqft_living`Square footage of living space in the home\\n
7* `sqft_lot`Square footage of the lot\\n
8* `floors`Number of floors (levels) in house\\n
9* `waterfront`Whether the house is on a waterfront\\n
11* `view`Quality of view from house\\n
13* `condition`How good the overall condition of the house is. Related to maintenance of house.\\n
15* `grade`Overall grade of the house. Related to the construction and design of the house.\\n
17* `sqft_above`Square footage of house apart from basement\\n
18* `sqft_basement`Square footage of the basement\\n
19* `yr_built`Year when house was built\\n
20* `yr_renovated`Year when house was renovated\\n
21* `zipcode`ZIP Code used by the United States Postal Service\\n
22* `lat`Latitude coordinate\\n
23* `long`Longitude coordinate\\n
24* `sqft_living15`The square footage of interior housing living space for the nearest 15 neighbors\\n
25* `sqft_lot15`The square footage of the land lots of the nearest 15 neighbors\\n
\n", + "
" + ], + "text/plain": [ + " Column name \\\n", + "1 * `id` \n", + "2 * `date` \n", + "3 * `price` \n", + "4 * `bedrooms` \n", + "5 * `bathrooms` \n", + "6 * `sqft_living` \n", + "7 * `sqft_lot` \n", + "8 * `floors` \n", + "9 * `waterfront` \n", + "11 * `view` \n", + "13 * `condition` \n", + "15 * `grade` \n", + "17 * `sqft_above` \n", + "18 * `sqft_basement` \n", + "19 * `yr_built` \n", + "20 * `yr_renovated` \n", + "21 * `zipcode` \n", + "22 * `lat` \n", + "23 * `long` \n", + "24 * `sqft_living15` \n", + "25 * `sqft_lot15` \n", + "\n", + " Descriptions \n", + "1 Unique identifier for a house\\n \n", + "2 Date house was sold\\n \n", + "3 Sale price (prediction target)\\n \n", + "4 Number of bedrooms\\n \n", + "5 Number of bathrooms\\n \n", + "6 Square footage of living space in the home\\n \n", + "7 Square footage of the lot\\n \n", + "8 Number of floors (levels) in house\\n \n", + "9 Whether the house is on a waterfront\\n \n", + "11 Quality of view from house\\n \n", + "13 How good the overall condition of the house is. Related to maintenance of house.\\n \n", + "15 Overall grade of the house. 
Related to the construction and design of the house.\\n \n", + "17 Square footage of house apart from basement\\n \n", + "18 Square footage of the basement\\n \n", + "19 Year when house was built\\n \n", + "20 Year when house was renovated\\n \n", + "21 ZIP Code used by the United States Postal Service\\n \n", + "22 Latitude coordinate\\n \n", + "23 Longitude coordinate\\n \n", + "24 The square footage of interior housing living space for the nearest 15 neighbors\\n \n", + "25 The square footage of the land lots of the nearest 15 neighbors\\n " + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df1 = df1[df1['Descriptions'].notna()]\n", + "df1" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "Index(['date', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',\n", + " 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above',\n", + " 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long',\n", + " 'sqft_living15', 'sqft_lot15'],\n", + " dtype='object')" + ] + }, + "execution_count": 10, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# checking column names\n", + "df.columns" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "date 0\n", + "price 0\n", + "bedrooms 0\n", + "bathrooms 0\n", + "sqft_living 0\n", + "sqft_lot 0\n", + "floors 0\n", + "waterfront 2376\n", + "view 63\n", + "condition 0\n", + "grade 0\n", + "sqft_above 0\n", + "sqft_basement 0\n", + "yr_built 0\n", + "yr_renovated 3842\n", + "zipcode 0\n", + "lat 0\n", + "long 0\n", + "sqft_living15 0\n", + "sqft_lot15 0\n", + "dtype: int64" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# function to check null values\n", + "def check_null(df):\n", + " return 
df.isna().sum()\n", + "\n", + "# checking for null values in the data\n", + "check_null(df)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "There are missing values in three columns.\n", + "\n", + "Depending on the ratio of missing values, we will decide on what approach to take in dealing with them" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Dealing with missing values\n" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "There is 11.00152798999861 percent of values missing in waterfront.\n", + "There is 0.29170718155299347 percent of values missing in view.\n", + "There is 17.78950780200954 percent of values missing in yr_renovated.\n" + ] + } + ], + "source": [ + "# function to calculate percentage of null values\n", + "def miss_percent(df,col):\n", + " miss = ((df[col].isna().sum()) / len(df[col])) * 100\n", + " return print(f'There is {miss} percent of values missing in {col}.')\n", + "\n", + "# checking percentage of missing values \n", + "miss_percent(df,'waterfront')\n", + "miss_percent(df, 'view')\n", + "miss_percent(df, 'yr_renovated')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The threshold on how to deal with missing values commonly used is 50% and also depends on the specific column. 
The percentages of missing values are low for these specific columns, so we can replace them.\n", + "\n", + "For the year renovated column we may assume a missing value means the house was never renovated; similarly, for the other two columns a missing value most likely means the house has no view or no waterfront. We therefore fill each column with its most common value (the mode), which represents the absence of the feature (for example `0` for `yr_renovated` and `NO` for `waterfront`).\n", + "\n", + "Since the missing values are a small percentage of each column, replacing them with the mode won't skew the data or give false conclusions." + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "There is 11.00% of values missing in waterfront.\n", + "There is 0.29% of values missing in view.\n", + "There is 17.79% of values missing in yr_renovated.\n" + ] + } + ], + "source": [ + "def miss_percent(df, col, fill_value=None):\n", + " miss = ((df[col].isna().sum()) / len(df[col])) * 100\n", + " if fill_value is not None:\n", + " df[col].fillna(fill_value, inplace=True)\n", + " return miss\n", + "\n", + "# checking percentage of missing values and filling missing values with the mode\n", + "fill_values = {'waterfront': df['waterfront'].mode()[0], \n", + " 'view': df['view'].mode()[0], \n", + " 'yr_renovated': df['yr_renovated'].mode()[0]}\n", + "\n", + "for col in fill_values:\n", + " missing_percent = miss_percent(df, col, fill_value=fill_values[col])\n", + " print(f'There is {missing_percent:.2f}% of values missing in {col}.')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### check duplicates" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "duplicates = df.duplicated().sum()\n", + "duplicates" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ +
"### EDA 1\n", + "\n", + "Exploratory data analysis, including visualizations like scatter plots, bar plots, and heatmaps, is essential before creating regression models. These analyses help understand variable relationships, identify influential predictors, validate model assumptions, and guide feature selection. Visual exploration ensures that subsequent regression models are well-informed, optimized, and interpretable. It also aids in interpreting results and effectively communicating insights, enhancing the overall quality and utility of regression analysis. Conducting thorough exploratory analysis before modeling is crucial for building reliable and actionable regression models." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "***1. Heatmap***\n", + "\n", + "The heatmap of correlations between price and features (bedrooms, bathrooms, sqft_living, sqft_lot) is essential for both linear and multilinear regression. It identifies influential predictors based on their relationships with the target variable (price). This helps prioritize predictors in linear models and detect multicollinearity in multilinear models, ensuring stable and interpretable models. Overall, the heatmap guides feature selection and model interpretation in regression analysis." + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(10, 6))\n", + "sns.heatmap(df[['price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot']].corr(), annot=True, cmap='coolwarm')\n", + "plt.title('Correlation Heatmap')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "***2. Bar Graph***\n", + "\n", + "The bar plot of price by condition is essential for developing regression models. It identifies influential condition categories for predictor selection and aids in understanding price variations. This visualization guides preprocessing of categorical variables and validates predictor-target relationships. Overall, it informs feature selection, interpretation, and validation in regression modeling." + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(10, 6))\n", + "sns.barplot(x='condition', y='price', data=df, color='blue')\n", + "plt.title('Price Distribution by Condition')\n", + "plt.xlabel('Condition')\n", + "plt.ylabel('Price')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "***3. Scatter plot***\n", + "\n", + "The scatter plots of price against square footage of living space (`sqft_living`) and lot size (`sqft_lot`) provide insights for linear and multilinear regression models. They show how price relates to these predictors, helping assess linearity and identify outliers. Clear trends in these plots guide decisions on model complexity and feature engineering, essential for accurate regression analysis." + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": { + "needs_background": "light" + }, + "output_type": "display_data" + } + ], + "source": [ + "plt.figure(figsize=(12, 6))\n", + "plt.subplot(1, 2, 1)\n", + "sns.scatterplot(x='sqft_living', y='price', data=df)\n", + "plt.title('Price vs Sqft Living')\n", + "plt.subplot(1, 2, 2)\n", + "sns.scatterplot(x='sqft_lot', y='price', data=df)\n", + "plt.title('Price vs Sqft Lot')\n", + "plt.show()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### PRE-PROCESSING" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Int64Index: 21597 entries, 7129300520 to 1523300157\n", + "Data columns (total 20 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 date 21597 non-null object \n", + " 1 price 21597 non-null float64\n", + " 2 bedrooms 21597 non-null int64 \n", + " 3 bathrooms 21597 non-null float64\n", + " 4 sqft_living 21597 non-null int64 \n", + " 5 sqft_lot 21597 non-null int64 \n", + " 6 floors 21597 non-null float64\n", + " 7 waterfront 21597 non-null object \n", + " 8 view 21597 non-null object \n", + " 9 condition 21597 non-null object \n", + " 10 grade 21597 non-null object \n", + " 11 sqft_above 21597 non-null int64 \n", + " 12 sqft_basement 21597 non-null object \n", + " 13 yr_built 21597 non-null int64 \n", + " 14 yr_renovated 21597 non-null float64\n", + " 15 zipcode 21597 non-null int64 \n", + " 16 lat 21597 non-null float64\n", + " 17 long 21597 non-null float64\n", + " 18 sqft_living15 21597 non-null int64 \n", + " 19 sqft_lot15 21597 non-null int64 \n", + "dtypes: float64(6), int64(8), object(6)\n", + "memory usage: 4.1+ MB\n" + ] + } + ], + "source": [ + "df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
datepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontviewconditiongradesqft_abovesqft_basementyr_builtyr_renovatedzipcodelatlongsqft_living15sqft_lot15
id
71293005202014-10-13221900.031.00118056501.0NONONEAverage7 Average11800.019550.09817847.5112-122.25713405650
64141001922014-12-09538000.032.25257072422.0NONONEAverage7 Average2170400.019511991.09812547.7210-122.31916907639
56315004002015-02-25180000.021.00770100001.0NONONEAverage6 Low Average7700.019330.09802847.7379-122.23327208062
24872008752014-12-09604000.043.00196050001.0NONONEVery Good7 Average1050910.019650.09813647.5208-122.39313605000
19544005102015-02-18510000.032.00168080801.0NONONEAverage8 Good16800.019870.09807447.6168-122.04518007503
...............................................................
2630000182014-05-21360000.032.50153011313.0NONONEAverage8 Good15300.020090.09810347.6993-122.34615301509
66000601202015-02-23400000.042.50231058132.0NONONEAverage8 Good23100.020140.09814647.5107-122.36218307200
15233001412014-06-23402101.020.75102013502.0NONONEAverage7 Average10200.020090.09814447.5944-122.29910202007
2913101002015-01-16400000.032.50160023882.0NONONEAverage8 Good16000.020040.09802747.5345-122.06914101287
15233001572014-10-15325000.020.75102010762.0NONONEAverage7 Average10200.020080.09814447.5941-122.29910201357
\n", + "

21597 rows × 20 columns

\n", + "
" + ], + "text/plain": [ + " date price bedrooms bathrooms sqft_living sqft_lot \\\n", + "id \n", + "7129300520 2014-10-13 221900.0 3 1.00 1180 5650 \n", + "6414100192 2014-12-09 538000.0 3 2.25 2570 7242 \n", + "5631500400 2015-02-25 180000.0 2 1.00 770 10000 \n", + "2487200875 2014-12-09 604000.0 4 3.00 1960 5000 \n", + "1954400510 2015-02-18 510000.0 3 2.00 1680 8080 \n", + "... ... ... ... ... ... ... \n", + "263000018 2014-05-21 360000.0 3 2.50 1530 1131 \n", + "6600060120 2015-02-23 400000.0 4 2.50 2310 5813 \n", + "1523300141 2014-06-23 402101.0 2 0.75 1020 1350 \n", + "291310100 2015-01-16 400000.0 3 2.50 1600 2388 \n", + "1523300157 2014-10-15 325000.0 2 0.75 1020 1076 \n", + "\n", + " floors waterfront view condition grade sqft_above \\\n", + "id \n", + "7129300520 1.0 NO NONE Average 7 Average 1180 \n", + "6414100192 2.0 NO NONE Average 7 Average 2170 \n", + "5631500400 1.0 NO NONE Average 6 Low Average 770 \n", + "2487200875 1.0 NO NONE Very Good 7 Average 1050 \n", + "1954400510 1.0 NO NONE Average 8 Good 1680 \n", + "... ... ... ... ... ... ... \n", + "263000018 3.0 NO NONE Average 8 Good 1530 \n", + "6600060120 2.0 NO NONE Average 8 Good 2310 \n", + "1523300141 2.0 NO NONE Average 7 Average 1020 \n", + "291310100 2.0 NO NONE Average 8 Good 1600 \n", + "1523300157 2.0 NO NONE Average 7 Average 1020 \n", + "\n", + " sqft_basement yr_built yr_renovated zipcode lat long \\\n", + "id \n", + "7129300520 0.0 1955 0.0 98178 47.5112 -122.257 \n", + "6414100192 400.0 1951 1991.0 98125 47.7210 -122.319 \n", + "5631500400 0.0 1933 0.0 98028 47.7379 -122.233 \n", + "2487200875 910.0 1965 0.0 98136 47.5208 -122.393 \n", + "1954400510 0.0 1987 0.0 98074 47.6168 -122.045 \n", + "... ... ... ... ... ... ... 
\n", + "263000018 0.0 2009 0.0 98103 47.6993 -122.346 \n", + "6600060120 0.0 2014 0.0 98146 47.5107 -122.362 \n", + "1523300141 0.0 2009 0.0 98144 47.5944 -122.299 \n", + "291310100 0.0 2004 0.0 98027 47.5345 -122.069 \n", + "1523300157 0.0 2008 0.0 98144 47.5941 -122.299 \n", + "\n", + " sqft_living15 sqft_lot15 \n", + "id \n", + "7129300520 1340 5650 \n", + "6414100192 1690 7639 \n", + "5631500400 2720 8062 \n", + "2487200875 1360 5000 \n", + "1954400510 1800 7503 \n", + "... ... ... \n", + "263000018 1530 1509 \n", + "6600060120 1830 7200 \n", + "1523300141 1020 2007 \n", + "291310100 1410 1287 \n", + "1523300157 1020 1357 \n", + "\n", + "[21597 rows x 20 columns]" + ] + }, + "execution_count": 21, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Converting date to datetime format \n", + "df['date'] = pd.to_datetime(df['date'])\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
datepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontviewconditiongradesqft_abovesqft_basementyr_builtyr_renovatedzipcodelatlongsqft_living15sqft_lot15
id
71293005202014-10-13221900.031.00118056501.0NONONEAverage7 Average11800.019550.09817847.5112-122.25713405650
64141001922014-12-09538000.032.25257072422.0NONONEAverage7 Average2170400.019511.09812547.7210-122.31916907639
56315004002015-02-25180000.021.00770100001.0NONONEAverage6 Low Average7700.019330.09802847.7379-122.23327208062
24872008752014-12-09604000.043.00196050001.0NONONEVery Good7 Average1050910.019650.09813647.5208-122.39313605000
19544005102015-02-18510000.032.00168080801.0NONONEAverage8 Good16800.019870.09807447.6168-122.04518007503
...............................................................
2630000182014-05-21360000.032.50153011313.0NONONEAverage8 Good15300.020090.09810347.6993-122.34615301509
66000601202015-02-23400000.042.50231058132.0NONONEAverage8 Good23100.020140.09814647.5107-122.36218307200
15233001412014-06-23402101.020.75102013502.0NONONEAverage7 Average10200.020090.09814447.5944-122.29910202007
2913101002015-01-16400000.032.50160023882.0NONONEAverage8 Good16000.020040.09802747.5345-122.06914101287
15233001572014-10-15325000.020.75102010762.0NONONEAverage7 Average10200.020080.09814447.5941-122.29910201357
\n", + "

21597 rows × 20 columns

\n", + "
" + ], + "text/plain": [ + " date price bedrooms bathrooms sqft_living sqft_lot \\\n", + "id \n", + "7129300520 2014-10-13 221900.0 3 1.00 1180 5650 \n", + "6414100192 2014-12-09 538000.0 3 2.25 2570 7242 \n", + "5631500400 2015-02-25 180000.0 2 1.00 770 10000 \n", + "2487200875 2014-12-09 604000.0 4 3.00 1960 5000 \n", + "1954400510 2015-02-18 510000.0 3 2.00 1680 8080 \n", + "... ... ... ... ... ... ... \n", + "263000018 2014-05-21 360000.0 3 2.50 1530 1131 \n", + "6600060120 2015-02-23 400000.0 4 2.50 2310 5813 \n", + "1523300141 2014-06-23 402101.0 2 0.75 1020 1350 \n", + "291310100 2015-01-16 400000.0 3 2.50 1600 2388 \n", + "1523300157 2014-10-15 325000.0 2 0.75 1020 1076 \n", + "\n", + " floors waterfront view condition grade sqft_above \\\n", + "id \n", + "7129300520 1.0 NO NONE Average 7 Average 1180 \n", + "6414100192 2.0 NO NONE Average 7 Average 2170 \n", + "5631500400 1.0 NO NONE Average 6 Low Average 770 \n", + "2487200875 1.0 NO NONE Very Good 7 Average 1050 \n", + "1954400510 1.0 NO NONE Average 8 Good 1680 \n", + "... ... ... ... ... ... ... \n", + "263000018 3.0 NO NONE Average 8 Good 1530 \n", + "6600060120 2.0 NO NONE Average 8 Good 2310 \n", + "1523300141 2.0 NO NONE Average 7 Average 1020 \n", + "291310100 2.0 NO NONE Average 8 Good 1600 \n", + "1523300157 2.0 NO NONE Average 7 Average 1020 \n", + "\n", + " sqft_basement yr_built yr_renovated zipcode lat long \\\n", + "id \n", + "7129300520 0.0 1955 0.0 98178 47.5112 -122.257 \n", + "6414100192 400.0 1951 1.0 98125 47.7210 -122.319 \n", + "5631500400 0.0 1933 0.0 98028 47.7379 -122.233 \n", + "2487200875 910.0 1965 0.0 98136 47.5208 -122.393 \n", + "1954400510 0.0 1987 0.0 98074 47.6168 -122.045 \n", + "... ... ... ... ... ... ... 
\n", + "263000018 0.0 2009 0.0 98103 47.6993 -122.346 \n", + "6600060120 0.0 2014 0.0 98146 47.5107 -122.362 \n", + "1523300141 0.0 2009 0.0 98144 47.5944 -122.299 \n", + "291310100 0.0 2004 0.0 98027 47.5345 -122.069 \n", + "1523300157 0.0 2008 0.0 98144 47.5941 -122.299 \n", + "\n", + " sqft_living15 sqft_lot15 \n", + "id \n", + "7129300520 1340 5650 \n", + "6414100192 1690 7639 \n", + "5631500400 2720 8062 \n", + "2487200875 1360 5000 \n", + "1954400510 1800 7503 \n", + "... ... ... \n", + "263000018 1530 1509 \n", + "6600060120 1830 7200 \n", + "1523300141 1020 2007 \n", + "291310100 1410 1287 \n", + "1523300157 1020 1357 \n", + "\n", + "[21597 rows x 20 columns]" + ] + }, + "execution_count": 22, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Convert any houses that have been renovated to '1' to indicate true and any houses that have not been renovated to '0' to indicate false.\n", + "df['yr_renovated'] = df['yr_renovated'].apply(lambda x: 1 if x > 0 else x)\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Int64Index: 21597 entries, 7129300520 to 1523300157\n", + "Data columns (total 20 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 date 21597 non-null datetime64[ns]\n", + " 1 price 21597 non-null float64 \n", + " 2 bedrooms 21597 non-null int64 \n", + " 3 bathrooms 21597 non-null float64 \n", + " 4 sqft_living 21597 non-null int64 \n", + " 5 sqft_lot 21597 non-null int64 \n", + " 6 floors 21597 non-null float64 \n", + " 7 waterfront 21597 non-null object \n", + " 8 view 21597 non-null object \n", + " 9 condition 21597 non-null object \n", + " 10 grade 21597 non-null object \n", + " 11 sqft_above 21597 non-null int64 \n", + " 12 sqft_basement 21597 non-null object \n", + " 13 yr_built 21597 non-null int64 \n", + " 14 yr_renovated 21597 non-null 
float64 \n", + " 15 zipcode 21597 non-null int64 \n", + " 16 lat 21597 non-null float64 \n", + " 17 long 21597 non-null float64 \n", + " 18 sqft_living15 21597 non-null int64 \n", + " 19 sqft_lot15 21597 non-null int64 \n", + "dtypes: datetime64[ns](1), float64(6), int64(8), object(5)\n", + "memory usage: 4.1+ MB\n" + ] + } + ], + "source": [ + "df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['0.0', '400.0', '910.0', '1530.0', '?', '730.0', '1700.0', '300.0',\n", + " '970.0', '760.0', '720.0', '700.0', '820.0', '780.0', '790.0',\n", + " '330.0', '1620.0', '360.0', '588.0', '1510.0', '410.0', '990.0',\n", + " '600.0', '560.0', '550.0', '1000.0', '1600.0', '500.0', '1040.0',\n", + " '880.0', '1010.0', '240.0', '265.0', '290.0', '800.0', '540.0',\n", + " '710.0', '840.0', '380.0', '770.0', '480.0', '570.0', '1490.0',\n", + " '620.0', '1250.0', '1270.0', '120.0', '650.0', '180.0', '1130.0',\n", + " '450.0', '1640.0', '1460.0', '1020.0', '1030.0', '750.0', '640.0',\n", + " '1070.0', '490.0', '1310.0', '630.0', '2000.0', '390.0', '430.0',\n", + " '850.0', '210.0', '1430.0', '1950.0', '440.0', '220.0', '1160.0',\n", + " '860.0', '580.0', '2060.0', '1820.0', '1180.0', '200.0', '1150.0',\n", + " '1200.0', '680.0', '530.0', '1450.0', '1170.0', '1080.0', '960.0',\n", + " '280.0', '870.0', '1100.0', '460.0', '1400.0', '660.0', '1220.0',\n", + " '900.0', '420.0', '1580.0', '1380.0', '475.0', '690.0', '270.0',\n", + " '350.0', '935.0', '1370.0', '980.0', '1470.0', '160.0', '950.0',\n", + " '50.0', '740.0', '1780.0', '1900.0', '340.0', '470.0', '370.0',\n", + " '140.0', '1760.0', '130.0', '520.0', '890.0', '1110.0', '150.0',\n", + " '1720.0', '810.0', '190.0', '1290.0', '670.0', '1800.0', '1120.0',\n", + " '1810.0', '60.0', '1050.0', '940.0', '310.0', '930.0', '1390.0',\n", + " '610.0', '1830.0', '1300.0', '510.0', '1330.0', '1590.0', '920.0',\n", + " '1320.0', '1420.0', '1240.0', 
'1960.0', '1560.0', '2020.0',\n", + " '1190.0', '2110.0', '1280.0', '250.0', '2390.0', '1230.0', '170.0',\n", + " '830.0', '1260.0', '1410.0', '1340.0', '590.0', '1500.0', '1140.0',\n", + " '260.0', '100.0', '320.0', '1480.0', '1060.0', '1284.0', '1670.0',\n", + " '1350.0', '2570.0', '1090.0', '110.0', '2500.0', '90.0', '1940.0',\n", + " '1550.0', '2350.0', '2490.0', '1481.0', '1360.0', '1135.0',\n", + " '1520.0', '1850.0', '1660.0', '2130.0', '2600.0', '1690.0',\n", + " '243.0', '1210.0', '1024.0', '1798.0', '1610.0', '1440.0',\n", + " '1570.0', '1650.0', '704.0', '1910.0', '1630.0', '2360.0',\n", + " '1852.0', '2090.0', '2400.0', '1790.0', '2150.0', '230.0', '70.0',\n", + " '1680.0', '2100.0', '3000.0', '1870.0', '1710.0', '2030.0',\n", + " '875.0', '1540.0', '2850.0', '2170.0', '506.0', '906.0', '145.0',\n", + " '2040.0', '784.0', '1750.0', '374.0', '518.0', '2720.0', '2730.0',\n", + " '1840.0', '3480.0', '2160.0', '1920.0', '2330.0', '1860.0',\n", + " '2050.0', '4820.0', '1913.0', '80.0', '2010.0', '3260.0', '2200.0',\n", + " '415.0', '1730.0', '652.0', '2196.0', '1930.0', '515.0', '40.0',\n", + " '2080.0', '2580.0', '1548.0', '1740.0', '235.0', '861.0', '1890.0',\n", + " '2220.0', '792.0', '2070.0', '4130.0', '2250.0', '2240.0',\n", + " '1990.0', '768.0', '2550.0', '435.0', '1008.0', '2300.0', '2610.0',\n", + " '666.0', '3500.0', '172.0', '1816.0', '2190.0', '1245.0', '1525.0',\n", + " '1880.0', '862.0', '946.0', '1281.0', '414.0', '2180.0', '276.0',\n", + " '1248.0', '602.0', '516.0', '176.0', '225.0', '1275.0', '266.0',\n", + " '283.0', '65.0', '2310.0', '10.0', '1770.0', '2120.0', '295.0',\n", + " '207.0', '915.0', '556.0', '417.0', '143.0', '508.0', '2810.0',\n", + " '20.0', '274.0', '248.0'], dtype=object)" + ] + }, + "execution_count": 25, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['sqft_basement'].unique()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "sqft_basement has a '?' 
value; the next cell replaces it with NaN (missing) so the column can be converted to float" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
datepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontviewconditiongradesqft_abovesqft_basementyr_builtyr_renovatedzipcodelatlongsqft_living15sqft_lot15
id
71293005202014-10-13221900.031.00118056501.0NONONEAverage7 Average11800.019550.09817847.5112-122.25713405650
64141001922014-12-09538000.032.25257072422.0NONONEAverage7 Average2170400.019511.09812547.7210-122.31916907639
56315004002015-02-25180000.021.00770100001.0NONONEAverage6 Low Average7700.019330.09802847.7379-122.23327208062
24872008752014-12-09604000.043.00196050001.0NONONEVery Good7 Average1050910.019650.09813647.5208-122.39313605000
19544005102015-02-18510000.032.00168080801.0NONONEAverage8 Good16800.019870.09807447.6168-122.04518007503
...............................................................
2630000182014-05-21360000.032.50153011313.0NONONEAverage8 Good15300.020090.09810347.6993-122.34615301509
66000601202015-02-23400000.042.50231058132.0NONONEAverage8 Good23100.020140.09814647.5107-122.36218307200
15233001412014-06-23402101.020.75102013502.0NONONEAverage7 Average10200.020090.09814447.5944-122.29910202007
2913101002015-01-16400000.032.50160023882.0NONONEAverage8 Good16000.020040.09802747.5345-122.06914101287
15233001572014-10-15325000.020.75102010762.0NONONEAverage7 Average10200.020080.09814447.5941-122.29910201357
\n", + "

21597 rows × 20 columns

\n", + "
" + ], + "text/plain": [ + " date price bedrooms bathrooms sqft_living sqft_lot \\\n", + "id \n", + "7129300520 2014-10-13 221900.0 3 1.00 1180 5650 \n", + "6414100192 2014-12-09 538000.0 3 2.25 2570 7242 \n", + "5631500400 2015-02-25 180000.0 2 1.00 770 10000 \n", + "2487200875 2014-12-09 604000.0 4 3.00 1960 5000 \n", + "1954400510 2015-02-18 510000.0 3 2.00 1680 8080 \n", + "... ... ... ... ... ... ... \n", + "263000018 2014-05-21 360000.0 3 2.50 1530 1131 \n", + "6600060120 2015-02-23 400000.0 4 2.50 2310 5813 \n", + "1523300141 2014-06-23 402101.0 2 0.75 1020 1350 \n", + "291310100 2015-01-16 400000.0 3 2.50 1600 2388 \n", + "1523300157 2014-10-15 325000.0 2 0.75 1020 1076 \n", + "\n", + " floors waterfront view condition grade sqft_above \\\n", + "id \n", + "7129300520 1.0 NO NONE Average 7 Average 1180 \n", + "6414100192 2.0 NO NONE Average 7 Average 2170 \n", + "5631500400 1.0 NO NONE Average 6 Low Average 770 \n", + "2487200875 1.0 NO NONE Very Good 7 Average 1050 \n", + "1954400510 1.0 NO NONE Average 8 Good 1680 \n", + "... ... ... ... ... ... ... \n", + "263000018 3.0 NO NONE Average 8 Good 1530 \n", + "6600060120 2.0 NO NONE Average 8 Good 2310 \n", + "1523300141 2.0 NO NONE Average 7 Average 1020 \n", + "291310100 2.0 NO NONE Average 8 Good 1600 \n", + "1523300157 2.0 NO NONE Average 7 Average 1020 \n", + "\n", + " sqft_basement yr_built yr_renovated zipcode lat long \\\n", + "id \n", + "7129300520 0.0 1955 0.0 98178 47.5112 -122.257 \n", + "6414100192 400.0 1951 1.0 98125 47.7210 -122.319 \n", + "5631500400 0.0 1933 0.0 98028 47.7379 -122.233 \n", + "2487200875 910.0 1965 0.0 98136 47.5208 -122.393 \n", + "1954400510 0.0 1987 0.0 98074 47.6168 -122.045 \n", + "... ... ... ... ... ... ... 
\n", + "263000018 0.0 2009 0.0 98103 47.6993 -122.346 \n", + "6600060120 0.0 2014 0.0 98146 47.5107 -122.362 \n", + "1523300141 0.0 2009 0.0 98144 47.5944 -122.299 \n", + "291310100 0.0 2004 0.0 98027 47.5345 -122.069 \n", + "1523300157 0.0 2008 0.0 98144 47.5941 -122.299 \n", + "\n", + " sqft_living15 sqft_lot15 \n", + "id \n", + "7129300520 1340 5650 \n", + "6414100192 1690 7639 \n", + "5631500400 2720 8062 \n", + "2487200875 1360 5000 \n", + "1954400510 1800 7503 \n", + "... ... ... \n", + "263000018 1530 1509 \n", + "6600060120 1830 7200 \n", + "1523300141 1020 2007 \n", + "291310100 1410 1287 \n", + "1523300157 1020 1357 \n", + "\n", + "[21597 rows x 20 columns]" + ] + }, + "execution_count": 26, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Convert sqft_basement to int and replace ? with 0\n", + "df['sqft_basement'] = df['sqft_basement'].replace({'?':np.nan}).astype(float)\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
datepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontviewcondition...sqft_basementyr_builtyr_renovatedzipcodelatlongsqft_living15sqft_lot15month_of_dateyear_of_date
id
71293005202014-10-13221900.031.00118056501.0NONONEAverage...0.019550.09817847.5112-122.25713405650102014
64141001922014-12-09538000.032.25257072422.0NONONEAverage...400.019511.09812547.7210-122.31916907639122014
56315004002015-02-25180000.021.00770100001.0NONONEAverage...0.019330.09802847.7379-122.2332720806222015
24872008752014-12-09604000.043.00196050001.0NONONEVery Good...910.019650.09813647.5208-122.39313605000122014
19544005102015-02-18510000.032.00168080801.0NONONEAverage...0.019870.09807447.6168-122.0451800750322015
..................................................................
2630000182014-05-21360000.032.50153011313.0NONONEAverage...0.020090.09810347.6993-122.3461530150952014
66000601202015-02-23400000.042.50231058132.0NONONEAverage...0.020140.09814647.5107-122.3621830720022015
15233001412014-06-23402101.020.75102013502.0NONONEAverage...0.020090.09814447.5944-122.2991020200762014
2913101002015-01-16400000.032.50160023882.0NONONEAverage...0.020040.09802747.5345-122.0691410128712015
15233001572014-10-15325000.020.75102010762.0NONONEAverage...0.020080.09814447.5941-122.29910201357102014
\n", + "

21597 rows × 22 columns

\n", + "
" + ], + "text/plain": [ + " date price bedrooms bathrooms sqft_living sqft_lot \\\n", + "id \n", + "7129300520 2014-10-13 221900.0 3 1.00 1180 5650 \n", + "6414100192 2014-12-09 538000.0 3 2.25 2570 7242 \n", + "5631500400 2015-02-25 180000.0 2 1.00 770 10000 \n", + "2487200875 2014-12-09 604000.0 4 3.00 1960 5000 \n", + "1954400510 2015-02-18 510000.0 3 2.00 1680 8080 \n", + "... ... ... ... ... ... ... \n", + "263000018 2014-05-21 360000.0 3 2.50 1530 1131 \n", + "6600060120 2015-02-23 400000.0 4 2.50 2310 5813 \n", + "1523300141 2014-06-23 402101.0 2 0.75 1020 1350 \n", + "291310100 2015-01-16 400000.0 3 2.50 1600 2388 \n", + "1523300157 2014-10-15 325000.0 2 0.75 1020 1076 \n", + "\n", + " floors waterfront view condition ... sqft_basement yr_built \\\n", + "id ... \n", + "7129300520 1.0 NO NONE Average ... 0.0 1955 \n", + "6414100192 2.0 NO NONE Average ... 400.0 1951 \n", + "5631500400 1.0 NO NONE Average ... 0.0 1933 \n", + "2487200875 1.0 NO NONE Very Good ... 910.0 1965 \n", + "1954400510 1.0 NO NONE Average ... 0.0 1987 \n", + "... ... ... ... ... ... ... ... \n", + "263000018 3.0 NO NONE Average ... 0.0 2009 \n", + "6600060120 2.0 NO NONE Average ... 0.0 2014 \n", + "1523300141 2.0 NO NONE Average ... 0.0 2009 \n", + "291310100 2.0 NO NONE Average ... 0.0 2004 \n", + "1523300157 2.0 NO NONE Average ... 0.0 2008 \n", + "\n", + " yr_renovated zipcode lat long sqft_living15 \\\n", + "id \n", + "7129300520 0.0 98178 47.5112 -122.257 1340 \n", + "6414100192 1.0 98125 47.7210 -122.319 1690 \n", + "5631500400 0.0 98028 47.7379 -122.233 2720 \n", + "2487200875 0.0 98136 47.5208 -122.393 1360 \n", + "1954400510 0.0 98074 47.6168 -122.045 1800 \n", + "... ... ... ... ... ... 
\n", + "263000018 0.0 98103 47.6993 -122.346 1530 \n", + "6600060120 0.0 98146 47.5107 -122.362 1830 \n", + "1523300141 0.0 98144 47.5944 -122.299 1020 \n", + "291310100 0.0 98027 47.5345 -122.069 1410 \n", + "1523300157 0.0 98144 47.5941 -122.299 1020 \n", + "\n", + " sqft_lot15 month_of_date year_of_date \n", + "id \n", + "7129300520 5650 10 2014 \n", + "6414100192 7639 12 2014 \n", + "5631500400 8062 2 2015 \n", + "2487200875 5000 12 2014 \n", + "1954400510 7503 2 2015 \n", + "... ... ... ... \n", + "263000018 1509 5 2014 \n", + "6600060120 7200 2 2015 \n", + "1523300141 2007 6 2014 \n", + "291310100 1287 1 2015 \n", + "1523300157 1357 10 2014 \n", + "\n", + "[21597 rows x 22 columns]" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Retrieve the month and year from the date column.\n", + "df['month_of_date'] = pd.DatetimeIndex(df['date']).month\n", + "df['year_of_date'] = pd.DatetimeIndex(df['date']).year\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontviewconditiongrade...sqft_basementyr_builtyr_renovatedzipcodelatlongsqft_living15sqft_lot15month_of_dateyear_of_date
id
7129300520221900.031.00118056501.0NONONEAverage7 Average...0.019550.09817847.5112-122.25713405650102014
6414100192538000.032.25257072422.0NONONEAverage7 Average...400.019511.09812547.7210-122.31916907639122014
5631500400180000.021.00770100001.0NONONEAverage6 Low Average...0.019330.09802847.7379-122.2332720806222015
2487200875604000.043.00196050001.0NONONEVery Good7 Average...910.019650.09813647.5208-122.39313605000122014
1954400510510000.032.00168080801.0NONONEAverage8 Good...0.019870.09807447.6168-122.0451800750322015
..................................................................
263000018360000.032.50153011313.0NONONEAverage8 Good...0.020090.09810347.6993-122.3461530150952014
6600060120400000.042.50231058132.0NONONEAverage8 Good...0.020140.09814647.5107-122.3621830720022015
1523300141402101.020.75102013502.0NONONEAverage7 Average...0.020090.09814447.5944-122.2991020200762014
291310100400000.032.50160023882.0NONONEAverage8 Good...0.020040.09802747.5345-122.0691410128712015
1523300157325000.020.75102010762.0NONONEAverage7 Average...0.020080.09814447.5941-122.29910201357102014
\n", + "

21597 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + " price bedrooms bathrooms sqft_living sqft_lot floors \\\n", + "id \n", + "7129300520 221900.0 3 1.00 1180 5650 1.0 \n", + "6414100192 538000.0 3 2.25 2570 7242 2.0 \n", + "5631500400 180000.0 2 1.00 770 10000 1.0 \n", + "2487200875 604000.0 4 3.00 1960 5000 1.0 \n", + "1954400510 510000.0 3 2.00 1680 8080 1.0 \n", + "... ... ... ... ... ... ... \n", + "263000018 360000.0 3 2.50 1530 1131 3.0 \n", + "6600060120 400000.0 4 2.50 2310 5813 2.0 \n", + "1523300141 402101.0 2 0.75 1020 1350 2.0 \n", + "291310100 400000.0 3 2.50 1600 2388 2.0 \n", + "1523300157 325000.0 2 0.75 1020 1076 2.0 \n", + "\n", + " waterfront view condition grade ... sqft_basement \\\n", + "id ... \n", + "7129300520 NO NONE Average 7 Average ... 0.0 \n", + "6414100192 NO NONE Average 7 Average ... 400.0 \n", + "5631500400 NO NONE Average 6 Low Average ... 0.0 \n", + "2487200875 NO NONE Very Good 7 Average ... 910.0 \n", + "1954400510 NO NONE Average 8 Good ... 0.0 \n", + "... ... ... ... ... ... ... \n", + "263000018 NO NONE Average 8 Good ... 0.0 \n", + "6600060120 NO NONE Average 8 Good ... 0.0 \n", + "1523300141 NO NONE Average 7 Average ... 0.0 \n", + "291310100 NO NONE Average 8 Good ... 0.0 \n", + "1523300157 NO NONE Average 7 Average ... 0.0 \n", + "\n", + " yr_built yr_renovated zipcode lat long sqft_living15 \\\n", + "id \n", + "7129300520 1955 0.0 98178 47.5112 -122.257 1340 \n", + "6414100192 1951 1.0 98125 47.7210 -122.319 1690 \n", + "5631500400 1933 0.0 98028 47.7379 -122.233 2720 \n", + "2487200875 1965 0.0 98136 47.5208 -122.393 1360 \n", + "1954400510 1987 0.0 98074 47.6168 -122.045 1800 \n", + "... ... ... ... ... ... ... 
\n", + "263000018 2009 0.0 98103 47.6993 -122.346 1530 \n", + "6600060120 2014 0.0 98146 47.5107 -122.362 1830 \n", + "1523300141 2009 0.0 98144 47.5944 -122.299 1020 \n", + "291310100 2004 0.0 98027 47.5345 -122.069 1410 \n", + "1523300157 2008 0.0 98144 47.5941 -122.299 1020 \n", + "\n", + " sqft_lot15 month_of_date year_of_date \n", + "id \n", + "7129300520 5650 10 2014 \n", + "6414100192 7639 12 2014 \n", + "5631500400 8062 2 2015 \n", + "2487200875 5000 12 2014 \n", + "1954400510 7503 2 2015 \n", + "... ... ... ... \n", + "263000018 1509 5 2014 \n", + "6600060120 7200 2 2015 \n", + "1523300141 2007 6 2014 \n", + "291310100 1287 1 2015 \n", + "1523300157 1357 10 2014 \n", + "\n", + "[21597 rows x 21 columns]" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Drop the date column.\n", + "df.drop(columns=['date'], inplace=True)\n", + "df\n" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 30, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['Average', 'Very Good', 'Good', 'Poor', 'Fair'], dtype=object)" + ] + }, + "execution_count": 30, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Inspecting the condition column\n", + "df['condition'].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontviewconditiongrade...sqft_basementyr_builtyr_renovatedzipcodelatlongsqft_living15sqft_lot15month_of_dateyear_of_date
id
7129300520221900.031.00118056501.0NONONE37 Average...0.019550.09817847.5112-122.25713405650102014
6414100192538000.032.25257072422.0NONONE37 Average...400.019511.09812547.7210-122.31916907639122014
5631500400180000.021.00770100001.0NONONE36 Low Average...0.019330.09802847.7379-122.2332720806222015
2487200875604000.043.00196050001.0NONONE17 Average...910.019650.09813647.5208-122.39313605000122014
1954400510510000.032.00168080801.0NONONE38 Good...0.019870.09807447.6168-122.0451800750322015
\n", + "

5 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + " price bedrooms bathrooms sqft_living sqft_lot floors \\\n", + "id \n", + "7129300520 221900.0 3 1.00 1180 5650 1.0 \n", + "6414100192 538000.0 3 2.25 2570 7242 2.0 \n", + "5631500400 180000.0 2 1.00 770 10000 1.0 \n", + "2487200875 604000.0 4 3.00 1960 5000 1.0 \n", + "1954400510 510000.0 3 2.00 1680 8080 1.0 \n", + "\n", + " waterfront view condition grade ... sqft_basement \\\n", + "id ... \n", + "7129300520 NO NONE 3 7 Average ... 0.0 \n", + "6414100192 NO NONE 3 7 Average ... 400.0 \n", + "5631500400 NO NONE 3 6 Low Average ... 0.0 \n", + "2487200875 NO NONE 1 7 Average ... 910.0 \n", + "1954400510 NO NONE 3 8 Good ... 0.0 \n", + "\n", + " yr_built yr_renovated zipcode lat long sqft_living15 \\\n", + "id \n", + "7129300520 1955 0.0 98178 47.5112 -122.257 1340 \n", + "6414100192 1951 1.0 98125 47.7210 -122.319 1690 \n", + "5631500400 1933 0.0 98028 47.7379 -122.233 2720 \n", + "2487200875 1965 0.0 98136 47.5208 -122.393 1360 \n", + "1954400510 1987 0.0 98074 47.6168 -122.045 1800 \n", + "\n", + " sqft_lot15 month_of_date year_of_date \n", + "id \n", + "7129300520 5650 10 2014 \n", + "6414100192 7639 12 2014 \n", + "5631500400 8062 2 2015 \n", + "2487200875 5000 12 2014 \n", + "1954400510 7503 2 2015 \n", + "\n", + "[5 rows x 21 columns]" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Mapping conditions with the respective number\n", + "# Ratings mapping\n", + "ratings_mapping = {\n", + " 'Average': 3,\n", + " 'Very Good': 1,\n", + " 'Good': 2,\n", + " 'Poor': 4,\n", + " 'Fair': 5\n", + "}\n", + "\n", + "# Replace categorical values with numerical values\n", + "df['condition'] = df['condition'].replace(ratings_mapping)\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['NO', 'YES'], dtype=object)" + ] + }, + "execution_count": 32, + "metadata": {}, + "output_type": 
"execute_result" + } + ], + "source": [ + "# Inspecting the waterfront column\n", + "df['waterfront'].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontviewconditiongrade...sqft_basementyr_builtyr_renovatedzipcodelatlongsqft_living15sqft_lot15month_of_dateyear_of_date
id
7129300520221900.031.00118056501.00NONE37 Average...0.019550.09817847.5112-122.25713405650102014
6414100192538000.032.25257072422.00NONE37 Average...400.019511.09812547.7210-122.31916907639122014
5631500400180000.021.00770100001.00NONE36 Low Average...0.019330.09802847.7379-122.2332720806222015
2487200875604000.043.00196050001.00NONE17 Average...910.019650.09813647.5208-122.39313605000122014
1954400510510000.032.00168080801.00NONE38 Good...0.019870.09807447.6168-122.0451800750322015
\n", + "

5 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + " price bedrooms bathrooms sqft_living sqft_lot floors \\\n", + "id \n", + "7129300520 221900.0 3 1.00 1180 5650 1.0 \n", + "6414100192 538000.0 3 2.25 2570 7242 2.0 \n", + "5631500400 180000.0 2 1.00 770 10000 1.0 \n", + "2487200875 604000.0 4 3.00 1960 5000 1.0 \n", + "1954400510 510000.0 3 2.00 1680 8080 1.0 \n", + "\n", + " waterfront view condition grade ... sqft_basement \\\n", + "id ... \n", + "7129300520 0 NONE 3 7 Average ... 0.0 \n", + "6414100192 0 NONE 3 7 Average ... 400.0 \n", + "5631500400 0 NONE 3 6 Low Average ... 0.0 \n", + "2487200875 0 NONE 1 7 Average ... 910.0 \n", + "1954400510 0 NONE 3 8 Good ... 0.0 \n", + "\n", + " yr_built yr_renovated zipcode lat long sqft_living15 \\\n", + "id \n", + "7129300520 1955 0.0 98178 47.5112 -122.257 1340 \n", + "6414100192 1951 1.0 98125 47.7210 -122.319 1690 \n", + "5631500400 1933 0.0 98028 47.7379 -122.233 2720 \n", + "2487200875 1965 0.0 98136 47.5208 -122.393 1360 \n", + "1954400510 1987 0.0 98074 47.6168 -122.045 1800 \n", + "\n", + " sqft_lot15 month_of_date year_of_date \n", + "id \n", + "7129300520 5650 10 2014 \n", + "6414100192 7639 12 2014 \n", + "5631500400 8062 2 2015 \n", + "2487200875 5000 12 2014 \n", + "1954400510 7503 2 2015 \n", + "\n", + "[5 rows x 21 columns]" + ] + }, + "execution_count": 33, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Mapping waterfront with the respective number \n", + "# Replacing YES with 1 and NO with 0\n", + "df['waterfront'] = df['waterfront'].astype(str).replace({'YES': 1, 'NO': 0})\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['NONE', 'GOOD', 'EXCELLENT', 'AVERAGE', 'FAIR'], dtype=object)" + ] + }, + "execution_count": 34, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Inspecting the view column\n", + "df['view'].unique()" + ] + }, + { + "cell_type": "code", 
+ "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontviewconditiongrade...sqft_basementyr_builtyr_renovatedzipcodelatlongsqft_living15sqft_lot15month_of_dateyear_of_date
id
7129300520221900.031.00118056501.00037 Average...0.019550.09817847.5112-122.25713405650102014
6414100192538000.032.25257072422.00037 Average...400.019511.09812547.7210-122.31916907639122014
5631500400180000.021.00770100001.00036 Low Average...0.019330.09802847.7379-122.2332720806222015
2487200875604000.043.00196050001.00017 Average...910.019650.09813647.5208-122.39313605000122014
1954400510510000.032.00168080801.00038 Good...0.019870.09807447.6168-122.0451800750322015
\n", + "

5 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + " price bedrooms bathrooms sqft_living sqft_lot floors \\\n", + "id \n", + "7129300520 221900.0 3 1.00 1180 5650 1.0 \n", + "6414100192 538000.0 3 2.25 2570 7242 2.0 \n", + "5631500400 180000.0 2 1.00 770 10000 1.0 \n", + "2487200875 604000.0 4 3.00 1960 5000 1.0 \n", + "1954400510 510000.0 3 2.00 1680 8080 1.0 \n", + "\n", + " waterfront view condition grade ... sqft_basement \\\n", + "id ... \n", + "7129300520 0 0 3 7 Average ... 0.0 \n", + "6414100192 0 0 3 7 Average ... 400.0 \n", + "5631500400 0 0 3 6 Low Average ... 0.0 \n", + "2487200875 0 0 1 7 Average ... 910.0 \n", + "1954400510 0 0 3 8 Good ... 0.0 \n", + "\n", + " yr_built yr_renovated zipcode lat long sqft_living15 \\\n", + "id \n", + "7129300520 1955 0.0 98178 47.5112 -122.257 1340 \n", + "6414100192 1951 1.0 98125 47.7210 -122.319 1690 \n", + "5631500400 1933 0.0 98028 47.7379 -122.233 2720 \n", + "2487200875 1965 0.0 98136 47.5208 -122.393 1360 \n", + "1954400510 1987 0.0 98074 47.6168 -122.045 1800 \n", + "\n", + " sqft_lot15 month_of_date year_of_date \n", + "id \n", + "7129300520 5650 10 2014 \n", + "6414100192 7639 12 2014 \n", + "5631500400 8062 2 2015 \n", + "2487200875 5000 12 2014 \n", + "1954400510 7503 2 2015 \n", + "\n", + "[5 rows x 21 columns]" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Define the mappings\n", + "quality_mapping = {\n", + " 'NONE': 0,\n", + " 'GOOD': 1,\n", + " 'EXCELLENT': 2,\n", + " 'AVERAGE': 3,\n", + " 'FAIR': 4\n", + "}\n", + "\n", + "# Replace the values using the mapping\n", + "df['view'] = df['view'].replace(quality_mapping)\n", + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "array(['7 Average', '6 Low Average', '8 Good', '11 Excellent', '9 Better',\n", + " '5 Fair', '10 Very Good', '12 Luxury', '4 Low', '3 Poor',\n", + " '13 Mansion'], dtype=object)" + ] + }, + 
"execution_count": 36, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Inspecting the grade column\n", + "df['grade'].unique()" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontviewconditiongrade...sqft_basementyr_builtyr_renovatedzipcodelatlongsqft_living15sqft_lot15month_of_dateyear_of_date
id
7129300520221900.031.00118056501.00037...0.019550.09817847.5112-122.25713405650102014
6414100192538000.032.25257072422.00037...400.019511.09812547.7210-122.31916907639122014
5631500400180000.021.00770100001.00036...0.019330.09802847.7379-122.2332720806222015
2487200875604000.043.00196050001.00017...910.019650.09813647.5208-122.39313605000122014
1954400510510000.032.00168080801.00038...0.019870.09807447.6168-122.0451800750322015
..................................................................
263000018360000.032.50153011313.00038...0.020090.09810347.6993-122.3461530150952014
6600060120400000.042.50231058132.00038...0.020140.09814647.5107-122.3621830720022015
1523300141402101.020.75102013502.00037...0.020090.09814447.5944-122.2991020200762014
291310100400000.032.50160023882.00038...0.020040.09802747.5345-122.0691410128712015
1523300157325000.020.75102010762.00037...0.020080.09814447.5941-122.29910201357102014
\n", + "

21597 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + " price bedrooms bathrooms sqft_living sqft_lot floors \\\n", + "id \n", + "7129300520 221900.0 3 1.00 1180 5650 1.0 \n", + "6414100192 538000.0 3 2.25 2570 7242 2.0 \n", + "5631500400 180000.0 2 1.00 770 10000 1.0 \n", + "2487200875 604000.0 4 3.00 1960 5000 1.0 \n", + "1954400510 510000.0 3 2.00 1680 8080 1.0 \n", + "... ... ... ... ... ... ... \n", + "263000018 360000.0 3 2.50 1530 1131 3.0 \n", + "6600060120 400000.0 4 2.50 2310 5813 2.0 \n", + "1523300141 402101.0 2 0.75 1020 1350 2.0 \n", + "291310100 400000.0 3 2.50 1600 2388 2.0 \n", + "1523300157 325000.0 2 0.75 1020 1076 2.0 \n", + "\n", + " waterfront view condition grade ... sqft_basement yr_built \\\n", + "id ... \n", + "7129300520 0 0 3 7 ... 0.0 1955 \n", + "6414100192 0 0 3 7 ... 400.0 1951 \n", + "5631500400 0 0 3 6 ... 0.0 1933 \n", + "2487200875 0 0 1 7 ... 910.0 1965 \n", + "1954400510 0 0 3 8 ... 0.0 1987 \n", + "... ... ... ... ... ... ... ... \n", + "263000018 0 0 3 8 ... 0.0 2009 \n", + "6600060120 0 0 3 8 ... 0.0 2014 \n", + "1523300141 0 0 3 7 ... 0.0 2009 \n", + "291310100 0 0 3 8 ... 0.0 2004 \n", + "1523300157 0 0 3 7 ... 0.0 2008 \n", + "\n", + " yr_renovated zipcode lat long sqft_living15 \\\n", + "id \n", + "7129300520 0.0 98178 47.5112 -122.257 1340 \n", + "6414100192 1.0 98125 47.7210 -122.319 1690 \n", + "5631500400 0.0 98028 47.7379 -122.233 2720 \n", + "2487200875 0.0 98136 47.5208 -122.393 1360 \n", + "1954400510 0.0 98074 47.6168 -122.045 1800 \n", + "... ... ... ... ... ... 
\n", + "263000018 0.0 98103 47.6993 -122.346 1530 \n", + "6600060120 0.0 98146 47.5107 -122.362 1830 \n", + "1523300141 0.0 98144 47.5944 -122.299 1020 \n", + "291310100 0.0 98027 47.5345 -122.069 1410 \n", + "1523300157 0.0 98144 47.5941 -122.299 1020 \n", + "\n", + " sqft_lot15 month_of_date year_of_date \n", + "id \n", + "7129300520 5650 10 2014 \n", + "6414100192 7639 12 2014 \n", + "5631500400 8062 2 2015 \n", + "2487200875 5000 12 2014 \n", + "1954400510 7503 2 2015 \n", + "... ... ... ... \n", + "263000018 1509 5 2014 \n", + "6600060120 7200 2 2015 \n", + "1523300141 2007 6 2014 \n", + "291310100 1287 1 2015 \n", + "1523300157 1357 10 2014 \n", + "\n", + "[21597 rows x 21 columns]" + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# Define the mappings\n", + "rating_mapping = {\n", + " 'Average': 7,\n", + " 'Low Average': 6,\n", + " 'Good': 8,\n", + " 'Excellent': 11,\n", + " 'Better': 9,\n", + " 'Fair': 5,\n", + " 'Very Good': 10,\n", + " 'Luxury': 12,\n", + " 'Low': 4,\n", + " 'Poor': 3,\n", + " 'Mansion': 13\n", + "}\n", + "\n", + "# Extract the rating string and replace with the corresponding numerical value\n", + "df['grade'] = df['grade'].str.extract('(\\d+)').astype(int)\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Int64Index: 21597 entries, 7129300520 to 1523300157\n", + "Data columns (total 21 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 price 21597 non-null float64\n", + " 1 bedrooms 21597 non-null int64 \n", + " 2 bathrooms 21597 non-null float64\n", + " 3 sqft_living 21597 non-null int64 \n", + " 4 sqft_lot 21597 non-null int64 \n", + " 5 floors 21597 non-null float64\n", + " 6 waterfront 21597 non-null int64 \n", + " 7 view 21597 non-null int64 \n", + " 8 condition 21597 non-null int64 \n", + " 9 
grade 21597 non-null int64 \n", + " 10 sqft_above 21597 non-null int64 \n", + " 11 sqft_basement 21143 non-null float64\n", + " 12 yr_built 21597 non-null int64 \n", + " 13 yr_renovated 21597 non-null float64\n", + " 14 zipcode 21597 non-null int64 \n", + " 15 lat 21597 non-null float64\n", + " 16 long 21597 non-null float64\n", + " 17 sqft_living15 21597 non-null int64 \n", + " 18 sqft_lot15 21597 non-null int64 \n", + " 19 month_of_date 21597 non-null int64 \n", + " 20 year_of_date 21597 non-null int64 \n", + "dtypes: float64(7), int64(14)\n", + "memory usage: 4.2 MB\n" + ] + } + ], + "source": [ + "df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "sqft_living 0.701917\n", + "grade 0.667951\n", + "sqft_above 0.605368\n", + "sqft_living15 0.585241\n", + "bathrooms 0.525906\n", + "sqft_basement 0.325008\n", + "bedrooms 0.308787\n", + "lat 0.306692\n", + "view 0.290620\n", + "waterfront 0.264306\n", + "floors 0.256804\n", + "yr_renovated 0.117543\n", + "sqft_lot 0.089876\n", + "sqft_lot15 0.082845\n", + "yr_built 0.053953\n", + "zipcode 0.053402\n", + "condition 0.040742\n", + "long 0.022036\n", + "month_of_date 0.009928\n", + "year_of_date 0.003727\n", + "Name: price, dtype: float64" + ] + }, + "execution_count": 39, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#Checking the correlation between price (target) and predictors\n", + "df.corr()['price'].drop(['price']).map(abs).sort_values(ascending=False)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "sqft_living, grade, sqft_above have the highest correlation with the target while year_of_date, month_of_date and long have the lowest correlation with the target." + ] + }, + { + "cell_type": "code", + "execution_count": 40, "metadata": {}, "outputs": [], "source": [ - "# Your code here - remember to use markdown cells for comments as well!" 
+ "# Identifying numeric and categorical columns\n", + "numeric = ['bedrooms', \n", + " 'bathrooms', \n", + " 'sqft_living', \n", + " 'sqft_lot', \n", + " 'sqft_above', \n", + " 'sqft_basement',\n", + " 'lat', \n", + " 'long',\n", + " 'sqft_living15', \n", + " 'sqft_lot15']\n", + "\n", + "categorical = ['floors',\n", + " 'waterfront', \n", + " 'view', \n", + " 'condition', \n", + " 'grade',\n", + " 'yr_renovated',\n", + " 'zipcode',\n", + " 'month_of_date']" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
pricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontviewconditiongrade...sqft_basementyr_builtyr_renovatedzipcodelatlongsqft_living15sqft_lot15month_of_dateyear_of_date
0221900.031.00118056501.00037...0.019550.09817847.5112-122.25713405650102014
1538000.032.25257072422.00037...400.019511.09812547.7210-122.31916907639122014
2180000.021.00770100001.00036...0.019330.09802847.7379-122.2332720806222015
3604000.043.00196050001.00017...910.019650.09813647.5208-122.39313605000122014
4510000.032.00168080801.00038...0.019870.09807447.6168-122.0451800750322015
..................................................................
21592360000.032.50153011313.00038...0.020090.09810347.6993-122.3461530150952014
21593400000.042.50231058132.00038...0.020140.09814647.5107-122.3621830720022015
21594402101.020.75102013502.00037...0.020090.09814447.5944-122.2991020200762014
21595400000.032.50160023882.00038...0.020040.09802747.5345-122.0691410128712015
21596325000.020.75102010762.00037...0.020080.09814447.5941-122.29910201357102014
\n", + "

21597 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + " price bedrooms bathrooms sqft_living sqft_lot floors \\\n", + "0 221900.0 3 1.00 1180 5650 1.0 \n", + "1 538000.0 3 2.25 2570 7242 2.0 \n", + "2 180000.0 2 1.00 770 10000 1.0 \n", + "3 604000.0 4 3.00 1960 5000 1.0 \n", + "4 510000.0 3 2.00 1680 8080 1.0 \n", + "... ... ... ... ... ... ... \n", + "21592 360000.0 3 2.50 1530 1131 3.0 \n", + "21593 400000.0 4 2.50 2310 5813 2.0 \n", + "21594 402101.0 2 0.75 1020 1350 2.0 \n", + "21595 400000.0 3 2.50 1600 2388 2.0 \n", + "21596 325000.0 2 0.75 1020 1076 2.0 \n", + "\n", + " waterfront view condition grade ... sqft_basement yr_built \\\n", + "0 0 0 3 7 ... 0.0 1955 \n", + "1 0 0 3 7 ... 400.0 1951 \n", + "2 0 0 3 6 ... 0.0 1933 \n", + "3 0 0 1 7 ... 910.0 1965 \n", + "4 0 0 3 8 ... 0.0 1987 \n", + "... ... ... ... ... ... ... ... \n", + "21592 0 0 3 8 ... 0.0 2009 \n", + "21593 0 0 3 8 ... 0.0 2014 \n", + "21594 0 0 3 7 ... 0.0 2009 \n", + "21595 0 0 3 8 ... 0.0 2004 \n", + "21596 0 0 3 7 ... 0.0 2008 \n", + "\n", + " yr_renovated zipcode lat long sqft_living15 sqft_lot15 \\\n", + "0 0.0 98178 47.5112 -122.257 1340 5650 \n", + "1 1.0 98125 47.7210 -122.319 1690 7639 \n", + "2 0.0 98028 47.7379 -122.233 2720 8062 \n", + "3 0.0 98136 47.5208 -122.393 1360 5000 \n", + "4 0.0 98074 47.6168 -122.045 1800 7503 \n", + "... ... ... ... ... ... ... \n", + "21592 0.0 98103 47.6993 -122.346 1530 1509 \n", + "21593 0.0 98146 47.5107 -122.362 1830 7200 \n", + "21594 0.0 98144 47.5944 -122.299 1020 2007 \n", + "21595 0.0 98027 47.5345 -122.069 1410 1287 \n", + "21596 0.0 98144 47.5941 -122.299 1020 1357 \n", + "\n", + " month_of_date year_of_date \n", + "0 10 2014 \n", + "1 12 2014 \n", + "2 2 2015 \n", + "3 12 2014 \n", + "4 2 2015 \n", + "... ... ... 
\n", + "21592 5 2014 \n", + "21593 2 2015 \n", + "21594 6 2014 \n", + "21595 1 2015 \n", + "21596 10 2014 \n", + "\n", + "[21597 rows x 21 columns]" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#reset index and dropping the id column\n", + "df.reset_index(inplace=True, drop=True)\n", + "df" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 21597 entries, 0 to 21596\n", + "Data columns (total 21 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 price 21597 non-null float64\n", + " 1 bedrooms 21597 non-null int64 \n", + " 2 bathrooms 21597 non-null float64\n", + " 3 sqft_living 21597 non-null int64 \n", + " 4 sqft_lot 21597 non-null int64 \n", + " 5 floors 21597 non-null float64\n", + " 6 waterfront 21597 non-null int64 \n", + " 7 view 21597 non-null int64 \n", + " 8 condition 21597 non-null int64 \n", + " 9 grade 21597 non-null int64 \n", + " 10 sqft_above 21597 non-null int64 \n", + " 11 sqft_basement 21143 non-null float64\n", + " 12 yr_built 21597 non-null int64 \n", + " 13 yr_renovated 21597 non-null float64\n", + " 14 zipcode 21597 non-null int64 \n", + " 15 lat 21597 non-null float64\n", + " 16 long 21597 non-null float64\n", + " 17 sqft_living15 21597 non-null int64 \n", + " 18 sqft_lot15 21597 non-null int64 \n", + " 19 month_of_date 21597 non-null int64 \n", + " 20 year_of_date 21597 non-null int64 \n", + "dtypes: float64(7), int64(14)\n", + "memory usage: 3.5 MB\n" + ] + } + ], + "source": [ + "df.info()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### MODELLING" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "***1. 
Linear Regression model***\n", + "\n", + "We will pick **`sqft_living` - Square footage of living space in the home** to create our linear regression model because it has the highest correlation with the price and the most linear scatter plot, hence a good candidate." ] } ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "learn-env", "language": "python", "name": "python3" }, @@ -40,7 +5222,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.4" + "version": "3.8.5" } }, "nbformat": 4,