diff --git a/KINGS_COUNTY_HOUSING-Updated 2.pdf b/KINGS_COUNTY_HOUSING-Updated 2.pdf
new file mode 100644
index 00000000..bc64d3db
Binary files /dev/null and b/KINGS_COUNTY_HOUSING-Updated 2.pdf differ
diff --git a/Updated Presentation.pdf b/Updated Presentation.pdf
new file mode 100644
index 00000000..f50479b8
Binary files /dev/null and b/Updated Presentation.pdf differ
diff --git a/presentation.pdf b/presentation.pdf
new file mode 100644
index 00000000..ed038166
Binary files /dev/null and b/presentation.pdf differ
diff --git a/student.ipynb b/student.ipynb
index d3bb34af..c3e0bb0e 100644
--- a/student.ipynb
+++ b/student.ipynb
@@ -14,13 +14,2030 @@
"* Blog post URL:\n"
]
},
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### IMPORTING OF LIBRARIES"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 53,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "import pandas as pd # for reading our data\n",
+ "import numpy as np # for performing calculations\n",
+ "import seaborn as sns # for visualization\n",
+ "import matplotlib.pyplot as plt # for visualization\n",
+ "%matplotlib inline\n",
+ "\n",
+ "import scipy.stats as stat # to calculate statistical operations\n",
+ "\n",
+ "from statsmodels.formula.api import ols #for creating a model\n",
+ "\n",
+ "from sklearn.model_selection import train_test_split # for performing train train_test_split on our data\n",
+ "from sklearn.linear_model import LinearRegression # making a LinearRegression model\n",
+ "from sklearn.metrics import mean_squared_error # for calculating error metrics to evaluate our model"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### LOADING DATA INTO A DATAFRAME\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 54,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " date | \n",
+ " price | \n",
+ " bedrooms | \n",
+ " bathrooms | \n",
+ " sqft_living | \n",
+ " sqft_lot | \n",
+ " floors | \n",
+ " waterfront | \n",
+ " view | \n",
+ " ... | \n",
+ " grade | \n",
+ " sqft_above | \n",
+ " sqft_basement | \n",
+ " yr_built | \n",
+ " yr_renovated | \n",
+ " zipcode | \n",
+ " lat | \n",
+ " long | \n",
+ " sqft_living15 | \n",
+ " sqft_lot15 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 7129300520 | \n",
+ " 10/13/2014 | \n",
+ " 221900.0 | \n",
+ " 3 | \n",
+ " 1.00 | \n",
+ " 1180 | \n",
+ " 5650 | \n",
+ " 1.0 | \n",
+ " NaN | \n",
+ " NONE | \n",
+ " ... | \n",
+ " 7 Average | \n",
+ " 1180 | \n",
+ " 0.0 | \n",
+ " 1955 | \n",
+ " 0.0 | \n",
+ " 98178 | \n",
+ " 47.5112 | \n",
+ " -122.257 | \n",
+ " 1340 | \n",
+ " 5650 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 6414100192 | \n",
+ " 12/9/2014 | \n",
+ " 538000.0 | \n",
+ " 3 | \n",
+ " 2.25 | \n",
+ " 2570 | \n",
+ " 7242 | \n",
+ " 2.0 | \n",
+ " NO | \n",
+ " NONE | \n",
+ " ... | \n",
+ " 7 Average | \n",
+ " 2170 | \n",
+ " 400.0 | \n",
+ " 1951 | \n",
+ " 1991.0 | \n",
+ " 98125 | \n",
+ " 47.7210 | \n",
+ " -122.319 | \n",
+ " 1690 | \n",
+ " 7639 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 5631500400 | \n",
+ " 2/25/2015 | \n",
+ " 180000.0 | \n",
+ " 2 | \n",
+ " 1.00 | \n",
+ " 770 | \n",
+ " 10000 | \n",
+ " 1.0 | \n",
+ " NO | \n",
+ " NONE | \n",
+ " ... | \n",
+ " 6 Low Average | \n",
+ " 770 | \n",
+ " 0.0 | \n",
+ " 1933 | \n",
+ " NaN | \n",
+ " 98028 | \n",
+ " 47.7379 | \n",
+ " -122.233 | \n",
+ " 2720 | \n",
+ " 8062 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 2487200875 | \n",
+ " 12/9/2014 | \n",
+ " 604000.0 | \n",
+ " 4 | \n",
+ " 3.00 | \n",
+ " 1960 | \n",
+ " 5000 | \n",
+ " 1.0 | \n",
+ " NO | \n",
+ " NONE | \n",
+ " ... | \n",
+ " 7 Average | \n",
+ " 1050 | \n",
+ " 910.0 | \n",
+ " 1965 | \n",
+ " 0.0 | \n",
+ " 98136 | \n",
+ " 47.5208 | \n",
+ " -122.393 | \n",
+ " 1360 | \n",
+ " 5000 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1954400510 | \n",
+ " 2/18/2015 | \n",
+ " 510000.0 | \n",
+ " 3 | \n",
+ " 2.00 | \n",
+ " 1680 | \n",
+ " 8080 | \n",
+ " 1.0 | \n",
+ " NO | \n",
+ " NONE | \n",
+ " ... | \n",
+ " 8 Good | \n",
+ " 1680 | \n",
+ " 0.0 | \n",
+ " 1987 | \n",
+ " 0.0 | \n",
+ " 98074 | \n",
+ " 47.6168 | \n",
+ " -122.045 | \n",
+ " 1800 | \n",
+ " 7503 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
5 rows × 21 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id date price bedrooms bathrooms sqft_living \\\n",
+ "0 7129300520 10/13/2014 221900.0 3 1.00 1180 \n",
+ "1 6414100192 12/9/2014 538000.0 3 2.25 2570 \n",
+ "2 5631500400 2/25/2015 180000.0 2 1.00 770 \n",
+ "3 2487200875 12/9/2014 604000.0 4 3.00 1960 \n",
+ "4 1954400510 2/18/2015 510000.0 3 2.00 1680 \n",
+ "\n",
+ " sqft_lot floors waterfront view ... grade sqft_above \\\n",
+ "0 5650 1.0 NaN NONE ... 7 Average 1180 \n",
+ "1 7242 2.0 NO NONE ... 7 Average 2170 \n",
+ "2 10000 1.0 NO NONE ... 6 Low Average 770 \n",
+ "3 5000 1.0 NO NONE ... 7 Average 1050 \n",
+ "4 8080 1.0 NO NONE ... 8 Good 1680 \n",
+ "\n",
+ " sqft_basement yr_built yr_renovated zipcode lat long \\\n",
+ "0 0.0 1955 0.0 98178 47.5112 -122.257 \n",
+ "1 400.0 1951 1991.0 98125 47.7210 -122.319 \n",
+ "2 0.0 1933 NaN 98028 47.7379 -122.233 \n",
+ "3 910.0 1965 0.0 98136 47.5208 -122.393 \n",
+ "4 0.0 1987 0.0 98074 47.6168 -122.045 \n",
+ "\n",
+ " sqft_living15 sqft_lot15 \n",
+ "0 1340 5650 \n",
+ "1 1690 7639 \n",
+ "2 2720 8062 \n",
+ "3 1360 5000 \n",
+ "4 1800 7503 \n",
+ "\n",
+ "[5 rows x 21 columns]"
+ ]
+ },
+ "execution_count": 54,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "kc_data_df = pd.read_csv('data/kc_house_data.csv') # reading our data into a pandas data frame\n",
+ "kc_data_df.head() # checking the first 5 rows\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Let's explore our data by creating a function, `data_summary`, that shows us the shape and info of our data frame"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 55,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def data_summary(data):# a function that gives us a brief summary of our data frame\n",
+ "    \"\"\"Return the shape and the ``DataFrame.info()`` report of ``data`` as one string.\n",
+ "\n",
+ "    Parameters: data - the pandas DataFrame to summarise.\n",
+ "    Returns: a str holding the shape followed by the full info() report.\n",
+ "    \"\"\"\n",
+ "    import io # local import: only needed here to capture the info() report\n",
+ "\n",
+ "    # Shape of Data\n",
+ "    shape = data.shape\n",
+ "\n",
+ "    # Info of Data: DataFrame.info() prints its report and returns None,\n",
+ "    # so the text is captured in a buffer instead of using the return value\n",
+ "    buffer = io.StringIO()\n",
+ "    data.info(buf=buffer)\n",
+ "    info = buffer.getvalue()\n",
+ "\n",
+ "    # Combining the information into a single string\n",
+ "    summary = f\"Dataframe Shape: {shape}\n\"\n",
+ "    summary += f\"Dataframe Info:\n{info}\" \n",
+ "\n",
+ "    return summary\n",
+ "\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 56,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "\n",
+ "RangeIndex: 21597 entries, 0 to 21596\n",
+ "Data columns (total 21 columns):\n",
+ " # Column Non-Null Count Dtype \n",
+ "--- ------ -------------- ----- \n",
+ " 0 id 21597 non-null int64 \n",
+ " 1 date 21597 non-null object \n",
+ " 2 price 21597 non-null float64\n",
+ " 3 bedrooms 21597 non-null int64 \n",
+ " 4 bathrooms 21597 non-null float64\n",
+ " 5 sqft_living 21597 non-null int64 \n",
+ " 6 sqft_lot 21597 non-null int64 \n",
+ " 7 floors 21597 non-null float64\n",
+ " 8 waterfront 19221 non-null object \n",
+ " 9 view 21534 non-null object \n",
+ " 10 condition 21597 non-null object \n",
+ " 11 grade 21597 non-null object \n",
+ " 12 sqft_above 21597 non-null int64 \n",
+ " 13 sqft_basement 21597 non-null object \n",
+ " 14 yr_built 21597 non-null int64 \n",
+ " 15 yr_renovated 17755 non-null float64\n",
+ " 16 zipcode 21597 non-null int64 \n",
+ " 17 lat 21597 non-null float64\n",
+ " 18 long 21597 non-null float64\n",
+ " 19 sqft_living15 21597 non-null int64 \n",
+ " 20 sqft_lot15 21597 non-null int64 \n",
+ "dtypes: float64(6), int64(9), object(6)\n",
+ "memory usage: 3.5+ MB\n"
+ ]
+ },
+ {
+ "data": {
+ "text/plain": [
+ "'Dataframe Shape: (21597, 21)\\nDataframe Info:\\nNone'"
+ ]
+ },
+ "execution_count": 56,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "print(data_summary(kc_data_df)) # printing the returned summary so that it is shown as readable multi-line text"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### DATA CLEANING\n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "At this stage we will clean our data using the following steps:\n",
+ "\n",
+ "- **Completeness** (we will check for missing values, how they affect our data set and how we will handle them)\n",
+ "\n",
+ "- **Consistency** (we will check for duplicate values and how to handle them)\n",
+ "\n",
+ "- **Uniformity** (we will check the data types as well as our column naming for uniformity)\n",
+ "\n",
+ "- **Validity** (we will handle irrelevant columns and check for outliers)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### COMPLETENESS"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 57,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "id 0\n",
+ "date 0\n",
+ "price 0\n",
+ "bedrooms 0\n",
+ "bathrooms 0\n",
+ "sqft_living 0\n",
+ "sqft_lot 0\n",
+ "floors 0\n",
+ "waterfront 2376\n",
+ "view 63\n",
+ "condition 0\n",
+ "grade 0\n",
+ "sqft_above 0\n",
+ "sqft_basement 0\n",
+ "yr_built 0\n",
+ "yr_renovated 3842\n",
+ "zipcode 0\n",
+ "lat 0\n",
+ "long 0\n",
+ "sqft_living15 0\n",
+ "sqft_lot15 0\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 57,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# checking and summing up our missing values in our data set\n",
+ "kc_data_df.isnull().sum()\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We seem to have missing values in our waterfront(2376),view(63) and yr_renovated(3842). We will have to investigate further"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 58,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "The column waterfront has 2376 missing values, which is 11.0 % of it's total\n",
+ "The column view has 63 missing values, which is 0.3 % of it's total\n",
+ "The column yr_renovated has 3842 missing values, which is 17.8 % of it's total\n"
+ ]
+ }
+ ],
+ "source": [
+ "# lets check for the percentage of missing values in our data set\n",
+ "for col in kc_data_df.columns: # we are using a for loop to iterate over our data\n",
+ "    missing_count = kc_data_df[col].isnull().sum() # count the missing values only once per column\n",
+ "    if missing_count > 0:\n",
+ "        percentage = (missing_count/len(kc_data_df[col]))*100 # share of the rows that are missing\n",
+ "        print(\"The column\", col,\"has\",missing_count,\"missing values, which is\", round(percentage, 1),\"% of it's total\")"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Lets further check each column with missing values"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "###### Waterfront column"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Lets check for the value count of the unique elements"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 59,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "The Waterfront Column\n",
+ "\n",
+ "Number of distinct elements is: 2 \n",
+ "\n",
+ "This is the count of unique values:\n",
+ "NO 19075\n",
+ "YES 146\n",
+ "Name: waterfront, dtype: int64 \n",
+ "\n",
+ "The unique values:\n",
+ "[nan 'NO' 'YES'] \n",
+ "\n",
+ "Number of missing values: 2376\n"
+ ]
+ }
+ ],
+ "source": [
+ "#checking for unique elements value count\n",
+ "print(\"The Waterfront Column\\n\")\n",
+ "\n",
+ "print(\"Number of distinct elements is:\", kc_data_df['waterfront'].nunique(),\"\\n\")\n",
+ "\n",
+ "print(\"This is the count of unique values:\")\n",
+ "print(kc_data_df['waterfront'].value_counts(),\"\\n\")\n",
+ "\n",
+ "print('The unique values:')\n",
+ "print(kc_data_df['waterfront'].unique(),\"\\n\")\n",
+ "\n",
+ "print(\"Number of missing values:\",kc_data_df['waterfront'].isnull().sum())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The two unique values are YES and NO. NO is the most common value in this column with 19075 entries, whilst YES has just 146. This indicates that the majority of these homes lack a waterfront, so it seems reasonable to presume that the homes with missing values also lack a waterfront. We will therefore substitute the missing values with NO"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 60,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "NO 21451\n",
+ "YES 146\n",
+ "Name: waterfront, dtype: int64\n",
+ "['NO' 'YES']\n"
+ ]
+ }
+ ],
+ "source": [
+ "# replacing missing values with 'NO'\n",
+ "# (the result is assigned back: fillna(inplace=True) on a selected column is chained\n",
+ "# assignment, which does not update the frame under pandas Copy-on-Write)\n",
+ "kc_data_df['waterfront'] = kc_data_df['waterfront'].fillna('NO')\n",
+ "\n",
+ "# confirming if the missing values have been replaced\n",
+ "print(kc_data_df['waterfront'].value_counts())\n",
+ "print(kc_data_df['waterfront'].unique())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The change was successful because the number of NO entries increased from 19075 to 21451."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ " ###### View column"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Let's create a function to get our unique elements and their value counts"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 61,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "def unique_counts(data, column): # creating a function for checking for unique elements and their counts\n",
+ "    \"\"\"Print the distinct values of ``data[column]`` with their counts, percentages and missing total.\n",
+ "\n",
+ "    Parameters: data - a pandas DataFrame; column - the label of the column to inspect.\n",
+ "    Returns: None, the report is printed.\n",
+ "    \"\"\"\n",
+ "    print(\"Number of distinct elements in\", column, \"column:\", data[column].nunique()) # checking for unique elements in the column\n",
+ "\n",
+ "    value_counts = data[column].value_counts() # counting the value of each unique element\n",
+ "    total_rows = len(data) # percentages are relative to all rows, including rows with a missing value\n",
+ "\n",
+ "    def describe(value): # formats one unique element as 'value: count (percentage%)'\n",
+ "        count = value_counts[value]\n",
+ "        return f\"{value}: {count} ({count / total_rows * 100:.1f}%)\"\n",
+ "\n",
+ "    # Index.map applies the formatter to every unique element and returns an Index of strings\n",
+ "    formatted_counts = value_counts.index.map(describe)\n",
+ "    print(formatted_counts)\n",
+ "\n",
+ "    print(f\"\nMissing values:\", data[column].isnull().sum()) # reporting the number of missing values in the column\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 62,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of distinct elements in view column: 5\n",
+ "Index(['NONE: 19422 (89.9%)', 'AVERAGE: 957 (4.4%)', 'GOOD: 508 (2.4%)',\n",
+ " 'FAIR: 330 (1.5%)', 'EXCELLENT: 317 (1.5%)'],\n",
+ " dtype='object')\n",
+ "\n",
+ "Missing values: 63\n"
+ ]
+ }
+ ],
+ "source": [
+ "unique_counts(kc_data_df,'view')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "In this column, NONE is the most frequent unique element (89.9% of the rows). It is therefore reasonable to assume that the 63 missing values represent homes that don't have a view, so I will substitute the missing values with NONE."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 63,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "NONE 19485\n",
+ "AVERAGE 957\n",
+ "GOOD 508\n",
+ "FAIR 330\n",
+ "EXCELLENT 317\n",
+ "Name: view, dtype: int64\n",
+ "['NONE' 'GOOD' 'EXCELLENT' 'AVERAGE' 'FAIR']\n"
+ ]
+ }
+ ],
+ "source": [
+ "# replacing missing values with 'NONE'\n",
+ "# (the result is assigned back: fillna(inplace=True) on a selected column is chained\n",
+ "# assignment, which does not update the frame under pandas Copy-on-Write)\n",
+ "kc_data_df['view'] = kc_data_df['view'].fillna('NONE')\n",
+ "\n",
+ "# confirming if the missing values have been replaced\n",
+ "print(kc_data_df['view'].value_counts())\n",
+ "print(kc_data_df['view'].unique())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Changes successfully made"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "###### Yr_renovated column"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 64,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Number of distinct elements in yr_renovated column: 70\n",
+ "Index(['0.0: 17011 (78.8%)', '2014.0: 73 (0.3%)', '2003.0: 31 (0.1%)',\n",
+ " '2013.0: 31 (0.1%)', '2007.0: 30 (0.1%)', '2000.0: 29 (0.1%)',\n",
+ " '2005.0: 29 (0.1%)', '1990.0: 22 (0.1%)', '2004.0: 22 (0.1%)',\n",
+ " '2009.0: 21 (0.1%)', '1989.0: 20 (0.1%)', '2006.0: 20 (0.1%)',\n",
+ " '2002.0: 17 (0.1%)', '1991.0: 16 (0.1%)', '1998.0: 16 (0.1%)',\n",
+ " '1984.0: 16 (0.1%)', '1999.0: 15 (0.1%)', '2008.0: 15 (0.1%)',\n",
+ " '2010.0: 15 (0.1%)', '2001.0: 15 (0.1%)', '1983.0: 15 (0.1%)',\n",
+ " '2015.0: 14 (0.1%)', '1985.0: 14 (0.1%)', '1986.0: 14 (0.1%)',\n",
+ " '1987.0: 14 (0.1%)', '1994.0: 14 (0.1%)', '1992.0: 13 (0.1%)',\n",
+ " '1993.0: 12 (0.1%)', '1997.0: 12 (0.1%)', '1995.0: 12 (0.1%)',\n",
+ " '1996.0: 11 (0.1%)', '1988.0: 11 (0.1%)', '1970.0: 9 (0.0%)',\n",
+ " '2011.0: 9 (0.0%)', '1980.0: 8 (0.0%)', '1982.0: 8 (0.0%)',\n",
+ " '2012.0: 8 (0.0%)', '1979.0: 7 (0.0%)', '1977.0: 7 (0.0%)',\n",
+ " '1968.0: 7 (0.0%)', '1975.0: 5 (0.0%)', '1964.0: 5 (0.0%)',\n",
+ " '1969.0: 4 (0.0%)', '1963.0: 4 (0.0%)', '1973.0: 4 (0.0%)',\n",
+ " '1981.0: 4 (0.0%)', '1965.0: 4 (0.0%)', '1978.0: 3 (0.0%)',\n",
+ " '1960.0: 3 (0.0%)', '1958.0: 3 (0.0%)', '1956.0: 3 (0.0%)',\n",
+ " '1955.0: 3 (0.0%)', '1945.0: 3 (0.0%)', '1972.0: 3 (0.0%)',\n",
+ " '1967.0: 2 (0.0%)', '1957.0: 2 (0.0%)', '1940.0: 2 (0.0%)',\n",
+ " '1974.0: 2 (0.0%)', '1962.0: 2 (0.0%)', '1953.0: 1 (0.0%)',\n",
+ " '1950.0: 1 (0.0%)', '1934.0: 1 (0.0%)', '1944.0: 1 (0.0%)',\n",
+ " '1976.0: 1 (0.0%)', '1948.0: 1 (0.0%)', '1946.0: 1 (0.0%)',\n",
+ " '1959.0: 1 (0.0%)', '1971.0: 1 (0.0%)', '1951.0: 1 (0.0%)',\n",
+ " '1954.0: 1 (0.0%)'],\n",
+ " dtype='object')\n",
+ "\n",
+ "Missing values: 3842\n"
+ ]
+ }
+ ],
+ "source": [
+ "unique_counts(kc_data_df,'yr_renovated' )# using the unique_count function"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
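+ "The renovation years span from 1934 to 2015, and 0.0 is by far the most frequent value (78.8% of the rows). The data does not document what 0.0 means, but it most likely marks a house that has never been renovated, so we'll replace the missing values with 0.0, the most frequent value."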
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 65,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "0.0 20853\n",
+ "2014.0 73\n",
+ "2003.0 31\n",
+ "2013.0 31\n",
+ "2007.0 30\n",
+ " ... \n",
+ "1946.0 1\n",
+ "1959.0 1\n",
+ "1971.0 1\n",
+ "1951.0 1\n",
+ "1954.0 1\n",
+ "Name: yr_renovated, Length: 70, dtype: int64\n",
+ "[ 0. 1991. 2002. 2010. 1992. 2013. 1994. 1978. 2005. 2003. 1984. 1954.\n",
+ " 2014. 2011. 1983. 1945. 1990. 1988. 1977. 1981. 1995. 2000. 1999. 1998.\n",
+ " 1970. 1989. 2004. 1986. 2007. 1987. 2006. 1985. 2001. 1980. 1971. 1979.\n",
+ " 1997. 1950. 1969. 1948. 2009. 2015. 1974. 2008. 1968. 2012. 1963. 1951.\n",
+ " 1962. 1953. 1993. 1996. 1955. 1982. 1956. 1940. 1976. 1946. 1975. 1964.\n",
+ " 1973. 1957. 1959. 1960. 1967. 1965. 1934. 1972. 1944. 1958.]\n"
+ ]
+ }
+ ],
+ "source": [
+ "# replacing missing values with '0.0'\n",
+ "# (the result is assigned back: fillna(inplace=True) on a selected column is chained\n",
+ "# assignment, which does not update the frame under pandas Copy-on-Write)\n",
+ "kc_data_df['yr_renovated'] = kc_data_df['yr_renovated'].fillna(0.0)\n",
+ "\n",
+ "# confirming if the missing values have been replaced\n",
+ "print(kc_data_df['yr_renovated'].value_counts())\n",
+ "print(kc_data_df['yr_renovated'].unique())"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The changes are made successfully"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 66,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "id 0\n",
+ "date 0\n",
+ "price 0\n",
+ "bedrooms 0\n",
+ "bathrooms 0\n",
+ "sqft_living 0\n",
+ "sqft_lot 0\n",
+ "floors 0\n",
+ "waterfront 0\n",
+ "view 0\n",
+ "condition 0\n",
+ "grade 0\n",
+ "sqft_above 0\n",
+ "sqft_basement 0\n",
+ "yr_built 0\n",
+ "yr_renovated 0\n",
+ "zipcode 0\n",
+ "lat 0\n",
+ "long 0\n",
+ "sqft_living15 0\n",
+ "sqft_lot15 0\n",
+ "dtype: int64"
+ ]
+ },
+ "execution_count": 66,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# checking to see if there are any more missing values\n",
+ "kc_data_df.isnull().sum()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "No missing values"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ " #### CONSISTENCY"
+ ]
+ },
{
"cell_type": "code",
- "execution_count": null,
+ "execution_count": 67,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0"
+ ]
+ },
+ "execution_count": 67,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# Checking for duplicate values\n",
+ "kc_data_df.duplicated().sum()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "There are no duplicate values"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### UNIFORMITY"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "**Converting Data Types of Values in Columns from Object to Float**"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The sqft_basement column is stored with the object data type even though it holds numeric values. Let's investigate why the data type isn't a float or an integer."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 68,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0.0 12826\n",
+ "? 454\n",
+ "600.0 217\n",
+ "500.0 209\n",
+ "700.0 208\n",
+ " ... \n",
+ "1880.0 1\n",
+ "768.0 1\n",
+ "666.0 1\n",
+ "1284.0 1\n",
+ "2050.0 1\n",
+ "Name: sqft_basement, Length: 304, dtype: int64"
+ ]
+ },
+ "execution_count": 68,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "kc_data_df['sqft_basement'].value_counts()"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The values represented by the \"?\" string can be regarded as null values. We will replace the \"?\" with 0.0, because the majority of the values are 0.0."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 69,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "dtype('float64')"
+ ]
+ },
+ "execution_count": 69,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "# replacing the ? with 0.0 and converting the column to data type 'float'\n",
+ "# (the result is assigned back: replace(inplace=True) on a selected column is chained\n",
+ "# assignment, which does not update the frame under pandas Copy-on-Write)\n",
+ "kc_data_df['sqft_basement'] = kc_data_df['sqft_basement'].replace('?','0.0').astype(float)\n",
+ "\n",
+ "#confirming the change \n",
+ "kc_data_df['sqft_basement'].dtype"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We have successfully changed the data type to a float"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ " **Converting the Date Column to month and year and Creating new Columns month and year**"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The month and year the houses were sold are contained in the date column, and the data can be analysed more easily by creating new month_sold and year_sold columns from it."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 70,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "DataFrame after splitting date:\n",
+ " id price bedrooms bathrooms sqft_living sqft_lot floors \\\n",
+ "0 7129300520 221900.0 3 1.00 1180 5650 1.0 \n",
+ "1 6414100192 538000.0 3 2.25 2570 7242 2.0 \n",
+ "2 5631500400 180000.0 2 1.00 770 10000 1.0 \n",
+ "3 2487200875 604000.0 4 3.00 1960 5000 1.0 \n",
+ "4 1954400510 510000.0 3 2.00 1680 8080 1.0 \n",
+ "\n",
+ " waterfront view condition ... sqft_basement yr_built yr_renovated \\\n",
+ "0 NO NONE Average ... 0.0 1955 0.0 \n",
+ "1 NO NONE Average ... 400.0 1951 1991.0 \n",
+ "2 NO NONE Average ... 0.0 1933 0.0 \n",
+ "3 NO NONE Very Good ... 910.0 1965 0.0 \n",
+ "4 NO NONE Average ... 0.0 1987 0.0 \n",
+ "\n",
+ " zipcode lat long sqft_living15 sqft_lot15 month_sold year_sold \n",
+ "0 98178 47.5112 -122.257 1340 5650 10 2014 \n",
+ "1 98125 47.7210 -122.319 1690 7639 12 2014 \n",
+ "2 98028 47.7379 -122.233 2720 8062 2 2015 \n",
+ "3 98136 47.5208 -122.393 1360 5000 12 2014 \n",
+ "4 98074 47.6168 -122.045 1800 7503 2 2015 \n",
+ "\n",
+ "[5 rows x 22 columns]\n"
+ ]
+ }
+ ],
+ "source": [
+ "# Spliting the date into month, day, and year\n",
+ "date_split = kc_data_df['date'].str.split('/', expand=True)\n",
+ "\n",
+ "# Creating new columns for month and year and converting the values to integers\n",
+ "kc_data_df['month_sold'] = date_split[0].astype(int)\n",
+ "kc_data_df['year_sold'] = date_split[2].astype(int)\n",
+ "\n",
+ "# Droping the original date column\n",
+ "kc_data_df.drop(columns=['date'], inplace=True)\n",
+ "\n",
+ "# Verifying the changes\n",
+ "print(\"DataFrame after splitting date:\")\n",
+ "print(kc_data_df.head())\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "We can see that the two columns have been created and added to our dataset"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### VALIDITY"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 71,
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "id 5.185851e+09\n",
+ "price 3.230000e+05\n",
+ "bedrooms 1.000000e+00\n",
+ "bathrooms 7.500000e-01\n",
+ "sqft_living 1.120000e+03\n",
+ "sqft_lot 5.645000e+03\n",
+ "floors 1.000000e+00\n",
+ "sqft_above 1.020000e+03\n",
+ "sqft_basement 5.500000e+02\n",
+ "yr_built 4.600000e+01\n",
+ "yr_renovated 0.000000e+00\n",
+ "zipcode 8.500000e+01\n",
+ "lat 2.069000e-01\n",
+ "long 2.030000e-01\n",
+ "sqft_living15 8.700000e+02\n",
+ "sqft_lot15 4.983000e+03\n",
+ "month_sold 5.000000e+00\n",
+ "year_sold 1.000000e+00\n",
+ "dtype: float64\n"
+ ]
+ }
+ ],
+ "source": [
+ "# computing the interquartile range (IQR) of each numeric column as the basis for checking for outliers\n",
+ "# numeric_only=True: the frame still holds text columns, which quantile() cannot process\n",
+ "\n",
+ "Q1 = kc_data_df.quantile(0.25, numeric_only=True) # First quartile\n",
+ "Q3 = kc_data_df.quantile(0.75, numeric_only=True) # Third quartile\n",
+ "IQR = Q3 - Q1\n",
+ "\n",
+ "print(IQR)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "The values above are the interquartile range (IQR) of each numeric column, not the outliers themselves. Let's visualize the columns using boxplots to investigate the outliers further"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 72,
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "image/png": "\n",
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {
+ "needs_background": "light"
+ },
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "\n",
+ "# Plotting box plots to visualize our outliers\n",
+ "# Creating a list of column names excluding non-numeric columns (if any)\n",
+ "numeric_columns = kc_data_df.select_dtypes(include=['number']).columns\n",
+ "\n",
+ "# Seting up the figure and axes\n",
+ "fig, axes = plt.subplots(nrows=len(numeric_columns), figsize=(10, 6 * len(numeric_columns)))\n",
+ "\n",
+ "# Iterating over each numeric column and create a boxplot\n",
+ "for i, column in enumerate(numeric_columns):\n",
+ " ax = axes[i] if len(numeric_columns) > 1 else axes # If there's only one numeric column, axes is not a list\n",
+ " \n",
+ " # Creating the boxplots\n",
+ " sns.boxplot(x=kc_data_df[column], ax=ax)\n",
+ " \n",
+ " # Seting titles and labels\n",
+ " ax.set_title(f'Boxplot of {column}')\n",
+ " ax.set_xlabel(column)\n",
+ " \n",
+ "\n",
+ "plt.tight_layout(pad=3.0) # Adjusting the spacing between subplots\n",
+ "plt.subplots_adjust(top=0.95) # Adjusting the top margin to accommodate titles \n",
+ "\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "There is a significant number of outliers in several of our columns; let's remove them"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 73,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Removing the outliers \n",
+ "# outliers_df_iqr = kc_data_df[~((kc_data_df < (Q1 - 1.5 * IQR)) |(kc_data_df > (Q3 + 1.5 * IQR))).any(axis=1)]\n",
+ "\n",
+ "#print(outliers_df_iqr.shape) # The number of rows and columns in the new dataframe"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "After we began modelling we chose to undo the removal of the outliers, since removing them significantly affected the performance of our models. The removal code above is therefore left commented out."
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Conversion of the bedrooms, zipcode and floors features into Categorical Features\n",
+ "\n",
+ "Analysing the `bedrooms`, `zipcode` and `floors` features in our dataset, we observe that although they are stored in numerical form, they are actually categorical features."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 74,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# creating a function to change the data type of values of a specific column\n",
+ "def change_dtype(df, column_name):\n",
+ "    \"\"\"Cast ``df[column_name]`` to the pandas 'category' dtype in place and return the new dtype.\"\"\"\n",
+ "    as_category = df[column_name].astype('category')\n",
+ "    df[column_name] = as_category # the frame that was passed in is modified\n",
+ "    return as_category.dtype\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 75,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "CategoricalDtype(categories=[1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 33], ordered=False)"
+ ]
+ },
+ "execution_count": 75,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "change_dtype(kc_data_df, 'bedrooms') # changing the bedroom feature into categorical data type"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 76,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "CategoricalDtype(categories=[1.0, 1.5, 2.0, 2.5, 3.0, 3.5], ordered=False)"
+ ]
+ },
+ "execution_count": 76,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "change_dtype(kc_data_df, 'floors') # changing the floors feature into categorical data type"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 77,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "CategoricalDtype(categories=[98001, 98002, 98003, 98004, 98005, 98006, 98007, 98008,\n",
+ " 98010, 98011, 98014, 98019, 98022, 98023, 98024, 98027,\n",
+ " 98028, 98029, 98030, 98031, 98032, 98033, 98034, 98038,\n",
+ " 98039, 98040, 98042, 98045, 98052, 98053, 98055, 98056,\n",
+ " 98058, 98059, 98065, 98070, 98072, 98074, 98075, 98077,\n",
+ " 98092, 98102, 98103, 98105, 98106, 98107, 98108, 98109,\n",
+ " 98112, 98115, 98116, 98117, 98118, 98119, 98122, 98125,\n",
+ " 98126, 98133, 98136, 98144, 98146, 98148, 98155, 98166,\n",
+ " 98168, 98177, 98178, 98188, 98198, 98199],\n",
+ " ordered=False)"
+ ]
+ },
+ "execution_count": 77,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "change_dtype(kc_data_df, 'zipcode') # changing the zipcode feature into categorical data type"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### FEATURE ENGINEERING"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Let's create additional columns: the age of a house, computed as the difference between the reference year 2015 (when the data was last recorded) and yr_built, and the age of the house since renovation, computed as the difference between 2015 and yr_renovated.\n",
+ "\n",
+ "These new columns will help to determine whether the age of a house since it was built or renovated has an impact on the price of the house"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 78,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# creating a new column 'age_built' showing the age of house since its was built upto 2015\n",
+ "kc_data_df['age_built'] = 2015 - kc_data_df['yr_built']"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 79,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "#creating a new column 'age_renovated'showing the age of a house after renovation\n",
+ "kc_data_df['age_renovated'] = (2015 - kc_data_df['yr_renovated']).apply(lambda x: 0 if x == 2015 else x)"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 80,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "Index(['id', 'price', 'bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot',\n",
+ " 'floors', 'waterfront', 'view', 'condition', 'grade', 'sqft_above',\n",
+ " 'sqft_basement', 'yr_built', 'yr_renovated', 'zipcode', 'lat', 'long',\n",
+ " 'sqft_living15', 'sqft_lot15', 'month_sold', 'year_sold', 'age_built',\n",
+ " 'age_renovated'],\n",
+ " dtype='object')"
+ ]
+ },
+ "execution_count": 80,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "kc_data_df.columns # checking to see if the two columns have been added"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 81,
+ "metadata": {},
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " price | \n",
+ " bedrooms | \n",
+ " bathrooms | \n",
+ " sqft_living | \n",
+ " sqft_lot | \n",
+ " floors | \n",
+ " waterfront | \n",
+ " view | \n",
+ " condition | \n",
+ " ... | \n",
+ " yr_renovated | \n",
+ " zipcode | \n",
+ " lat | \n",
+ " long | \n",
+ " sqft_living15 | \n",
+ " sqft_lot15 | \n",
+ " month_sold | \n",
+ " year_sold | \n",
+ " age_built | \n",
+ " age_renovated | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 7129300520 | \n",
+ " 221900.0 | \n",
+ " 3 | \n",
+ " 1.00 | \n",
+ " 1180 | \n",
+ " 5650 | \n",
+ " 1.0 | \n",
+ " NO | \n",
+ " NONE | \n",
+ " Average | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 98178 | \n",
+ " 47.5112 | \n",
+ " -122.257 | \n",
+ " 1340 | \n",
+ " 5650 | \n",
+ " 10 | \n",
+ " 2014 | \n",
+ " 60 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 6414100192 | \n",
+ " 538000.0 | \n",
+ " 3 | \n",
+ " 2.25 | \n",
+ " 2570 | \n",
+ " 7242 | \n",
+ " 2.0 | \n",
+ " NO | \n",
+ " NONE | \n",
+ " Average | \n",
+ " ... | \n",
+ " 1991.0 | \n",
+ " 98125 | \n",
+ " 47.7210 | \n",
+ " -122.319 | \n",
+ " 1690 | \n",
+ " 7639 | \n",
+ " 12 | \n",
+ " 2014 | \n",
+ " 64 | \n",
+ " 24.0 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 5631500400 | \n",
+ " 180000.0 | \n",
+ " 2 | \n",
+ " 1.00 | \n",
+ " 770 | \n",
+ " 10000 | \n",
+ " 1.0 | \n",
+ " NO | \n",
+ " NONE | \n",
+ " Average | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 98028 | \n",
+ " 47.7379 | \n",
+ " -122.233 | \n",
+ " 2720 | \n",
+ " 8062 | \n",
+ " 2 | \n",
+ " 2015 | \n",
+ " 82 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 2487200875 | \n",
+ " 604000.0 | \n",
+ " 4 | \n",
+ " 3.00 | \n",
+ " 1960 | \n",
+ " 5000 | \n",
+ " 1.0 | \n",
+ " NO | \n",
+ " NONE | \n",
+ " Very Good | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 98136 | \n",
+ " 47.5208 | \n",
+ " -122.393 | \n",
+ " 1360 | \n",
+ " 5000 | \n",
+ " 12 | \n",
+ " 2014 | \n",
+ " 50 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1954400510 | \n",
+ " 510000.0 | \n",
+ " 3 | \n",
+ " 2.00 | \n",
+ " 1680 | \n",
+ " 8080 | \n",
+ " 1.0 | \n",
+ " NO | \n",
+ " NONE | \n",
+ " Average | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 98074 | \n",
+ " 47.6168 | \n",
+ " -122.045 | \n",
+ " 1800 | \n",
+ " 7503 | \n",
+ " 2 | \n",
+ " 2015 | \n",
+ " 28 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 7237550310 | \n",
+ " 1230000.0 | \n",
+ " 4 | \n",
+ " 4.50 | \n",
+ " 5420 | \n",
+ " 101930 | \n",
+ " 1.0 | \n",
+ " NO | \n",
+ " NONE | \n",
+ " Average | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 98053 | \n",
+ " 47.6561 | \n",
+ " -122.005 | \n",
+ " 4760 | \n",
+ " 101930 | \n",
+ " 5 | \n",
+ " 2014 | \n",
+ " 14 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " 1321400060 | \n",
+ " 257500.0 | \n",
+ " 3 | \n",
+ " 2.25 | \n",
+ " 1715 | \n",
+ " 6819 | \n",
+ " 2.0 | \n",
+ " NO | \n",
+ " NONE | \n",
+ " Average | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 98003 | \n",
+ " 47.3097 | \n",
+ " -122.327 | \n",
+ " 2238 | \n",
+ " 6819 | \n",
+ " 6 | \n",
+ " 2014 | \n",
+ " 20 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " 2008000270 | \n",
+ " 291850.0 | \n",
+ " 3 | \n",
+ " 1.50 | \n",
+ " 1060 | \n",
+ " 9711 | \n",
+ " 1.0 | \n",
+ " NO | \n",
+ " NONE | \n",
+ " Average | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 98198 | \n",
+ " 47.4095 | \n",
+ " -122.315 | \n",
+ " 1650 | \n",
+ " 9711 | \n",
+ " 1 | \n",
+ " 2015 | \n",
+ " 52 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " 2414600126 | \n",
+ " 229500.0 | \n",
+ " 3 | \n",
+ " 1.00 | \n",
+ " 1780 | \n",
+ " 7470 | \n",
+ " 1.0 | \n",
+ " NO | \n",
+ " NONE | \n",
+ " Average | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 98146 | \n",
+ " 47.5123 | \n",
+ " -122.337 | \n",
+ " 1780 | \n",
+ " 8113 | \n",
+ " 4 | \n",
+ " 2015 | \n",
+ " 55 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " 3793500160 | \n",
+ " 323000.0 | \n",
+ " 3 | \n",
+ " 2.50 | \n",
+ " 1890 | \n",
+ " 6560 | \n",
+ " 2.0 | \n",
+ " NO | \n",
+ " NONE | \n",
+ " Average | \n",
+ " ... | \n",
+ " 0.0 | \n",
+ " 98038 | \n",
+ " 47.3684 | \n",
+ " -122.031 | \n",
+ " 2390 | \n",
+ " 7570 | \n",
+ " 3 | \n",
+ " 2015 | \n",
+ " 12 | \n",
+ " 0.0 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
10 rows × 24 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id price bedrooms bathrooms sqft_living sqft_lot floors \\\n",
+ "0 7129300520 221900.0 3 1.00 1180 5650 1.0 \n",
+ "1 6414100192 538000.0 3 2.25 2570 7242 2.0 \n",
+ "2 5631500400 180000.0 2 1.00 770 10000 1.0 \n",
+ "3 2487200875 604000.0 4 3.00 1960 5000 1.0 \n",
+ "4 1954400510 510000.0 3 2.00 1680 8080 1.0 \n",
+ "5 7237550310 1230000.0 4 4.50 5420 101930 1.0 \n",
+ "6 1321400060 257500.0 3 2.25 1715 6819 2.0 \n",
+ "7 2008000270 291850.0 3 1.50 1060 9711 1.0 \n",
+ "8 2414600126 229500.0 3 1.00 1780 7470 1.0 \n",
+ "9 3793500160 323000.0 3 2.50 1890 6560 2.0 \n",
+ "\n",
+ " waterfront view condition ... yr_renovated zipcode lat long \\\n",
+ "0 NO NONE Average ... 0.0 98178 47.5112 -122.257 \n",
+ "1 NO NONE Average ... 1991.0 98125 47.7210 -122.319 \n",
+ "2 NO NONE Average ... 0.0 98028 47.7379 -122.233 \n",
+ "3 NO NONE Very Good ... 0.0 98136 47.5208 -122.393 \n",
+ "4 NO NONE Average ... 0.0 98074 47.6168 -122.045 \n",
+ "5 NO NONE Average ... 0.0 98053 47.6561 -122.005 \n",
+ "6 NO NONE Average ... 0.0 98003 47.3097 -122.327 \n",
+ "7 NO NONE Average ... 0.0 98198 47.4095 -122.315 \n",
+ "8 NO NONE Average ... 0.0 98146 47.5123 -122.337 \n",
+ "9 NO NONE Average ... 0.0 98038 47.3684 -122.031 \n",
+ "\n",
+ " sqft_living15 sqft_lot15 month_sold year_sold age_built age_renovated \n",
+ "0 1340 5650 10 2014 60 0.0 \n",
+ "1 1690 7639 12 2014 64 24.0 \n",
+ "2 2720 8062 2 2015 82 0.0 \n",
+ "3 1360 5000 12 2014 50 0.0 \n",
+ "4 1800 7503 2 2015 28 0.0 \n",
+ "5 4760 101930 5 2014 14 0.0 \n",
+ "6 2238 6819 6 2014 20 0.0 \n",
+ "7 1650 9711 1 2015 52 0.0 \n",
+ "8 1780 8113 4 2015 55 0.0 \n",
+ "9 2390 7570 3 2015 12 0.0 \n",
+ "\n",
+ "[10 rows x 24 columns]"
+ ]
+ },
+ "execution_count": 81,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "kc_data_df.head(10) # viewing the first ten rows of our data"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "Let's create a new column named `season_sold` that tells us the season in which each house was sold. This will help us identify which season had the most sales and whether the season determines the price of a house."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 82,
"metadata": {},
"outputs": [],
"source": [
- "# Your code here - remember to use markdown cells for comments as well!"
+ "# lookup table from calendar month number (1-12) to season, keys in ascending month order;\n",
+ "# December is listed on its own so that it shares 'Winter' with January and February\n",
+ "season_dict = {\n",
+ "    **dict.fromkeys((1, 2), 'Winter'),\n",
+ "    **dict.fromkeys((3, 4, 5), 'Spring'),\n",
+ "    **dict.fromkeys((6, 7, 8), 'Summer'),\n",
+ "    **dict.fromkeys((9, 10, 11), 'Fall'),\n",
+ "    12: 'Winter',\n",
+ "}\n",
+ "\n",
+ "# label each sale with its season by looking up the numeric month_sold value;\n",
+ "# Series.map leaves NaN for any month that is not a key of season_dict\n",
+ "kc_data_df['season_sold'] = (\n",
+ "    kc_data_df['month_sold'].map(season_dict)\n",
+ ")"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 83,
+ "metadata": {
+ "scrolled": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " id | \n",
+ " price | \n",
+ " bedrooms | \n",
+ " bathrooms | \n",
+ " sqft_living | \n",
+ " sqft_lot | \n",
+ " floors | \n",
+ " waterfront | \n",
+ " view | \n",
+ " condition | \n",
+ " ... | \n",
+ " zipcode | \n",
+ " lat | \n",
+ " long | \n",
+ " sqft_living15 | \n",
+ " sqft_lot15 | \n",
+ " month_sold | \n",
+ " year_sold | \n",
+ " age_built | \n",
+ " age_renovated | \n",
+ " season_sold | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 7129300520 | \n",
+ " 221900.0 | \n",
+ " 3 | \n",
+ " 1.00 | \n",
+ " 1180 | \n",
+ " 5650 | \n",
+ " 1.0 | \n",
+ " NO | \n",
+ " NONE | \n",
+ " Average | \n",
+ " ... | \n",
+ " 98178 | \n",
+ " 47.5112 | \n",
+ " -122.257 | \n",
+ " 1340 | \n",
+ " 5650 | \n",
+ " 10 | \n",
+ " 2014 | \n",
+ " 60 | \n",
+ " 0.0 | \n",
+ " Fall | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 6414100192 | \n",
+ " 538000.0 | \n",
+ " 3 | \n",
+ " 2.25 | \n",
+ " 2570 | \n",
+ " 7242 | \n",
+ " 2.0 | \n",
+ " NO | \n",
+ " NONE | \n",
+ " Average | \n",
+ " ... | \n",
+ " 98125 | \n",
+ " 47.7210 | \n",
+ " -122.319 | \n",
+ " 1690 | \n",
+ " 7639 | \n",
+ " 12 | \n",
+ " 2014 | \n",
+ " 64 | \n",
+ " 24.0 | \n",
+ " Winter | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 5631500400 | \n",
+ " 180000.0 | \n",
+ " 2 | \n",
+ " 1.00 | \n",
+ " 770 | \n",
+ " 10000 | \n",
+ " 1.0 | \n",
+ " NO | \n",
+ " NONE | \n",
+ " Average | \n",
+ " ... | \n",
+ " 98028 | \n",
+ " 47.7379 | \n",
+ " -122.233 | \n",
+ " 2720 | \n",
+ " 8062 | \n",
+ " 2 | \n",
+ " 2015 | \n",
+ " 82 | \n",
+ " 0.0 | \n",
+ " Winter | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 2487200875 | \n",
+ " 604000.0 | \n",
+ " 4 | \n",
+ " 3.00 | \n",
+ " 1960 | \n",
+ " 5000 | \n",
+ " 1.0 | \n",
+ " NO | \n",
+ " NONE | \n",
+ " Very Good | \n",
+ " ... | \n",
+ " 98136 | \n",
+ " 47.5208 | \n",
+ " -122.393 | \n",
+ " 1360 | \n",
+ " 5000 | \n",
+ " 12 | \n",
+ " 2014 | \n",
+ " 50 | \n",
+ " 0.0 | \n",
+ " Winter | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 1954400510 | \n",
+ " 510000.0 | \n",
+ " 3 | \n",
+ " 2.00 | \n",
+ " 1680 | \n",
+ " 8080 | \n",
+ " 1.0 | \n",
+ " NO | \n",
+ " NONE | \n",
+ " Average | \n",
+ " ... | \n",
+ " 98074 | \n",
+ " 47.6168 | \n",
+ " -122.045 | \n",
+ " 1800 | \n",
+ " 7503 | \n",
+ " 2 | \n",
+ " 2015 | \n",
+ " 28 | \n",
+ " 0.0 | \n",
+ " Winter | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 21592 | \n",
+ " 263000018 | \n",
+ " 360000.0 | \n",
+ " 3 | \n",
+ " 2.50 | \n",
+ " 1530 | \n",
+ " 1131 | \n",
+ " 3.0 | \n",
+ " NO | \n",
+ " NONE | \n",
+ " Average | \n",
+ " ... | \n",
+ " 98103 | \n",
+ " 47.6993 | \n",
+ " -122.346 | \n",
+ " 1530 | \n",
+ " 1509 | \n",
+ " 5 | \n",
+ " 2014 | \n",
+ " 6 | \n",
+ " 0.0 | \n",
+ " Spring | \n",
+ "
\n",
+ " \n",
+ " 21593 | \n",
+ " 6600060120 | \n",
+ " 400000.0 | \n",
+ " 4 | \n",
+ " 2.50 | \n",
+ " 2310 | \n",
+ " 5813 | \n",
+ " 2.0 | \n",
+ " NO | \n",
+ " NONE | \n",
+ " Average | \n",
+ " ... | \n",
+ " 98146 | \n",
+ " 47.5107 | \n",
+ " -122.362 | \n",
+ " 1830 | \n",
+ " 7200 | \n",
+ " 2 | \n",
+ " 2015 | \n",
+ " 1 | \n",
+ " 0.0 | \n",
+ " Winter | \n",
+ "
\n",
+ " \n",
+ " 21594 | \n",
+ " 1523300141 | \n",
+ " 402101.0 | \n",
+ " 2 | \n",
+ " 0.75 | \n",
+ " 1020 | \n",
+ " 1350 | \n",
+ " 2.0 | \n",
+ " NO | \n",
+ " NONE | \n",
+ " Average | \n",
+ " ... | \n",
+ " 98144 | \n",
+ " 47.5944 | \n",
+ " -122.299 | \n",
+ " 1020 | \n",
+ " 2007 | \n",
+ " 6 | \n",
+ " 2014 | \n",
+ " 6 | \n",
+ " 0.0 | \n",
+ " Summer | \n",
+ "
\n",
+ " \n",
+ " 21595 | \n",
+ " 291310100 | \n",
+ " 400000.0 | \n",
+ " 3 | \n",
+ " 2.50 | \n",
+ " 1600 | \n",
+ " 2388 | \n",
+ " 2.0 | \n",
+ " NO | \n",
+ " NONE | \n",
+ " Average | \n",
+ " ... | \n",
+ " 98027 | \n",
+ " 47.5345 | \n",
+ " -122.069 | \n",
+ " 1410 | \n",
+ " 1287 | \n",
+ " 1 | \n",
+ " 2015 | \n",
+ " 11 | \n",
+ " 0.0 | \n",
+ " Winter | \n",
+ "
\n",
+ " \n",
+ " 21596 | \n",
+ " 1523300157 | \n",
+ " 325000.0 | \n",
+ " 2 | \n",
+ " 0.75 | \n",
+ " 1020 | \n",
+ " 1076 | \n",
+ " 2.0 | \n",
+ " NO | \n",
+ " NONE | \n",
+ " Average | \n",
+ " ... | \n",
+ " 98144 | \n",
+ " 47.5941 | \n",
+ " -122.299 | \n",
+ " 1020 | \n",
+ " 1357 | \n",
+ " 10 | \n",
+ " 2014 | \n",
+ " 7 | \n",
+ " 0.0 | \n",
+ " Fall | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
21597 rows × 25 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " id price bedrooms bathrooms sqft_living sqft_lot floors \\\n",
+ "0 7129300520 221900.0 3 1.00 1180 5650 1.0 \n",
+ "1 6414100192 538000.0 3 2.25 2570 7242 2.0 \n",
+ "2 5631500400 180000.0 2 1.00 770 10000 1.0 \n",
+ "3 2487200875 604000.0 4 3.00 1960 5000 1.0 \n",
+ "4 1954400510 510000.0 3 2.00 1680 8080 1.0 \n",
+ "... ... ... ... ... ... ... ... \n",
+ "21592 263000018 360000.0 3 2.50 1530 1131 3.0 \n",
+ "21593 6600060120 400000.0 4 2.50 2310 5813 2.0 \n",
+ "21594 1523300141 402101.0 2 0.75 1020 1350 2.0 \n",
+ "21595 291310100 400000.0 3 2.50 1600 2388 2.0 \n",
+ "21596 1523300157 325000.0 2 0.75 1020 1076 2.0 \n",
+ "\n",
+ " waterfront view condition ... zipcode lat long \\\n",
+ "0 NO NONE Average ... 98178 47.5112 -122.257 \n",
+ "1 NO NONE Average ... 98125 47.7210 -122.319 \n",
+ "2 NO NONE Average ... 98028 47.7379 -122.233 \n",
+ "3 NO NONE Very Good ... 98136 47.5208 -122.393 \n",
+ "4 NO NONE Average ... 98074 47.6168 -122.045 \n",
+ "... ... ... ... ... ... ... ... \n",
+ "21592 NO NONE Average ... 98103 47.6993 -122.346 \n",
+ "21593 NO NONE Average ... 98146 47.5107 -122.362 \n",
+ "21594 NO NONE Average ... 98144 47.5944 -122.299 \n",
+ "21595 NO NONE Average ... 98027 47.5345 -122.069 \n",
+ "21596 NO NONE Average ... 98144 47.5941 -122.299 \n",
+ "\n",
+ " sqft_living15 sqft_lot15 month_sold year_sold age_built \\\n",
+ "0 1340 5650 10 2014 60 \n",
+ "1 1690 7639 12 2014 64 \n",
+ "2 2720 8062 2 2015 82 \n",
+ "3 1360 5000 12 2014 50 \n",
+ "4 1800 7503 2 2015 28 \n",
+ "... ... ... ... ... ... \n",
+ "21592 1530 1509 5 2014 6 \n",
+ "21593 1830 7200 2 2015 1 \n",
+ "21594 1020 2007 6 2014 6 \n",
+ "21595 1410 1287 1 2015 11 \n",
+ "21596 1020 1357 10 2014 7 \n",
+ "\n",
+ " age_renovated season_sold \n",
+ "0 0.0 Fall \n",
+ "1 24.0 Winter \n",
+ "2 0.0 Winter \n",
+ "3 0.0 Winter \n",
+ "4 0.0 Winter \n",
+ "... ... ... \n",
+ "21592 0.0 Spring \n",
+ "21593 0.0 Winter \n",
+ "21594 0.0 Summer \n",
+ "21595 0.0 Winter \n",
+ "21596 0.0 Fall \n",
+ "\n",
+ "[21597 rows x 25 columns]"
+ ]
+ },
+ "execution_count": 83,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "kc_data_df # checking if the new column was added"
]
}
],
@@ -40,7 +2057,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
- "version": "3.6.4"
+ "version": "3.8.5"
}
},
"nbformat": 4,