From d8a9e6435fb14ee67336f5d597c0472d5bbf3b16 Mon Sep 17 00:00:00 2001 From: clydeochieng <107258512+clydeochieng@users.noreply.github.com> Date: Sat, 27 Apr 2024 18:46:36 +0300 Subject: [PATCH 01/42] Update student.ipynb --- student.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/student.ipynb b/student.ipynb index d3bb34af..d4ba3217 100644 --- a/student.ipynb +++ b/student.ipynb @@ -7,7 +7,7 @@ "## Final Project Submission\n", "\n", "Please fill out:\n", - "* Student name: \n", + "* Student name: Solphine Joseph\n", "* Student pace: self paced / part time / full time\n", "* Scheduled project review date/time: \n", "* Instructor name: \n", From edf85ba829ba11bb5049eb9a34db1a33f06081cf Mon Sep 17 00:00:00 2001 From: clydeochieng <107258512+clydeochieng@users.noreply.github.com> Date: Sat, 27 Apr 2024 19:08:04 +0300 Subject: [PATCH 02/42] Update student.ipynb --- student.ipynb | 41 ++++++++++++++++++++++++++++++++++------- 1 file changed, 34 insertions(+), 7 deletions(-) diff --git a/student.ipynb b/student.ipynb index d4ba3217..a39a5a52 100644 --- a/student.ipynb +++ b/student.ipynb @@ -7,20 +7,47 @@ "## Final Project Submission\n", "\n", "Please fill out:\n", - "* Student name: Solphine Joseph\n", - "* Student pace: self paced / part time / full time\n", + "* Student name: Solphine Joseph, Grace Rotich, Mather Rotich, Hilary Simiyu, Clyde Ochieng.\n", + "* Student pace: full time\n", "* Scheduled project review date/time: \n", - "* Instructor name: \n", + "* Instructor name: Nikita \n", "* Blog post URL:\n" ] }, { - "cell_type": "code", - "execution_count": null, + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Kings County Housing Analysis with Multiple Linear Regression" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Overview\n", + "\n", + "\n", + "A real estate agency in Kingsway seeks to determine what are the contributing factors that affect the price of houses to make improvements where 
necessary. They want to employ an analytical approach rather than sentimental before arriving at a decision. Multilinear regression has been used for this project to understand how various features affect their pricing to better their services." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Business Problem\n", + "\n", + "In the face of market fluctuations and heightened competition within the real estate sector, our agency is grappling with pricing volatility, which poses significant challenges for our agents in devising effective business strategies. We seek strategic guidance to optimize our purchasing and selling endeavors, prioritizing informed decision-making to identify key areas of focus that promise maximum returns on investment." + ] + }, + { + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "# Your code here - remember to use markdown cells for comments as well!" + "### Objectives\n", + "* To determine the key factors influencing house prices.\n", + "* To develop multilinear regression models to predict house prices based on relevant features.\n", + "* To use insights from the regression analysis to optimize pricing strategies for both purchasing and selling properties.\n" ] } ], From 9a914828148112c36789f9fdad61f7b00a90897b Mon Sep 17 00:00:00 2001 From: clydeochieng <107258512+clydeochieng@users.noreply.github.com> Date: Sat, 27 Apr 2024 19:08:08 +0300 Subject: [PATCH 03/42] Update README.md --- README.md | 288 ++---------------------------------------------------- 1 file changed, 8 insertions(+), 280 deletions(-) diff --git a/README.md b/README.md index 5dd0f84d..39359350 100644 --- a/README.md +++ b/README.md @@ -1,285 +1,13 @@ # Phase 2 Project Description +## Overview -Another module down - you're almost half way there! 
-![awesome](https://raw.githubusercontent.com/learn-co-curriculum/dsc-phase-2-project-v2-3/main/halfway-there.gif) +A real estate agency in Kingsway seeks to determine what are the contributing factors that affect the price of houses to make improvements where necessary. They want to employ an analytical approach rather than sentimental before arriving at a decision. Multilinear regression has been used for this project to understand how various features affect their pricing to better their services. +## Business Problem -All that remains in Phase 2 is to put your newfound data science skills to use with a large project! +In the face of market fluctuations and heightened competition within the real estate sector, our agency is grappling with pricing volatility, which poses significant challenges for our agents in devising effective business strategies. We seek strategic guidance to optimize our purchasing and selling endeavors, prioritizing informed decision-making to identify key areas of focus that promise maximum returns on investment. +### Objectives -In this project description, we will cover: - -* Project Overview: the project goal, audience, and dataset -* Deliverables: the specific items you are required to produce for this project -* Grading: how your project will be scored -* Getting Started: guidance for how to begin working - -## Project Overview - -For this project, you will use multiple linear regression modeling to analyze house sales in a northwestern county. - -### Business Problem - -It is up to you to define a stakeholder and business problem appropriate to this dataset. - -If you are struggling to define a stakeholder, we recommend you complete a project for a real estate agency that helps homeowners buy and/or sell homes. A business problem you could focus on for this stakeholder is the need to provide advice to homeowners about how home renovations might increase the estimated value of their homes, and by what amount. 
- -### The Data - -This project uses the King County House Sales dataset, which can be found in `kc_house_data.csv` in the data folder in this assignment's GitHub repository. The description of the column names can be found in `column_names.md` in the same folder. As with most real world data sets, the column names are not perfectly described, so you'll have to do some research or use your best judgment if you have questions about what the data means. - -It is up to you to decide what data from this dataset to use and how to use it. If you are feeling overwhelmed or behind, we recommend you **ignore** some or all of the following features: - -* `date` -* `view` -* `sqft_above` -* `sqft_basement` -* `yr_renovated` -* `zipcode` -* `lat` -* `long` -* `sqft_living15` -* `sqft_lot15` - -### Key Points - -* **Your goal in regression modeling is to yield findings to support relevant recommendations. Those findings should include a metric describing overall model performance as well as at least two regression model coefficients.** As you explore the data and refine your stakeholder and business problem definitions, make sure you are also thinking about how a linear regression model adds value to your analysis. "The assignment was to use linear regression" is not an acceptable answer! You can also use additional statistical techniques other than linear regression, so long as you clearly explain why you are using each technique. - -* **You should demonstrate an iterative approach to modeling.** This means that you must build multiple models. Begin with a basic model, evaluate it, and then provide justification for and proceed to a new model. After you finish refining your models, you should provide 1-3 paragraphs in the notebook discussing your final model. - -* **Data visualization and analysis are no longer explicit project requirements, but they are still very important.** In Phase 1, your project stopped earlier in the CRISP-DM process. 
Now you are going a step further, to modeling. Data visualization and analysis will help you build better models and tell a better story to your stakeholders. - -## Deliverables - -There are three deliverables for this project: - -* A **non-technical presentation** -* A **Jupyter Notebook** -* A **GitHub repository** - -The deliverables requirements are almost the same as in the Phase 1 Project, and you can review those extended descriptions [here](https://github.com/learn-co-curriculum/dsc-phase-1-project-v2-3#deliverables). In general, everything is the same except the "Data Visualization" and "Data Analysis" requirements have been replaced by "Modeling" and "Regression Results" requirements. - -### Non-Technical Presentation - -Recall that the non-technical presentation is a slide deck presenting your analysis to ***business stakeholders***, and should be presented live as well as submitted in PDF form on Canvas. - -We recommend that you follow this structure, although the slide titles should be specific to your project: - -1. Beginning - - Overview - - Business and Data Understanding -2. Middle - - **Modeling** - - **Regression Results** -3. End - - Recommendations - - Next Steps - - Thank you - -Make sure that your discussion of modeling and regression results is geared towards a non-technical audience! Assume that their prior knowledge of regression modeling is minimal. You don't need to explain how linear regression works, but you should explain why linear regression is useful for the problem context. Make sure you translate any metrics or coefficients into their plain language implications. - -The graded elements for the non-technical presentation are the same as in [Phase 1](https://github.com/learn-co-curriculum/dsc-phase-1-project-v2-3#deliverables). - -### Jupyter Notebook - -Recall that the Jupyter Notebook is a notebook that uses Python and Markdown to present your analysis to a ***data science audience***. 
You will submit the notebook in PDF format on Canvas as well as in `.ipynb` format in your GitHub repository. - -The graded elements for the Jupyter Notebook are: - -* Business Understanding -* Data Understanding -* Data Preparation -* **Modeling** -* **Regression Results** -* Code Quality - -### GitHub Repository - -Recall that the GitHub repository is the cloud-hosted directory containing all of your project files as well as their version history. - -The requirements are the same as in [Phase 1](https://github.com/learn-co-curriculum/dsc-phase-1-project-v2-3#github-repository), except for the required sections in the `README.md`. - -For this project, the `README.md` file should contain: - -* Overview -* Business and Data Understanding - * Explain your stakeholder audience here -* **Modeling** -* **Regression Results** -* Conclusion - -Just like in Phase 1, the `README.md` file should be the bridge between your non technical presentation and the Jupyter Notebook. It should not contain the code used to develop your analysis, but should provide a more in-depth explanation of your methodology and analysis than what is described in your presentation slides. - -## Grading - -***To pass this project, you must pass each project rubric objective.*** The project rubric objectives for Phase 2 are: - -1. Attention to Detail -2. Statistical Communication -3. Data Preparation Fundamentals -4. Linear Modeling - -### Attention to Detail - -Just like in Phase 1, this rubric objective is based on your completion of checklist items. ***In Phase 2, you need to complete 70% (7 out of 10) or more of the checklist elements in order to pass the Attention to Detail objective.*** - -**NOTE THAT THE PASSING BAR IS HIGHER IN PHASE 2 THAN IT WAS IN PHASE 1!** - -The standard will increase with each Phase, until you will be required to complete all elements to pass Phase 5 (Capstone). 
- -#### Exceeds Objective - -80% or more of the project checklist items are complete - -#### Meets Objective (Passing Bar) - -70% of the project checklist items are complete - -#### Approaching Objective - -60% of the project checklist items are complete - -#### Does Not Meet Objective - -50% or fewer of the project checklist items are complete - -### Statistical Communication - -Recall that communication is one of the key data science "soft skills". In Phase 2, we are specifically focused on Statistical Communication. We define Statistical Communication as: - -> Communicating **results of statistical analyses** to diverse audiences via writing and live presentation - -Note that this is the same as in Phase 1, except we are replacing "basic data analysis" with "statistical analyses". - -High-quality Statistical Communication includes rationale, results, limitations, and recommendations: - -* **Rationale:** Explaining why you are using statistical analyses rather than basic data analysis - * For example, why are you using regression coefficients rather than just a graph? - * What about the problem or data is suitable for this form of analysis? - * For a data science audience, this includes your reasoning for the changes you applied while iterating between models. -* **Results:** Describing the overall model metrics and feature coefficients - * You need at least one overall model metric (e.g. r-squared or RMSE) and at least two feature coefficients. - * For a business audience, make sure you connect any metrics to real-world implications. You do not need to get into the details of how linear regression works. - * For a data science audience, you don't need to explain what a metric is, but make sure you explain why you chose that particular one. -* **Limitations:** Identifying the limitations and/or uncertainty present in your analysis - * This could include p-values/alpha values, confidence intervals, assumptions of linear regression, missing data, etc. 
- * In general, this should be more in-depth for a data science audience and more surface-level for a business audience. -* **Recommendations:** Interpreting the model results and limitations in the context of the business problem - * What should stakeholders _do_ with this information? - -#### Exceeds Objective - -Communicates the rationale, results, limitations, and specific recommendations of statistical analyses - -> See above for extended explanations of these terms. - -#### Meets Objective (Passing Bar) - -Successfully communicates the results of statistical analyses without any major errors - -> The minimum requirement is to communicate the _results_, meaning at least one overall model metric (e.g. r-squared or RMSE) as well as at least two feature coefficients. See the Approaching Objective section for an explanation of what a "major error" means. - -#### Approaching Objective - -Communicates the results of statistical analyses with at least one major error - -> A major error means that some aspect of your explanation is fundamentally incorrect. For example, if a feature coefficient is negative and you say that an increase in that feature results in an increase of the target, that would be a major error. Another example would be if you say that the feature with the highest coefficient is the "most statistically significant" while ignoring the p-value. One more example would be reporting a coefficient that is not statistically significant, rather than saying "no statistically significant linear relationship was found" - -> "**If a coefficient's t-statistic is not significant, don't interpret it at all.** You can't be sure that the value of the corresponding parameter in the underlying regression model isn't really zero." _DeVeaux, Velleman, and Bock (2012), Stats: Data and Models, 3rd edition, pg. 801_. Check out [this website](https://web.ma.utexas.edu/users/mks/statmistakes/TOC.html) for extensive additional examples of mistakes using statistics. 
- -> The easiest way to avoid making a major error is to have someone double-check your work. Reach out to peers on Slack and ask them to confirm whether your interpretation makes sense! - -#### Does Not Meet Objective - -Does not communicate the results of statistical analyses - -> It is not sufficient to just display the entire results summary. You need to pull out at least one overall model metric (e.g. r-squared, RMSE) and at least two feature coefficients, and explain what those numbers mean. - -### Data Preparation Fundamentals - -We define this objective as: - -> Applying appropriate **preprocessing** and feature engineering steps to tabular data in preparation for statistical modeling - -The two most important components of preprocessing for the Phase 2 project are: - -* **Handling Missing Values:** Missing values may be present in the features you want to use, either encoded as `NaN` or as some other value such as `"?"`. Before you can build a linear regression model, make sure you identify and address any missing values using techniques such as dropping or replacing data. -* **Handling Non-Numeric Data:** A linear regression model needs all of the features to be numeric, not categorical. For this project, ***be sure to pick at least one non-numeric feature and try including it in a model.*** You can identify that a feature is currently non-numeric if the type is `object` when you run `.info()` on your dataframe. Once you have identified the non-numeric features, address them using techniques such as ordinal or one-hot (dummy) encoding. - -There is no single correct way to handle either of these situations! Use your best judgement to decide what to do, and be sure to explain your rationale in the Markdown of your notebook. - -Feature engineering is encouraged but not required for this project. 
- -#### Exceeds Objective - -Goes above and beyond with data preparation, such as feature engineering or merging in outside datasets - -> One example of feature engineering could be using the `date` feature to create a new feature called `season`, which represents whether the home was sold in Spring, Summer, Fall, or Winter. - -> One example of merging in outside datasets could be finding data based on ZIP Code, such as household income or walkability, and joining that data with the provided CSV. - -#### Meets Objective (Passing Bar) - -Successfully prepares data for modeling, including converting at least one non-numeric feature into ordinal or binary data and handling missing data as needed - -> As a reminder, you can identify the non-numeric features by calling `.info()` on the dataframe and looking for type `object`. - -> Your final model does not necessarily need to include any features that were originally non-numeric, but you need to demonstrate your ability to handle this type of data. - -#### Approaching Objective - -Prepares some data successfully, but is unable to utilize non-numeric data - -> If you simply subset the dataframe to only columns with type `int64` or `float64`, your model will run, but you will not pass this objective. - -#### Does Not Meet Objective - -Does not prepare data for modeling - -### Linear Modeling - -According to [Kaggle's 2020 State of Data Science and Machine Learning Survey](https://www.kaggle.com/kaggle-survey-2020), linear and logistic regression are the most popular machine learning algorithms, used by 83.7% of data scientists. They are small, fast models compared to some of the models you will learn later, but have limitations in the kinds of relationships they are able to learn. - -In this project you are required to use linear regression as the primary statistical analysis, although you are free to use additional statistical techniques as appropriate. 
- -#### Exceeds Objective - -Goes above and beyond in the modeling process, such as recursive feature selection - -#### Meets Objective (Passing Bar) - -Successfully builds a baseline model as well as at least one iterated model, and correctly extracts insights from a final model without any major errors - -> We are looking for you to (1) create a baseline model, (2) iterate on that model, making adjustments that are supported by regression theory or by descriptive analysis of the data, and (3) select a final model and report on its metrics and coefficients - -> Ideally you would include written justifications for each model iteration, but at minimum the iterations must be _justifiable_ - -> For an explanation of "major errors", see the description below - -#### Approaching Objective - -Builds multiple models with at least one major error - -> The number one major error to avoid is including the target as one of your features. For example, if the target is `price` you should NOT make a "price per square foot" feature, because that feature would not be available if you didn't already know the price. - -> Other examples of major errors include: using a target other than `price`, attempting only simple linear regression (not multiple linear regression), dropping multiple one-hot encoded columns without explaining the resulting baseline, or using a unique identifier (`id` in this dataset) as a feature. - -#### Does Not Meet Objective - -Does not build multiple linear regression models - -## Getting Started - -Please start by reviewing the contents of this project description. If you have any questions, please ask your instructor ASAP. - -Next, you will need to complete the [***Project Proposal***](#project_proposal) which must be reviewed by your instructor before you can continue with the project. - -Here are some suggestions for creating your GitHub repository: - -1. 
Fork the [Phase 2 Project Repository](https://github.com/learn-co-curriculum/dsc-phase-2-project-v2-3), clone it locally, and work in the `student.ipynb` file. Make sure to also add and commit a PDF of your presentation to your repository with a file name of `presentation.pdf`. -2. Or, create a new repository from scratch by going to [github.com/new](https://github.com/new) and copying the data files from the Phase 2 Project Repository into your new repository. - - Recall that you can refer to the [Phase 1 Project Template](https://github.com/learn-co-curriculum/dsc-project-template) as an example structure - - This option will result in the most professional-looking portfolio repository, but can be more complicated to use. So if you are getting stuck with this option, try forking the project repository instead - -## Summary - -This is your first modeling project! Take what you have learned in Phase 2 to create a project with a more sophisticated analysis than you completed in Phase 1. You will build on these skills as we move into the predictive machine learning mindset in Phase 3. You've got this! +* To determine the key factors influencing house prices. +* To develop multilinear regression models to predict house prices based on relevant features. +* To use insights from the regression analysis to optimize pricing strategies for both purchasing and selling properties. 
From 44945a8b49c270aaa7febb792115f4b93f5d9c61 Mon Sep 17 00:00:00 2001 From: clydeochieng <107258512+clydeochieng@users.noreply.github.com> Date: Sat, 27 Apr 2024 23:50:20 +0300 Subject: [PATCH 04/42] Update student.ipynb --- student.ipynb | 38 ++++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/student.ipynb b/student.ipynb index a39a5a52..1b57076a 100644 --- a/student.ipynb +++ b/student.ipynb @@ -49,6 +49,44 @@ "* To develop multilinear regression models to predict house prices based on relevant features.\n", "* To use insights from the regression analysis to optimize pricing strategies for both purchasing and selling properties.\n" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Data Understanding:\n", + "\n", + "The real estate agency in Kingsway is analyzing a dataset to determine the factors affecting house prices. The dataset likely includes features such as property size, location, age, and market trends. Key steps include assessing data quality, exploring relationships between features and prices, and preprocessing data for multilinear regression analysis. Multilinear regression will be used to model how these features collectively influence house prices, with evaluation metrics used to assess predictive accuracy." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### King County Housing Data Columns \n", + "\n", + "* `id` - Unique identifier for a house\n", + "* `date` - Date house was sold\n", + "* `price` - Sale price (prediction target)\n", + "* `bedrooms` - Number of bedrooms\n", + "* `bathrooms` - Number of bathrooms\n", + "* `sqft_living` - Square footage of living space in the home\n", + "* `sqft_lot` - Square footage of the lot\n", + "* `floors` - Number of floors (levels) in house\n", + "* `waterfront` - Whether the house is on a waterfront\n", + "* `view` - Quality of view from house\n", + "* `condition` - How good the overall condition of the house is. 
\n", + "* `grade` - Overall grade of the house. \n", + "* `sqft_above` - Square footage of house apart from basement \n", + "* `sqft_basement` - Square footage of the basement – (Ignored)\n", + "* `yr_built` - Year when house was built\n", + "* `yr_renovated` - Year when house was renovated – (Ignored)\n", + "* `zipcode` - ZIP Code used by the United States Postal Service \n", + "* `lat` - Latitude coordinate\n", + "* `long` - Longitude coordinate\n", + "* `sqft_living15` - The square footage of interior housing living space for the nearest 15 neighbors\n", + "* `sqft_lot15` - The square footage of the land lots of the nearest 15 neighbors" + ] } ], "metadata": { From cc69bf6eaebae6771f6177b5754a3335140cdc07 Mon Sep 17 00:00:00 2001 From: clydeochieng <107258512+clydeochieng@users.noreply.github.com> Date: Sat, 27 Apr 2024 23:50:27 +0300 Subject: [PATCH 05/42] Update README.md --- README.md | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 39359350..7f581e17 100644 --- a/README.md +++ b/README.md @@ -2,12 +2,6 @@ ## Overview -A real estate agency in Kingsway seeks to determine what are the contributing factors that affect the price of houses to make improvements where necessary. They want to employ an analytical approach rather than sentimental before arriving at a decision. Multilinear regression has been used for this project to understand how various features affect their pricing to better their services. -## Business Problem +Data Understanding: -In the face of market fluctuations and heightened competition within the real estate sector, our agency is grappling with pricing volatility, which poses significant challenges for our agents in devising effective business strategies. We seek strategic guidance to optimize our purchasing and selling endeavors, prioritizing informed decision-making to identify key areas of focus that promise maximum returns on investment. 
-### Objectives - -* To determine the key factors influencing house prices. -* To develop multilinear regression models to predict house prices based on relevant features. -* To use insights from the regression analysis to optimize pricing strategies for both purchasing and selling properties. +The real estate agency in Kingsway is analyzing a dataset to determine the factors affecting house prices. The dataset likely includes features such as property size, location, age, and market trends. Key steps include assessing data quality, exploring relationships between features and prices, and preprocessing data for multilinear regression analysis. Multilinear regression will be used to model how these features collectively influence house prices, with evaluation metrics used to assess predictive accuracy. \ No newline at end of file From 69d9b53e29fe61e9d1838e8fbe203b883e713f09 Mon Sep 17 00:00:00 2001 From: clydeochieng <107258512+clydeochieng@users.noreply.github.com> Date: Sat, 27 Apr 2024 23:54:15 +0300 Subject: [PATCH 06/42] Update student.ipynb --- student.ipynb | 28 +++++++++++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/student.ipynb b/student.ipynb index 1b57076a..567e978d 100644 --- a/student.ipynb +++ b/student.ipynb @@ -87,6 +87,32 @@ "* `sqft_living15` - The square footage of interior housing living space for the nearest 15 neighbors\n", "* `sqft_lot15` - The square footage of the land lots of the nearest 15 neighbors" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Explotory Data Analyis\n", + "\n", + "Importing data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "#load necessary modules \n", + "import numpy as np\n", + "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "import scipy.stats as stats\n", + "import statsmodels.formula.api as smf\n", + "import statsmodels.stats.api as sms\n", + "import statsmodels.api as sm" + ] } ], "metadata": { @@ -105,7 +131,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.4" + "version": "3.10.13" } }, "nbformat": 4, From 47f9a238257ff095bf0edf0818244fa726a9fbe4 Mon Sep 17 00:00:00 2001 From: clydeochieng <107258512+clydeochieng@users.noreply.github.com> Date: Sun, 28 Apr 2024 18:27:02 +0300 Subject: [PATCH 07/42] Update student.ipynb --- student.ipynb | 394 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 386 insertions(+), 8 deletions(-) diff --git a/student.ipynb b/student.ipynb index 567e978d..013a05d1 100644 --- a/student.ipynb +++ b/student.ipynb @@ -99,19 +99,397 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ - "#load necessary modules \n", - "import numpy as np\n", + "#import libraries\n", "import pandas as pd\n", - "import matplotlib.pyplot as plt\n", - "%matplotlib inline\n", + "import numpy as np\n", "import scipy.stats as stats\n", - "import statsmodels.formula.api as smf\n", - "import statsmodels.stats.api as sms\n", - "import statsmodels.api as sm" + "import math\n", + "import matplotlib.pyplot as plt\n", + "from sklearn.impute import MissingIndicator\n", + "from sklearn.impute import SimpleImputer\n", + "from sklearn.linear_model import LinearRegression\n", + "from sklearn.model_selection import train_test_split\n", + "from sklearn import preprocessing" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Removing irrelvant columns" + ] + }, + { + 
"cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddatepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontview...gradesqft_abovesqft_basementyr_builtyr_renovatedzipcodelatlongsqft_living15sqft_lot15
0712930052010/13/2014221900.031.00118056501.0NaNNONE...7 Average11800.019550.09817847.5112-122.25713405650
1641410019212/9/2014538000.032.25257072422.0NONONE...7 Average2170400.019511991.09812547.7210-122.31916907639
256315004002/25/2015180000.021.00770100001.0NONONE...6 Low Average7700.01933NaN9802847.7379-122.23327208062
3248720087512/9/2014604000.043.00196050001.0NONONE...7 Average1050910.019650.09813647.5208-122.39313605000
419544005102/18/2015510000.032.00168080801.0NONONE...8 Good16800.019870.09807447.6168-122.04518007503
\n", + "

5 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + " id date price bedrooms bathrooms sqft_living \\\n", + "0 7129300520 10/13/2014 221900.0 3 1.00 1180 \n", + "1 6414100192 12/9/2014 538000.0 3 2.25 2570 \n", + "2 5631500400 2/25/2015 180000.0 2 1.00 770 \n", + "3 2487200875 12/9/2014 604000.0 4 3.00 1960 \n", + "4 1954400510 2/18/2015 510000.0 3 2.00 1680 \n", + "\n", + " sqft_lot floors waterfront view ... grade sqft_above \\\n", + "0 5650 1.0 NaN NONE ... 7 Average 1180 \n", + "1 7242 2.0 NO NONE ... 7 Average 2170 \n", + "2 10000 1.0 NO NONE ... 6 Low Average 770 \n", + "3 5000 1.0 NO NONE ... 7 Average 1050 \n", + "4 8080 1.0 NO NONE ... 8 Good 1680 \n", + "\n", + " sqft_basement yr_built yr_renovated zipcode lat long \\\n", + "0 0.0 1955 0.0 98178 47.5112 -122.257 \n", + "1 400.0 1951 1991.0 98125 47.7210 -122.319 \n", + "2 0.0 1933 NaN 98028 47.7379 -122.233 \n", + "3 910.0 1965 0.0 98136 47.5208 -122.393 \n", + "4 0.0 1987 0.0 98074 47.6168 -122.045 \n", + "\n", + " sqft_living15 sqft_lot15 \n", + "0 1340 5650 \n", + "1 1690 7639 \n", + "2 2720 8062 \n", + "3 1360 5000 \n", + "4 1800 7503 \n", + "\n", + "[5 rows x 21 columns]" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#load and preiview data\n", + "df = pd.read_csv('data/kc_house_data.csv')\n", + "df.head(5)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 21597 entries, 0 to 21596\n", + "Data columns (total 21 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 id 21597 non-null int64 \n", + " 1 date 21597 non-null object \n", + " 2 price 21597 non-null float64\n", + " 3 bedrooms 21597 non-null int64 \n", + " 4 bathrooms 21597 non-null float64\n", + " 5 sqft_living 21597 non-null int64 \n", + " 6 sqft_lot 21597 non-null int64 \n", + " 7 floors 21597 non-null float64\n", + " 8 
waterfront 19221 non-null object \n", + " 9 view 21534 non-null object \n", + " 10 condition 21597 non-null object \n", + " 11 grade 21597 non-null object \n", + " 12 sqft_above 21597 non-null int64 \n", + " 13 sqft_basement 21597 non-null object \n", + " 14 yr_built 21597 non-null int64 \n", + " 15 yr_renovated 17755 non-null float64\n", + " 16 zipcode 21597 non-null int64 \n", + " 17 lat 21597 non-null float64\n", + " 18 long 21597 non-null float64\n", + " 19 sqft_living15 21597 non-null int64 \n", + " 20 sqft_lot15 21597 non-null int64 \n", + "dtypes: float64(6), int64(9), object(6)\n", + "memory usage: 3.5+ MB\n" + ] + } + ], + "source": [ + "df.info()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Before conducting regression analysis, six non-numeric columns in the data—\"date\", \"waterfront\", \"view\", \"condition\", \"grade\", and \"sqft_basement\"—will require manipulation or removal." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Preparation" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Data Cleaning\n", + "Drop irrelevant columns, address missing values and manipulate data into desired forms" + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": {}, + "outputs": [], + "source": [ + "#drop irrelevant columns\n", + "df.drop(['id', 'date', 'zipcode', 'lat', 'long', 'yr_renovated', 'view'],\n", + " axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 17, + "metadata": {}, + "outputs": [], + "source": [ + "#fill in missing sqft_basement values\n", + "df.loc[df.sqft_basement == '?', 'sqft_basement'] = (\n", + " df[df.sqft_basement == '?'].sqft_living - \n", + " df[df.sqft_basement == '?'].sqft_above\n", + ")\n", + "\n", + "#convert into numeric\n", + "df['sqft_basement'] = df.sqft_basement.astype('float64')\n", + "\n", + "#sqft_basement is a zero inflated variable, so I convert it into \n", + "#a categorical 
variable\n", + "df['is_basement'] = df.sqft_basement.map(lambda x: 0 if x == 0 else 1)\n", + "df.drop('sqft_basement', axis=1, inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "#convert condition and grade into numeric values\n", + "df['condition'] = df.condition.map(lambda x: 0 if x=='Poor' \n", + " else (1 if x=='Fair'\n", + " else (2 if x=='Average'\n", + " else (3 if x=='Good' else 4))))\n", + "\n", + "df['grade'] = df.grade.map(lambda x: int(x[0:2]) - 3)" + ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "#convert waterfront strings to 0 and 1\n", + "df['waterfront'] = df.waterfront.map(lambda x: 0 if x==\"NO\" \n", + " else (1 if x==\"YES\" else None))\n", + "\n", + "#create new column indicating if waterfront value is missing\n", + "waterfront = df[[\"waterfront\"]]\n", + "missing_indicator = MissingIndicator()\n", + "missing_indicator.fit(waterfront)\n", + "waterfront_missing = missing_indicator.transform(waterfront)\n", + "\n", + "#add waterfront missing to dataframe and convert to binary\n", + "df['waterfront_missing'] = waterfront_missing\n", + "\n", + "df['waterfront_missing'] = df.waterfront_missing.map(lambda x: 0 if x==False\n", + " else 1)" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "#fill in missing waterfront values with median\n", + "imputer = SimpleImputer(strategy=\"median\")\n", + "\n", + "imputer.fit(waterfront)\n", + "waterfront_imputed = imputer.transform(waterfront)\n", + "\n", + "df['waterfront'] = waterfront_imputed" ] } ], From f7904674776620894a789f559ddd99ba88078fdf Mon Sep 17 00:00:00 2001 From: clydeochieng <107258512+clydeochieng@users.noreply.github.com> Date: Sun, 28 Apr 2024 18:27:52 +0300 Subject: [PATCH 08/42] Update student.ipynb --- student.ipynb | 159 ++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 
159 insertions(+) diff --git a/student.ipynb b/student.ipynb index 013a05d1..11c2db8d 100644 --- a/student.ipynb +++ b/student.ipynb @@ -491,6 +491,165 @@ "\n", "df['waterfront'] = waterfront_imputed" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Removing irrelvant columns" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idpricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontconditiongradeyr_built
07129300520221900.031.00118056501.0NaNAverage7 Average1955
16414100192538000.032.25257072422.0NOAverage7 Average1951
25631500400180000.021.00770100001.0NOAverage6 Low Average1933
32487200875604000.043.00196050001.0NOVery Good7 Average1965
41954400510510000.032.00168080801.0NOAverage8 Good1987
\n", + "
" + ], + "text/plain": [ + " id price bedrooms bathrooms sqft_living sqft_lot floors \\\n", + "0 7129300520 221900.0 3 1.00 1180 5650 1.0 \n", + "1 6414100192 538000.0 3 2.25 2570 7242 2.0 \n", + "2 5631500400 180000.0 2 1.00 770 10000 1.0 \n", + "3 2487200875 604000.0 4 3.00 1960 5000 1.0 \n", + "4 1954400510 510000.0 3 2.00 1680 8080 1.0 \n", + "\n", + " waterfront condition grade yr_built \n", + "0 NaN Average 7 Average 1955 \n", + "1 NO Average 7 Average 1951 \n", + "2 NO Average 6 Low Average 1933 \n", + "3 NO Very Good 7 Average 1965 \n", + "4 NO Average 8 Good 1987 " + ] + }, + "execution_count": 2, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# reading the csv file\n", + "# drop irrelevant columns\n", + "df = pd.read_csv('data/kc_house_data.csv').drop(['date',\n", + " 'view', \n", + " 'sqft_above', \n", + " 'sqft_basement', \n", + " 'yr_renovated',\n", + " 'zipcode', \n", + " 'lat', \n", + " 'long', \n", + " 'sqft_living15',\n", + " 'sqft_lot15'], axis = 1)\n", + "# previewing the DataFrame\n", + "df.head()" + ] } ], "metadata": { From d0245ea30107cb1beee26f9fa15cc74814b104f5 Mon Sep 17 00:00:00 2001 From: clydeochieng <107258512+clydeochieng@users.noreply.github.com> Date: Sun, 28 Apr 2024 19:54:11 +0300 Subject: [PATCH 09/42] Update student.ipynb --- student.ipynb | 341 ++++++++++++++++++++------------------------------ 1 file changed, 134 insertions(+), 207 deletions(-) diff --git a/student.ipynb b/student.ipynb index 11c2db8d..52ce98fd 100644 --- a/student.ipynb +++ b/student.ipynb @@ -54,7 +54,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Data Understanding:\n", + "### Data Understanding:\n", "\n", "The real estate agency in Kingsway is analyzing a dataset to determine the factors affecting house prices. The dataset likely includes features such as property size, location, age, and market trends. 
Key steps include assessing data quality, exploring relationships between features and prices, and preprocessing data for multilinear regression analysis. Multilinear regression will be used to model how these features collectively influence house prices, with evaluation metrics used to assess predictive accuracy." ] @@ -99,21 +99,26 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 50, "metadata": {}, "outputs": [], "source": [ - "#import libraries\n", + "#importing libraries \n", "import pandas as pd\n", "import numpy as np\n", - "import scipy.stats as stats\n", - "import math\n", - "import matplotlib.pyplot as plt\n", - "from sklearn.impute import MissingIndicator\n", - "from sklearn.impute import SimpleImputer\n", + "from matplotlib import pyplot as plt\n", + "import seaborn as sns\n", + "from sklearn import metrics\n", + "from sklearn.preprocessing import OneHotEncoder, StandardScaler\n", "from sklearn.linear_model import LinearRegression\n", + "import statsmodels.api as sm\n", + "from statsmodels.formula.api import ols\n", + "from scipy import stats\n", + "from sklearn.linear_model import LinearRegression\n", + "from sklearn import tree\n", "from sklearn.model_selection import train_test_split\n", - "from sklearn import preprocessing" + "from sklearn.tree import DecisionTreeRegressor\n", + "from sklearn.dummy import DummyRegressor" ] }, { @@ -125,7 +130,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 51, "metadata": {}, "outputs": [ { @@ -177,12 +182,12 @@ " 0\n", " 7129300520\n", " 10/13/2014\n", - " 221900.0\n", + " 221900.00000\n", " 3\n", - " 1.00\n", + " 1.00000\n", " 1180\n", " 5650\n", - " 1.0\n", + " 1.00000\n", " NaN\n", " NONE\n", " ...\n", @@ -190,10 +195,10 @@ " 1180\n", " 0.0\n", " 1955\n", - " 0.0\n", + " 0.00000\n", " 98178\n", - " 47.5112\n", - " -122.257\n", + " 47.51120\n", + " -122.25700\n", " 1340\n", " 5650\n", " \n", @@ -201,12 +206,12 @@ " 1\n", " 6414100192\n", " 12/9/2014\n", - " 
538000.0\n", + " 538000.00000\n", " 3\n", - " 2.25\n", + " 2.25000\n", " 2570\n", " 7242\n", - " 2.0\n", + " 2.00000\n", " NO\n", " NONE\n", " ...\n", @@ -214,10 +219,10 @@ " 2170\n", " 400.0\n", " 1951\n", - " 1991.0\n", + " 1991.00000\n", " 98125\n", - " 47.7210\n", - " -122.319\n", + " 47.72100\n", + " -122.31900\n", " 1690\n", " 7639\n", " \n", @@ -225,12 +230,12 @@ " 2\n", " 5631500400\n", " 2/25/2015\n", - " 180000.0\n", + " 180000.00000\n", " 2\n", - " 1.00\n", + " 1.00000\n", " 770\n", " 10000\n", - " 1.0\n", + " 1.00000\n", " NO\n", " NONE\n", " ...\n", @@ -240,8 +245,8 @@ " 1933\n", " NaN\n", " 98028\n", - " 47.7379\n", - " -122.233\n", + " 47.73790\n", + " -122.23300\n", " 2720\n", " 8062\n", " \n", @@ -249,12 +254,12 @@ " 3\n", " 2487200875\n", " 12/9/2014\n", - " 604000.0\n", + " 604000.00000\n", " 4\n", - " 3.00\n", + " 3.00000\n", " 1960\n", " 5000\n", - " 1.0\n", + " 1.00000\n", " NO\n", " NONE\n", " ...\n", @@ -262,10 +267,10 @@ " 1050\n", " 910.0\n", " 1965\n", - " 0.0\n", + " 0.00000\n", " 98136\n", - " 47.5208\n", - " -122.393\n", + " 47.52080\n", + " -122.39300\n", " 1360\n", " 5000\n", " \n", @@ -273,12 +278,12 @@ " 4\n", " 1954400510\n", " 2/18/2015\n", - " 510000.0\n", + " 510000.00000\n", " 3\n", - " 2.00\n", + " 2.00000\n", " 1680\n", " 8080\n", - " 1.0\n", + " 1.00000\n", " NO\n", " NONE\n", " ...\n", @@ -286,10 +291,10 @@ " 1680\n", " 0.0\n", " 1987\n", - " 0.0\n", + " 0.00000\n", " 98074\n", - " 47.6168\n", - " -122.045\n", + " 47.61680\n", + " -122.04500\n", " 1800\n", " 7503\n", " \n", @@ -299,26 +304,26 @@ "" ], "text/plain": [ - " id date price bedrooms bathrooms sqft_living \\\n", - "0 7129300520 10/13/2014 221900.0 3 1.00 1180 \n", - "1 6414100192 12/9/2014 538000.0 3 2.25 2570 \n", - "2 5631500400 2/25/2015 180000.0 2 1.00 770 \n", - "3 2487200875 12/9/2014 604000.0 4 3.00 1960 \n", - "4 1954400510 2/18/2015 510000.0 3 2.00 1680 \n", + " id date price bedrooms bathrooms sqft_living \\\n", + "0 7129300520 10/13/2014 221900.00000 3 
1.00000 1180 \n", + "1 6414100192 12/9/2014 538000.00000 3 2.25000 2570 \n", + "2 5631500400 2/25/2015 180000.00000 2 1.00000 770 \n", + "3 2487200875 12/9/2014 604000.00000 4 3.00000 1960 \n", + "4 1954400510 2/18/2015 510000.00000 3 2.00000 1680 \n", "\n", " sqft_lot floors waterfront view ... grade sqft_above \\\n", - "0 5650 1.0 NaN NONE ... 7 Average 1180 \n", - "1 7242 2.0 NO NONE ... 7 Average 2170 \n", - "2 10000 1.0 NO NONE ... 6 Low Average 770 \n", - "3 5000 1.0 NO NONE ... 7 Average 1050 \n", - "4 8080 1.0 NO NONE ... 8 Good 1680 \n", + "0 5650 1.00000 NaN NONE ... 7 Average 1180 \n", + "1 7242 2.00000 NO NONE ... 7 Average 2170 \n", + "2 10000 1.00000 NO NONE ... 6 Low Average 770 \n", + "3 5000 1.00000 NO NONE ... 7 Average 1050 \n", + "4 8080 1.00000 NO NONE ... 8 Good 1680 \n", "\n", - " sqft_basement yr_built yr_renovated zipcode lat long \\\n", - "0 0.0 1955 0.0 98178 47.5112 -122.257 \n", - "1 400.0 1951 1991.0 98125 47.7210 -122.319 \n", - "2 0.0 1933 NaN 98028 47.7379 -122.233 \n", - "3 910.0 1965 0.0 98136 47.5208 -122.393 \n", - "4 0.0 1987 0.0 98074 47.6168 -122.045 \n", + " sqft_basement yr_built yr_renovated zipcode lat long \\\n", + "0 0.0 1955 0.00000 98178 47.51120 -122.25700 \n", + "1 400.0 1951 1991.00000 98125 47.72100 -122.31900 \n", + "2 0.0 1933 NaN 98028 47.73790 -122.23300 \n", + "3 910.0 1965 0.00000 98136 47.52080 -122.39300 \n", + "4 0.0 1987 0.00000 98074 47.61680 -122.04500 \n", "\n", " sqft_living15 sqft_lot15 \n", "0 1340 5650 \n", @@ -330,7 +335,7 @@ "[5 rows x 21 columns]" ] }, - "execution_count": 14, + "execution_count": 51, "metadata": {}, "output_type": "execute_result" } @@ -338,12 +343,12 @@ "source": [ "#load and preiview data\n", "df = pd.read_csv('data/kc_house_data.csv')\n", - "df.head(5)\n" + "df.head()\n" ] }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 52, "metadata": {}, "outputs": [ { @@ -389,7 +394,7 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "Before conducting 
regression analysis, six non-numeric columns in the data—\"date\", \"waterfront\", \"view\", \"condition\", \"grade\", and \"sqft_basement\"—will require manipulation or removal." + "We see that waterfrront is missing about 11% of its values, year renovated is missing about 17% of its values and view is missing a few values as well. " ] }, { @@ -403,13 +408,12 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Data Cleaning\n", - "Drop irrelevant columns, address missing values and manipulate data into desired forms" + "### Data Cleaning\n" ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 53, "metadata": {}, "outputs": [], "source": [ @@ -420,7 +424,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 54, "metadata": {}, "outputs": [], "source": [ @@ -441,7 +445,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 55, "metadata": {}, "outputs": [], "source": [ @@ -456,7 +460,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 56, "metadata": {}, "outputs": [], "source": [ @@ -474,12 +478,12 @@ "df['waterfront_missing'] = waterfront_missing\n", "\n", "df['waterfront_missing'] = df.waterfront_missing.map(lambda x: 0 if x==False\n", - " else 1)" + " else 1)\n" ] }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 57, "metadata": {}, "outputs": [], "source": [ @@ -493,162 +497,85 @@ ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 58, "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "Index: 19479 entries, 0 to 21596\n", + "Data columns (total 12 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 price 19479 non-null float64\n", + " 1 bedrooms 19479 non-null float64\n", + " 2 bathrooms 19479 non-null float64\n", + " 3 sqft_living 19479 non-null float64\n", + " 4 sqft_lot 19479 non-null float64\n", + " 5 floors 
19479 non-null float64\n", + " 6 sqft_above 19479 non-null float64\n", + " 7 sqft_living15 19479 non-null float64\n", + " 8 sqft_lot15 19479 non-null float64\n", + " 9 grade_num 19479 non-null float64\n", + " 10 bed_bath_ratio 19479 non-null float64\n", + " 11 mean_price 19479 non-null float64\n", + "dtypes: float64(12)\n", + "memory usage: 1.9 MB\n" + ] + } + ], "source": [ - "##### Removing irrelvant columns" + "#removing outliers \n", + "\n", + "#make a copy of the clean dataframe \n", + "no_out = main_df.copy()\n", + "\n", + "#drop columns that we cannot use \n", + "no_out = no_out.drop(columns= ['id', 'zip_city', 'Waterfront'], axis=1)\n", + "\n", + "#change data type so that we can math \n", + "no_out = no_out.astype('float')\n", + "\n", + "#pull out the columns \n", + "columns = no_out.columns\n", + "\n", + "#for each column in the dataframe, get the mean and standard deviation \n", + "#then get the z-score for within 3 standard devaitions\n", + "for col in columns:\n", + " \n", + " mean = no_out[col].mean()\n", + " sd = no_out[col].std()\n", + " \n", + " no_out = no_out[(no_out[col] <= mean+(3*sd))]\n", + " \n", + "pd.set_option('display.float_format', lambda x: '%.5f' % x)\n", + "no_out.info()" ] }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 59, "metadata": {}, "outputs": [ { "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idpricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontconditiongradeyr_built
07129300520221900.031.00118056501.0NaNAverage7 Average1955
16414100192538000.032.25257072422.0NOAverage7 Average1951
25631500400180000.021.00770100001.0NOAverage6 Low Average1933
32487200875604000.043.00196050001.0NOVery Good7 Average1965
41954400510510000.032.00168080801.0NOAverage8 Good1987
\n", - "
" - ], + "image/png": "", "text/plain": [ - " id price bedrooms bathrooms sqft_living sqft_lot floors \\\n", - "0 7129300520 221900.0 3 1.00 1180 5650 1.0 \n", - "1 6414100192 538000.0 3 2.25 2570 7242 2.0 \n", - "2 5631500400 180000.0 2 1.00 770 10000 1.0 \n", - "3 2487200875 604000.0 4 3.00 1960 5000 1.0 \n", - "4 1954400510 510000.0 3 2.00 1680 8080 1.0 \n", - "\n", - " waterfront condition grade yr_built \n", - "0 NaN Average 7 Average 1955 \n", - "1 NO Average 7 Average 1951 \n", - "2 NO Average 6 Low Average 1933 \n", - "3 NO Very Good 7 Average 1965 \n", - "4 NO Average 8 Good 1987 " + "
" ] }, - "execution_count": 2, "metadata": {}, - "output_type": "execute_result" + "output_type": "display_data" } ], "source": [ - "# reading the csv file\n", - "# drop irrelevant columns\n", - "df = pd.read_csv('data/kc_house_data.csv').drop(['date',\n", - " 'view', \n", - " 'sqft_above', \n", - " 'sqft_basement', \n", - " 'yr_renovated',\n", - " 'zipcode', \n", - " 'lat', \n", - " 'long', \n", - " 'sqft_living15',\n", - " 'sqft_lot15'], axis = 1)\n", - "# previewing the DataFrame\n", - "df.head()" + "#visual of the clean_df in a heatmap to look at correlations\n", + "plt.figure(figsize=(20,20))\n", + "sns.heatmap(df.corr().abs(), annot=True)\n", + "plt.show()" ] } ], From a6a8120921d49b77194c6a5cf9ec8939d4bff3e7 Mon Sep 17 00:00:00 2001 From: clydeochieng <107258512+clydeochieng@users.noreply.github.com> Date: Sun, 28 Apr 2024 22:20:02 +0300 Subject: [PATCH 10/42] Update student.ipynb --- student.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/student.ipynb b/student.ipynb index 1b57076a..b3e32a00 100644 --- a/student.ipynb +++ b/student.ipynb @@ -7,7 +7,7 @@ "## Final Project Submission\n", "\n", "Please fill out:\n", - "* Student name: Solphine Joseph, Grace Rotich, Mather Rotich, Hilary Simiyu, Clyde Ochieng.\n", + "* Student name: Solphine Joseph, Grace Rotich, Mathew Kiprotich, Hilary Simiyu, Clyde Ochieng.\n", "* Student pace: full time\n", "* Scheduled project review date/time: \n", "* Instructor name: Nikita \n", From 5d27220b4b36466d5c1fd2aab29b08806fda503b Mon Sep 17 00:00:00 2001 From: clydeochieng <107258512+clydeochieng@users.noreply.github.com> Date: Mon, 29 Apr 2024 16:47:13 +0300 Subject: [PATCH 11/42] Update student.ipynb --- student.ipynb | 1655 +++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 1395 insertions(+), 260 deletions(-) diff --git a/student.ipynb b/student.ipynb index 52ce98fd..1beeaa73 100644 --- a/student.ipynb +++ b/student.ipynb @@ -7,7 +7,7 @@ "## Final Project Submission\n", "\n", "Please 
fill out:\n", - "* Student name: Solphine Joseph, Grace Rotich, Mather Rotich, Hilary Simiyu, Clyde Ochieng.\n", + "* Student name: Solphine Joseph, Grace Rotich, Mathew Kiprotich, Hilary Simiyu, Clyde Ochieng.\n", "* Student pace: full time\n", "* Scheduled project review date/time: \n", "* Instructor name: Nikita \n", @@ -59,6 +59,15 @@ "The real estate agency in Kingsway is analyzing a dataset to determine the factors affecting house prices. The dataset likely includes features such as property size, location, age, and market trends. Key steps include assessing data quality, exploring relationships between features and prices, and preprocessing data for multilinear regression analysis. Multilinear regression will be used to model how these features collectively influence house prices, with evaluation metrics used to assess predictive accuracy." ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The dataset utilized in this analysis is the King County Housing dataset, encompassing details on over 21,000 homes within King County. Each entry in the dataset includes information on various features such as bedroom/bathroom/floor counts, living space and lot square footage, zip code, building grade, condition, and more.\n", + "\n", + "The King County Housing Dataset comprises multiple features contributing to the final sale price of homes in King County. Descriptions of these features are provided below." + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -92,45 +101,1110 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Explotory Data Analyis\n", + "### Data Preparation\n", + "\n", + "Importing data." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 98, + "metadata": {}, + "outputs": [], + "source": [ + "#importing libraries \n", + "import pandas as pd\n", + "import numpy as np\n", + "from matplotlib import pyplot as plt\n", + "import seaborn as sns\n", + "from sklearn.preprocessing import OneHotEncoder, StandardScaler\n", + "from sklearn.linear_model import LinearRegression\n", + "import statsmodels.api as sm\n", + "from statsmodels.formula.api import ols\n", + "from scipy import stats\n", + "import warnings\n", + "warnings.filterwarnings('ignore')" + ] + }, + { + "cell_type": "code", + "execution_count": 99, + "metadata": {}, + "outputs": [], + "source": [ + "df = pd.read_csv('data/kc_house_data.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 100, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(21597, 21)" + ] + }, + "execution_count": 100, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.shape" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The dataset contains 21,597 houses with 21 features." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 101, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 21597 entries, 0 to 21596\n", + "Data columns (total 21 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 id 21597 non-null int64 \n", + " 1 date 21597 non-null object \n", + " 2 price 21597 non-null float64\n", + " 3 bedrooms 21597 non-null int64 \n", + " 4 bathrooms 21597 non-null float64\n", + " 5 sqft_living 21597 non-null int64 \n", + " 6 sqft_lot 21597 non-null int64 \n", + " 7 floors 21597 non-null float64\n", + " 8 waterfront 19221 non-null object \n", + " 9 view 21534 non-null object \n", + " 10 condition 21597 non-null object \n", + " 11 grade 21597 non-null object \n", + " 12 sqft_above 21597 non-null int64 \n", + " 13 sqft_basement 21597 non-null object \n", + " 14 yr_built 21597 non-null int64 \n", + " 15 yr_renovated 17755 non-null float64\n", + " 16 zipcode 21597 non-null int64 \n", + " 17 lat 21597 non-null float64\n", + " 18 long 21597 non-null float64\n", + " 19 sqft_living15 21597 non-null int64 \n", + " 20 sqft_lot15 21597 non-null int64 \n", + "dtypes: float64(6), int64(9), object(6)\n", + "memory usage: 3.5+ MB\n" + ] + } + ], + "source": [ + "df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 102, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
idpricebedroomsbathroomssqft_livingsqft_lotfloorssqft_aboveyr_builtyr_renovatedzipcodelatlongsqft_living15sqft_lot15
count21597.0000021597.0000021597.0000021597.0000021597.0000021597.0000021597.0000021597.0000021597.0000017755.0000021597.0000021597.0000021597.0000021597.0000021597.00000
mean4580474287.77099540296.573513.373202.115832080.3218515099.408761.494101788.596841970.9996883.6367898077.9518547.56009-122.213981986.6203212758.28351
std2876735715.74778367368.140100.926300.76898918.1061341412.636880.53968827.7597629.37523399.9464153.513070.138550.14072685.2304727274.44195
min1000102.0000078000.000001.000000.50000370.00000520.000001.00000370.000001900.000000.0000098001.0000047.15590-122.51900399.00000651.00000
25%2123049175.00000322000.000003.000001.750001430.000005040.000001.000001190.000001951.000000.0000098033.0000047.47110-122.328001490.000005100.00000
50%3904930410.00000450000.000003.000002.250001910.000007618.000001.500001560.000001975.000000.0000098065.0000047.57180-122.231001840.000007620.00000
75%7308900490.00000645000.000004.000002.500002550.0000010685.000002.000002210.000001997.000000.0000098118.0000047.67800-122.125002360.0000010083.00000
max9900000190.000007700000.0000033.000008.0000013540.000001651359.000003.500009410.000002015.000002015.0000098199.0000047.77760-121.315006210.00000871200.00000
\n", + "
" + ], + "text/plain": [ + " id price bedrooms bathrooms sqft_living \\\n", + "count 21597.00000 21597.00000 21597.00000 21597.00000 21597.00000 \n", + "mean 4580474287.77099 540296.57351 3.37320 2.11583 2080.32185 \n", + "std 2876735715.74778 367368.14010 0.92630 0.76898 918.10613 \n", + "min 1000102.00000 78000.00000 1.00000 0.50000 370.00000 \n", + "25% 2123049175.00000 322000.00000 3.00000 1.75000 1430.00000 \n", + "50% 3904930410.00000 450000.00000 3.00000 2.25000 1910.00000 \n", + "75% 7308900490.00000 645000.00000 4.00000 2.50000 2550.00000 \n", + "max 9900000190.00000 7700000.00000 33.00000 8.00000 13540.00000 \n", + "\n", + " sqft_lot floors sqft_above yr_built yr_renovated \\\n", + "count 21597.00000 21597.00000 21597.00000 21597.00000 17755.00000 \n", + "mean 15099.40876 1.49410 1788.59684 1970.99968 83.63678 \n", + "std 41412.63688 0.53968 827.75976 29.37523 399.94641 \n", + "min 520.00000 1.00000 370.00000 1900.00000 0.00000 \n", + "25% 5040.00000 1.00000 1190.00000 1951.00000 0.00000 \n", + "50% 7618.00000 1.50000 1560.00000 1975.00000 0.00000 \n", + "75% 10685.00000 2.00000 2210.00000 1997.00000 0.00000 \n", + "max 1651359.00000 3.50000 9410.00000 2015.00000 2015.00000 \n", + "\n", + " zipcode lat long sqft_living15 sqft_lot15 \n", + "count 21597.00000 21597.00000 21597.00000 21597.00000 21597.00000 \n", + "mean 98077.95185 47.56009 -122.21398 1986.62032 12758.28351 \n", + "std 53.51307 0.13855 0.14072 685.23047 27274.44195 \n", + "min 98001.00000 47.15590 -122.51900 399.00000 651.00000 \n", + "25% 98033.00000 47.47110 -122.32800 1490.00000 5100.00000 \n", + "50% 98065.00000 47.57180 -122.23100 1840.00000 7620.00000 \n", + "75% 98118.00000 47.67800 -122.12500 2360.00000 10083.00000 \n", + "max 98199.00000 47.77760 -121.31500 6210.00000 871200.00000 " + ] + }, + "execution_count": 102, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 103, + "metadata": 
{}, + "outputs": [ + { + "data": { + "text/plain": [ + "count 21597.00000\n", + "mean 540296.57351\n", + "std 367368.14010\n", + "min 78000.00000\n", + "25% 322000.00000\n", + "50% 450000.00000\n", + "75% 645000.00000\n", + "max 7700000.00000\n", + "Name: price, dtype: float64" + ] + }, + "execution_count": 103, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# descriptive statistics for our target price.\n", + "df['price'].describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The average price of homes in the data set is 540,297 dollars. \n", + "The prices range from 78,000 to 7,700,000 dollars and\n", + "the median house price is 450,000 dollars" + ] + }, + { + "cell_type": "code", + "execution_count": 104, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "count 21597.00000\n", + "mean 2080.32185\n", + "std 918.10613\n", + "min 370.00000\n", + "25% 1430.00000\n", + "50% 1910.00000\n", + "75% 2550.00000\n", + "max 13540.00000\n", + "Name: sqft_living, dtype: float64" + ] + }, + "execution_count": 104, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "# descriptive statistics for square footage\n", + "df['sqft_living'].describe()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The mean square footage of living space is 2,080 sq-ft, and living space ranges from 370 sq-ft to 13,540 sq-ft. The median square footage is 1,910 sq-ft." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 105, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "bedrooms\n", + "3 9824\n", + "4 6882\n", + "2 2760\n", + "5 1601\n", + "6 272\n", + "1 196\n", + "7 38\n", + "8 13\n", + "9 6\n", + "10 3\n", + "11 1\n", + "33 1\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 105, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['bedrooms'].value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The bedroom counts range from 1 bedroom to 33" + ] + }, + { + "cell_type": "code", + "execution_count": 106, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "bathrooms\n", + "2.50000 5377\n", + "1.00000 3851\n", + "1.75000 3048\n", + "2.25000 2047\n", + "2.00000 1930\n", + "1.50000 1445\n", + "2.75000 1185\n", + "3.00000 753\n", + "3.50000 731\n", + "3.25000 589\n", + "3.75000 155\n", + "4.00000 136\n", + "4.50000 100\n", + "4.25000 79\n", + "0.75000 71\n", + "4.75000 23\n", + "5.00000 21\n", + "5.25000 13\n", + "5.50000 10\n", + "1.25000 9\n", + "6.00000 6\n", + "0.50000 4\n", + "5.75000 4\n", + "6.75000 2\n", + "8.00000 2\n", + "6.25000 2\n", + "6.50000 2\n", + "7.50000 1\n", + "7.75000 1\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 106, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['bathrooms'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 107, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "floors\n", + "1.00000 10673\n", + "2.00000 8235\n", + "1.50000 1910\n", + "3.00000 611\n", + "2.50000 161\n", + "3.50000 7\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 107, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['floors'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 108, + "metadata": {}, + "outputs": [ + { + "data": { + 
"text/plain": [ + "sqft_lot\n", + "5000 358\n", + "6000 290\n", + "4000 251\n", + "7200 220\n", + "4800 119\n", + " ... \n", + "22605 1\n", + "25248 1\n", + "9934 1\n", + "9142 1\n", + "1076 1\n", + "Name: count, Length: 9776, dtype: int64" + ] + }, + "execution_count": 108, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['sqft_lot'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 109, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "count 21597.00000\n", + "mean 15099.40876\n", + "std 41412.63688\n", + "min 520.00000\n", + "25% 5040.00000\n", + "50% 7618.00000\n", + "75% 10685.00000\n", + "max 1651359.00000\n", + "Name: sqft_lot, dtype: float64" + ] + }, + "execution_count": 109, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['sqft_lot'].describe()" + ] + }, + { + "cell_type": "code", + "execution_count": 110, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "yr_built\n", + "2014 559\n", + "2006 453\n", + "2005 450\n", + "2004 433\n", + "2003 420\n", + " ... \n", + "1933 30\n", + "1901 29\n", + "1902 27\n", + "1935 24\n", + "1934 21\n", + "Name: count, Length: 116, dtype: int64" + ] + }, + "execution_count": 110, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['yr_built'].value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The year built ranges from 1900 to 2015." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 111, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "condition\n", + "Average 14020\n", + "Good 5677\n", + "Very Good 1701\n", + "Fair 170\n", + "Poor 29\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 111, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['condition'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 112, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "waterfront\n", + "NO 19075\n", + "YES 146\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 112, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['waterfront'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 113, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# examining the relationship between sqft_living and price\n", + "sns.jointplot(x='sqft_living', y='price', data=df, kind='reg')\n", + "\n", + "plt.tight_layout()" + ] + }, + { + "cell_type": "code", + "execution_count": 114, + "metadata": {}, + "outputs": [ + { + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "sns.jointplot(x='bathrooms', y='price', data=df, kind='reg')\n", + "plt.tight_layout()\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Preperation\n", + "\n", + "Data Preparation Fundamentals - Applying appropriate preprocessing and feature engineering steps to tabular data in preparation for statistical modeling\n", + "\n", + "Data Cleaning Steps\n", + "Handling Missing Values: Identify and address and missing values using techniques such as dropping or replacing data.\n", + "\n", + "Handling Non-Numeric Data: A Linear regression model needs all of the features to be numeric, not categorical. Identify the data type 'object' and address them using techniques such as ordinal or one-hot encoding.\n", + "\n", + "This notebook contains a breakdown of the step-by-step processes that we used to compile, scrub, and transform our data. It includes variations of narrowing our scope and explorations into the impacts that our different transformations have on the data." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Preprocessing with Scikit-learn\n", + "Let explore and clean our data set to prep for our Linear Regression Model.\n", + "Preprocessing Steps.\n", + "\n", + "1. Handle Missing Values\n", + "2. Convert Categorical Features into Numbers\n", + "3. 
Find and Remove Outliers" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Handling Missing Values\n", + "Below, let's check to see if there are any NaNs in our data" + ] + }, + { + "cell_type": "code", + "execution_count": 115, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id 0\n", + "date 0\n", + "price 0\n", + "bedrooms 0\n", + "bathrooms 0\n", + "sqft_living 0\n", + "sqft_lot 0\n", + "floors 0\n", + "waterfront 2376\n", + "view 63\n", + "condition 0\n", + "grade 0\n", + "sqft_above 0\n", + "sqft_basement 0\n", + "yr_built 0\n", + "yr_renovated 3842\n", + "zipcode 0\n", + "lat 0\n", + "long 0\n", + "sqft_living15 0\n", + "sqft_lot15 0\n", + "dtype: int64" + ] + }, + "execution_count": 115, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#locate missing values\n", + "df.isna().sum()" + ] + }, + { + "cell_type": "code", + "execution_count": 116, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "id 0.0\n", + "date 0.0\n", + "price 0.0\n", + "bedrooms 0.0\n", + "bathrooms 0.0\n", + "sqft_living 0.0\n", + "sqft_lot 0.0\n", + "floors 0.0\n", + "waterfront 11.00152798999861\n", + "view 0.29170718155299347\n", + "condition 0.0\n", + "grade 0.0\n", + "sqft_above 0.0\n", + "sqft_basement 0.0\n", + "yr_built 0.0\n", + "yr_renovated 17.78950780200954\n", + "zipcode 0.0\n", + "lat 0.0\n", + "long 0.0\n", + "sqft_living15 0.0\n", + "sqft_lot15 0.0\n" + ] + } + ], + "source": [ + "#dealing with missing values\n", + "for column in df.columns:\n", + " percentage_of_nan = (sum(df[column].isnull())/len(df[column])) * 100 \n", + " print(column, percentage_of_nan)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Three features have missing values: 'waterfront' (about 11%), 'yr_renovated' (about 18%) and 'view' (about 0.3%). We start with 'waterfront'. 
Let's investigate this feature to handle its missing values" + ] + }, + { + "cell_type": "code", + "execution_count": 117, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "waterfront\n", + "NO 19075\n", + "YES 146\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 117, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['waterfront'].value_counts()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see that the 'waterfront' feature only has two values, yes or no.\n", + "We therefore treat NaN values as 'NO', assuming that a missing entry means the home has no waterfront." + ] + }, + { + "cell_type": "code", + "execution_count": 118, + "metadata": {}, + "outputs": [], + "source": [ + "df['waterfront'].fillna('NO', inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 119, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "waterfront\n", + "NO 21451\n", + "YES 146\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 119, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df['waterfront'].value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 120, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "id 0\n", + "date 0\n", + "price 0\n", + "bedrooms 0\n", + "bathrooms 0\n", + "sqft_living 0\n", + "sqft_lot 0\n", + "floors 0\n", + "waterfront 0\n", + "view 63\n", + "condition 0\n", + "grade 0\n", + "sqft_above 0\n", + "sqft_basement 0\n", + "yr_built 0\n", + "yr_renovated 3842\n", + "zipcode 0\n", + "lat 0\n", + "long 0\n", + "sqft_living15 0\n", + "sqft_lot15 0\n", + "dtype: int64" + ] + }, + "execution_count": 120, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "#recheck for missing values\n", + "df.isna().sum()" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Convert Categorical Features into Numbers" + ] + 
}, + { + 
"cell_type": "markdown", + "metadata": {}, + "source": [ + "Our model would crash because some of the columns are non-numeric. Features with a numeric data type will work with our model, but these features need to be converted:\n", + "* waterfront (object)\n", + "* condition (object)\n", + "* grade (object)\n", + "\n", + "Let's inspect the value counts of the specified features:" + ] + }, + { + "cell_type": "code", + "execution_count": 121, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "waterfront\n", + "NO 21451\n", + "YES 146\n", + "Name: count, dtype: int64\n", + "\n", + "condition\n", + "Average 14020\n", + "Good 5677\n", + "Very Good 1701\n", + "Fair 170\n", + "Poor 29\n", + "Name: count, dtype: int64\n", + "\n", + "grade\n", + "7 Average 8974\n", + "8 Good 6065\n", + "9 Better 2615\n", + "6 Low Average 2038\n", + "10 Very Good 1134\n", + "11 Excellent 399\n", + "5 Fair 242\n", + "12 Luxury 89\n", + "4 Low 27\n", + "13 Mansion 13\n", + "3 Poor 1\n", + "Name: count, dtype: int64\n" + ] + } + ], + "source": [ + "print(df['waterfront'].value_counts())\n", + "print()\n", + "print(df['condition'].value_counts())\n", + "print()\n", + "print(df['grade'].value_counts())" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Split function to seperate the numeric value of 'grade'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The Grade feature is an object data type however the numeric grade is listed in the front. We will use a simple string split function to isolate the numeric part of the feature.\n", "\n", - "Importing data." + "Waterfront has only 2 categories and can be converted into binary in place, whereas Condition has more than 2 categories and will need to be expanded into multiple columns." 
] }, { "cell_type": "code", - "execution_count": 50, + "execution_count": 122, "metadata": {}, "outputs": [], "source": [ - "#importing libraries \n", - "import pandas as pd\n", - "import numpy as np\n", - "from matplotlib import pyplot as plt\n", - "import seaborn as sns\n", - "from sklearn import metrics\n", - "from sklearn.preprocessing import OneHotEncoder, StandardScaler\n", - "from sklearn.linear_model import LinearRegression\n", - "import statsmodels.api as sm\n", - "from statsmodels.formula.api import ols\n", - "from scipy import stats\n", - "from sklearn.linear_model import LinearRegression\n", - "from sklearn import tree\n", - "from sklearn.model_selection import train_test_split\n", - "from sklearn.tree import DecisionTreeRegressor\n", - "from sklearn.dummy import DummyRegressor" + "df = df.assign(grade=df.grade.str.split(' ')).explode('grade')" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 123, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "False 46366\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 123, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.duplicated().value_counts()" + ] + }, + { + "cell_type": "code", + "execution_count": 124, + "metadata": {}, + "outputs": [], + "source": [ + "df = df.drop_duplicates()" + ] + }, + { + "cell_type": "code", + "execution_count": 125, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(46366, 21)" + ] + }, + "execution_count": 125, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 126, "metadata": {}, + "outputs": [], "source": [ - "##### Removing irrelvant columns" + "df = df.drop_duplicates(subset='id')" ] }, { "cell_type": "code", - "execution_count": 51, + "execution_count": 127, "metadata": {}, "outputs": [ { @@ -188,10 +1262,10 @@ " 1180\n", " 5650\n", " 1.00000\n", - " NaN\n", + " NO\n", 
" NONE\n", " ...\n", - " 7 Average\n", + " 7\n", " 1180\n", " 0.0\n", " 1955\n", @@ -215,7 +1289,7 @@ " NO\n", " NONE\n", " ...\n", - " 7 Average\n", + " 7\n", " 2170\n", " 400.0\n", " 1951\n", @@ -227,30 +1301,6 @@ " 7639\n", " \n", " \n", - " 2\n", - " 5631500400\n", - " 2/25/2015\n", - " 180000.00000\n", - " 2\n", - " 1.00000\n", - " 770\n", - " 10000\n", - " 1.00000\n", - " NO\n", - " NONE\n", - " ...\n", - " 6 Low Average\n", - " 770\n", - " 0.0\n", - " 1933\n", - " NaN\n", - " 98028\n", - " 47.73790\n", - " -122.23300\n", - " 2720\n", - " 8062\n", - " \n", - " \n", " 3\n", " 2487200875\n", " 12/9/2014\n", @@ -263,7 +1313,7 @@ " NO\n", " NONE\n", " ...\n", - " 7 Average\n", + " 7\n", " 1050\n", " 910.0\n", " 1965\n", @@ -287,7 +1337,7 @@ " NO\n", " NONE\n", " ...\n", - " 8 Good\n", + " 8\n", " 1680\n", " 0.0\n", " 1987\n", @@ -298,57 +1348,247 @@ " 1800\n", " 7503\n", " \n", + " \n", + " 5\n", + " 7237550310\n", + " 5/12/2014\n", + " 1230000.00000\n", + " 4\n", + " 4.50000\n", + " 5420\n", + " 101930\n", + " 1.00000\n", + " NO\n", + " NONE\n", + " ...\n", + " 11\n", + " 3890\n", + " 1530.0\n", + " 2001\n", + " 0.00000\n", + " 98053\n", + " 47.65610\n", + " -122.00500\n", + " 4760\n", + " 101930\n", + " \n", + " \n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " ...\n", + " \n", + " \n", + " 21592\n", + " 263000018\n", + " 5/21/2014\n", + " 360000.00000\n", + " 3\n", + " 2.50000\n", + " 1530\n", + " 1131\n", + " 3.00000\n", + " NO\n", + " NONE\n", + " ...\n", + " 8\n", + " 1530\n", + " 0.0\n", + " 2009\n", + " 0.00000\n", + " 98103\n", + " 47.69930\n", + " -122.34600\n", + " 1530\n", + " 1509\n", + " \n", + " \n", + " 21593\n", + " 6600060120\n", + " 2/23/2015\n", + " 400000.00000\n", + " 4\n", + " 2.50000\n", + " 2310\n", + " 5813\n", + " 2.00000\n", + " 
NO\n", + " NONE\n", + " ...\n", + " 8\n", + " 2310\n", + " 0.0\n", + " 2014\n", + " 0.00000\n", + " 98146\n", + " 47.51070\n", + " -122.36200\n", + " 1830\n", + " 7200\n", + " \n", + " \n", + " 21594\n", + " 1523300141\n", + " 6/23/2014\n", + " 402101.00000\n", + " 2\n", + " 0.75000\n", + " 1020\n", + " 1350\n", + " 2.00000\n", + " NO\n", + " NONE\n", + " ...\n", + " 7\n", + " 1020\n", + " 0.0\n", + " 2009\n", + " 0.00000\n", + " 98144\n", + " 47.59440\n", + " -122.29900\n", + " 1020\n", + " 2007\n", + " \n", + " \n", + " 21595\n", + " 291310100\n", + " 1/16/2015\n", + " 400000.00000\n", + " 3\n", + " 2.50000\n", + " 1600\n", + " 2388\n", + " 2.00000\n", + " NO\n", + " NONE\n", + " ...\n", + " 8\n", + " 1600\n", + " 0.0\n", + " 2004\n", + " 0.00000\n", + " 98027\n", + " 47.53450\n", + " -122.06900\n", + " 1410\n", + " 1287\n", + " \n", + " \n", + " 21596\n", + " 1523300157\n", + " 10/15/2014\n", + " 325000.00000\n", + " 2\n", + " 0.75000\n", + " 1020\n", + " 1076\n", + " 2.00000\n", + " NO\n", + " NONE\n", + " ...\n", + " 7\n", + " 1020\n", + " 0.0\n", + " 2008\n", + " 0.00000\n", + " 98144\n", + " 47.59410\n", + " -122.29900\n", + " 1020\n", + " 1357\n", + " \n", " \n", "\n", - "

5 rows × 21 columns

\n", + "

17565 rows × 21 columns

\n", "" ], "text/plain": [ - " id date price bedrooms bathrooms sqft_living \\\n", - "0 7129300520 10/13/2014 221900.00000 3 1.00000 1180 \n", - "1 6414100192 12/9/2014 538000.00000 3 2.25000 2570 \n", - "2 5631500400 2/25/2015 180000.00000 2 1.00000 770 \n", - "3 2487200875 12/9/2014 604000.00000 4 3.00000 1960 \n", - "4 1954400510 2/18/2015 510000.00000 3 2.00000 1680 \n", + " id date price bedrooms bathrooms sqft_living \\\n", + "0 7129300520 10/13/2014 221900.00000 3 1.00000 1180 \n", + "1 6414100192 12/9/2014 538000.00000 3 2.25000 2570 \n", + "3 2487200875 12/9/2014 604000.00000 4 3.00000 1960 \n", + "4 1954400510 2/18/2015 510000.00000 3 2.00000 1680 \n", + "5 7237550310 5/12/2014 1230000.00000 4 4.50000 5420 \n", + "... ... ... ... ... ... ... \n", + "21592 263000018 5/21/2014 360000.00000 3 2.50000 1530 \n", + "21593 6600060120 2/23/2015 400000.00000 4 2.50000 2310 \n", + "21594 1523300141 6/23/2014 402101.00000 2 0.75000 1020 \n", + "21595 291310100 1/16/2015 400000.00000 3 2.50000 1600 \n", + "21596 1523300157 10/15/2014 325000.00000 2 0.75000 1020 \n", "\n", - " sqft_lot floors waterfront view ... grade sqft_above \\\n", - "0 5650 1.00000 NaN NONE ... 7 Average 1180 \n", - "1 7242 2.00000 NO NONE ... 7 Average 2170 \n", - "2 10000 1.00000 NO NONE ... 6 Low Average 770 \n", - "3 5000 1.00000 NO NONE ... 7 Average 1050 \n", - "4 8080 1.00000 NO NONE ... 8 Good 1680 \n", + " sqft_lot floors waterfront view ... grade sqft_above sqft_basement \\\n", + "0 5650 1.00000 NO NONE ... 7 1180 0.0 \n", + "1 7242 2.00000 NO NONE ... 7 2170 400.0 \n", + "3 5000 1.00000 NO NONE ... 7 1050 910.0 \n", + "4 8080 1.00000 NO NONE ... 8 1680 0.0 \n", + "5 101930 1.00000 NO NONE ... 11 3890 1530.0 \n", + "... ... ... ... ... ... ... ... ... \n", + "21592 1131 3.00000 NO NONE ... 8 1530 0.0 \n", + "21593 5813 2.00000 NO NONE ... 8 2310 0.0 \n", + "21594 1350 2.00000 NO NONE ... 7 1020 0.0 \n", + "21595 2388 2.00000 NO NONE ... 8 1600 0.0 \n", + "21596 1076 2.00000 NO NONE ... 
7 1020 0.0 \n", "\n", - " sqft_basement yr_built yr_renovated zipcode lat long \\\n", - "0 0.0 1955 0.00000 98178 47.51120 -122.25700 \n", - "1 400.0 1951 1991.00000 98125 47.72100 -122.31900 \n", - "2 0.0 1933 NaN 98028 47.73790 -122.23300 \n", - "3 910.0 1965 0.00000 98136 47.52080 -122.39300 \n", - "4 0.0 1987 0.00000 98074 47.61680 -122.04500 \n", + " yr_built yr_renovated zipcode lat long sqft_living15 \\\n", + "0 1955 0.00000 98178 47.51120 -122.25700 1340 \n", + "1 1951 1991.00000 98125 47.72100 -122.31900 1690 \n", + "3 1965 0.00000 98136 47.52080 -122.39300 1360 \n", + "4 1987 0.00000 98074 47.61680 -122.04500 1800 \n", + "5 2001 0.00000 98053 47.65610 -122.00500 4760 \n", + "... ... ... ... ... ... ... \n", + "21592 2009 0.00000 98103 47.69930 -122.34600 1530 \n", + "21593 2014 0.00000 98146 47.51070 -122.36200 1830 \n", + "21594 2009 0.00000 98144 47.59440 -122.29900 1020 \n", + "21595 2004 0.00000 98027 47.53450 -122.06900 1410 \n", + "21596 2008 0.00000 98144 47.59410 -122.29900 1020 \n", "\n", - " sqft_living15 sqft_lot15 \n", - "0 1340 5650 \n", - "1 1690 7639 \n", - "2 2720 8062 \n", - "3 1360 5000 \n", - "4 1800 7503 \n", + " sqft_lot15 \n", + "0 5650 \n", + "1 7639 \n", + "3 5000 \n", + "4 7503 \n", + "5 101930 \n", + "... ... 
\n", + "21592 1509 \n", + "21593 7200 \n", + "21594 2007 \n", + "21595 1287 \n", + "21596 1357 \n", "\n", - "[5 rows x 21 columns]" + "[17565 rows x 21 columns]" ] }, - "execution_count": 51, + "execution_count": 127, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "#load and preiview data\n", - "df = pd.read_csv('data/kc_house_data.csv')\n", - "df.head()\n" + "df.dropna()" ] }, { "cell_type": "code", - "execution_count": 52, + "execution_count": 128, "metadata": {}, "outputs": [ { @@ -356,33 +1596,33 @@ "output_type": "stream", "text": [ "\n", - "RangeIndex: 21597 entries, 0 to 21596\n", + "Index: 21420 entries, 0 to 21596\n", "Data columns (total 21 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", - " 0 id 21597 non-null int64 \n", - " 1 date 21597 non-null object \n", - " 2 price 21597 non-null float64\n", - " 3 bedrooms 21597 non-null int64 \n", - " 4 bathrooms 21597 non-null float64\n", - " 5 sqft_living 21597 non-null int64 \n", - " 6 sqft_lot 21597 non-null int64 \n", - " 7 floors 21597 non-null float64\n", - " 8 waterfront 19221 non-null object \n", - " 9 view 21534 non-null object \n", - " 10 condition 21597 non-null object \n", - " 11 grade 21597 non-null object \n", - " 12 sqft_above 21597 non-null int64 \n", - " 13 sqft_basement 21597 non-null object \n", - " 14 yr_built 21597 non-null int64 \n", - " 15 yr_renovated 17755 non-null float64\n", - " 16 zipcode 21597 non-null int64 \n", - " 17 lat 21597 non-null float64\n", - " 18 long 21597 non-null float64\n", - " 19 sqft_living15 21597 non-null int64 \n", - " 20 sqft_lot15 21597 non-null int64 \n", + " 0 id 21420 non-null int64 \n", + " 1 date 21420 non-null object \n", + " 2 price 21420 non-null float64\n", + " 3 bedrooms 21420 non-null int64 \n", + " 4 bathrooms 21420 non-null float64\n", + " 5 sqft_living 21420 non-null int64 \n", + " 6 sqft_lot 21420 non-null int64 \n", + " 7 floors 21420 non-null float64\n", + " 8 waterfront 21420 non-null 
object \n", + " 9 view 21357 non-null object \n", + " 10 condition 21420 non-null object \n", + " 11 grade 21420 non-null object \n", + " 12 sqft_above 21420 non-null int64 \n", + " 13 sqft_basement 21420 non-null object \n", + " 14 yr_built 21420 non-null int64 \n", + " 15 yr_renovated 17616 non-null float64\n", + " 16 zipcode 21420 non-null int64 \n", + " 17 lat 21420 non-null float64\n", + " 18 long 21420 non-null float64\n", + " 19 sqft_living15 21420 non-null int64 \n", + " 20 sqft_lot15 21420 non-null int64 \n", "dtypes: float64(6), int64(9), object(6)\n", - "memory usage: 3.5+ MB\n" + "memory usage: 3.6+ MB\n" ] } ], @@ -390,181 +1630,65 @@ "df.info()" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We see that waterfrront is missing about 11% of its values, year renovated is missing about 17% of its values and view is missing a few values as well. " - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Data Preparation" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Data Cleaning\n" - ] - }, - { - "cell_type": "code", - "execution_count": 53, - "metadata": {}, - "outputs": [], - "source": [ - "#drop irrelevant columns\n", - "df.drop(['id', 'date', 'zipcode', 'lat', 'long', 'yr_renovated', 'view'],\n", - " axis=1, inplace=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 54, - "metadata": {}, - "outputs": [], - "source": [ - "#fill in missing sqft_basement values\n", - "df.loc[df.sqft_basement == '?', 'sqft_basement'] = (\n", - " df[df.sqft_basement == '?'].sqft_living - \n", - " df[df.sqft_basement == '?'].sqft_above\n", - ")\n", - "\n", - "#convert into numeric\n", - "df['sqft_basement'] = df.sqft_basement.astype('float64')\n", - "\n", - "#sqft_basement is a zero inflated variable, so I convert it into \n", - "#a categorical variable\n", - "df['is_basement'] = df.sqft_basement.map(lambda x: 0 if x == 0 else 1)\n", - "df.drop('sqft_basement', axis=1, inplace=True)" 
- ] - }, { "cell_type": "code", - "execution_count": 55, + "execution_count": 129, "metadata": {}, - "outputs": [], + "outputs": [ + { + "data": { + "text/plain": [ + "grade\n", + "7 8889\n", + "8 6041\n", + "9 2606\n", + "6 1995\n", + "10 1130\n", + "11 396\n", + "5 234\n", + "12 88\n", + "4 27\n", + "13 13\n", + "3 1\n", + "Name: count, dtype: int64" + ] + }, + "execution_count": 129, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "#convert condition and grade into numeric values\n", - "df['condition'] = df.condition.map(lambda x: 0 if x=='Poor' \n", - " else (1 if x=='Fair'\n", - " else (2 if x=='Average'\n", - " else (3 if x=='Good' else 4))))\n", - "\n", - "df['grade'] = df.grade.map(lambda x: int(x[0:2]) - 3)" + "df['grade'].value_counts()" ] }, { - "cell_type": "code", - "execution_count": 56, + "cell_type": "markdown", "metadata": {}, - "outputs": [], "source": [ - "#convert waterfront strings to 0 and 1\n", - "df['waterfront'] = df.waterfront.map(lambda x: 0 if x==\"NO\" \n", - " else (1 if x==\"YES\" else None))\n", - "\n", - "#create new column indicating if waterfront value is missing\n", - "waterfront = df[[\"waterfront\"]]\n", - "missing_indicator = MissingIndicator()\n", - "missing_indicator.fit(waterfront)\n", - "waterfront_missing = missing_indicator.transform(waterfront)\n", - "\n", - "#add waterfront missing to dataframe and convert to binary\n", - "df['waterfront_missing'] = waterfront_missing\n", - "\n", - "df['waterfront_missing'] = df.waterfront_missing.map(lambda x: 0 if x==False\n", - " else 1)\n" + "The most common building grade is 7" ] }, { "cell_type": "code", - "execution_count": 57, + "execution_count": 130, "metadata": {}, "outputs": [], "source": [ - "#fill in missing waterfront values with median\n", - "imputer = SimpleImputer(strategy=\"median\")\n", - "\n", - "imputer.fit(waterfront)\n", - "waterfront_imputed = imputer.transform(waterfront)\n", - "\n", - "df['waterfront'] = waterfront_imputed" - ] - 
}, - { - "cell_type": "code", - "execution_count": 58, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "Index: 19479 entries, 0 to 21596\n", - "Data columns (total 12 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 price 19479 non-null float64\n", - " 1 bedrooms 19479 non-null float64\n", - " 2 bathrooms 19479 non-null float64\n", - " 3 sqft_living 19479 non-null float64\n", - " 4 sqft_lot 19479 non-null float64\n", - " 5 floors 19479 non-null float64\n", - " 6 sqft_above 19479 non-null float64\n", - " 7 sqft_living15 19479 non-null float64\n", - " 8 sqft_lot15 19479 non-null float64\n", - " 9 grade_num 19479 non-null float64\n", - " 10 bed_bath_ratio 19479 non-null float64\n", - " 11 mean_price 19479 non-null float64\n", - "dtypes: float64(12)\n", - "memory usage: 1.9 MB\n" - ] - } - ], - "source": [ - "#removing outliers \n", - "\n", - "#make a copy of the clean dataframe \n", - "no_out = main_df.copy()\n", - "\n", - "#drop columns that we cannot use \n", - "no_out = no_out.drop(columns= ['id', 'zip_city', 'Waterfront'], axis=1)\n", - "\n", - "#change data type so that we can math \n", - "no_out = no_out.astype('float')\n", - "\n", - "#pull out the columns \n", - "columns = no_out.columns\n", - "\n", - "#for each column in the dataframe, get the mean and standard deviation \n", - "#then get the z-score for within 3 standard devaitions\n", - "for col in columns:\n", - " \n", - " mean = no_out[col].mean()\n", - " sd = no_out[col].std()\n", - " \n", - " no_out = no_out[(no_out[col] <= mean+(3*sd))]\n", - " \n", - "pd.set_option('display.float_format', lambda x: '%.5f' % x)\n", - "no_out.info()" + "# Change the data type from object to int.\n", + "df['grade'] = df['grade'].astype(int)" ] }, { "cell_type": "code", - "execution_count": 59, + "execution_count": 131, "metadata": {}, "outputs": [ { "data": { - "image/png": "", + "image/png": 
"iVBORw0KGgoAAAANSUhEUgAAB+8AAAUmCAYAAABXlPbhAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguMCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy81sbWrAAAACXBIWXMAAA9hAAAPYQGoP6dpAAEAAElEQVR4nOzdd5hV1fk/7GeGjoBUkY4lYkOKoKKo2Bsqiho72BskGHu+lkQ0okk0KkYUC9g7CNg1dkGxoCgqxUKVIr0PzHn/8IWfyjnDzJmyGbjv6/LKZK29136mnH2G+ey1Vk4qlUoFAAAAAAAAAJCY3KQLAAAAAAAAAIBNnfAeAAAAAAAAABImvAcAAAAAAACAhAnvAQAAAAAAACBhwnsAAAAAAAAASJjwHgAAAAAAAAASJrwHAAAAAAAAgIQJ7wEAAAAAAAAgYcJ7AAAAyr2VK1fGqlWrki4DAAAAIGsVky4AAACSdOedd0b//v3T9r3xxhvRtGnTUr3+1KlT44ADDkjb16tXr+jdu3epXh/Ki1QqFV999VWMHTs2Jk6cGBMmTIhp06bF4sWLY8mSJZGXlxcREZUqVYqaNWtGo0aNomnTptGqVato3759tGnTJqpXr57wZ0FhZLovH3PMMdGvX78EKoLiK+j9PhsVKlSI3NzcqFKlSlSvXj0233zzqFevXjRr1iz+8Ic/RNu2bWPnnXeOChUqlNg1S5rXOgAArEt4DwAAwAYplUrFu+++G6+++mq89dZbMXv27PWek5eXF3Pnzo25c+fGV199Fa+88kpERFSpUiX22WefOPzww+Pggw+OihX9cxgov1avXh2rV6+OvLy8WLx4ccyaNSsmTJgQo0aNWntMnTp14pBDDonTTz89ttlmmwSrBQAACstfKwAAANigrFixIoYOHRqDBg2K7777rsTGfO211+K1116LRo0aRc+ePeOUU06JSpUqlcj4ABuaefPmxRNPPBFPPvlkdO/ePS677LKoXbt20mUBAAAFsOc9AAAAG4x33nknDjvssLj22mtLLLj/vRkzZsRNN90URx11VHz00Uelcg2ADUUqlYpnnnkmunfvHuPHj0+6HAAAoADCewAAABK3aNGiuPTSS+Occ86JadOmlck1v/vuu+jZs2c8+OCDZXI9gCRNnTo1evToEZMmTUq6FAAAIAPL5gMAAJComTNnxtlnn12oGaEtW7aM3XffPdq1axctWrSIJk2axGabbRbVqlWLvLy8WLJkSUydOjUmTZoUo0ePjnfeeSfmzJmTcbzVq1dHv379Yv78+XHxxReX5KcFsMGZO3duXHTRRTFkyJCoVq1a0uUAAAC/I7wHAAAgMd9//32cddZZBc62r1SpUnTr1i1OPPHE2HnnnTMeV6FChahatWrUq1cv2rRpE8cee2ysXr063nnnnbjnnnvis88+y3jugAEDYosttohTTjmlWJ8PQFH16tUrevfuXaRzUqlU5OXlxapVq2Lp0qXx888/x7Rp0+KTTz6JYcOGxaxZszKe+/3338cdd9wRV1xxRXFLBwAASpjwHgAAgEQsWLAgzj333AKD+4MPPjj++te/RqNGjbK6RoUKFWK//faL/fbbL1566aXo27dv/Pzzz2mP7devX3To0CFatWqV1bUAykpOTk5Urlw5KleuHNWrV4/69etHq1atYv/9948//elPcffdd8eAAQMilUqlPf/hhx+OU045JZo2bVrGlf8/vXv3LvJDCwAAsLGz5z0AAABlLj8/P/7yl7/E5MmT0/ZXq1YtbrnllrjzzjuzDu5/77DDDotnn302tttuu7T9K1eujGuvvbZErgWQlCpVqkSfPn2iX79+GY/Jy8uLRx99tAyrAgAACkN4DwAAQJkbMGBAvPfee2n7Nttss7j//vvj6KOPLvHrNmrUKB544IFo0qRJ2v4xY8bEyy+/XOLXBShr3bp1i549e2bsHzJkSKxevbrsCgIAANZLeA8AAECZmjFjRtx7771p+3J
ycuLWW2+NXXfdtdSu36BBg7jlllsiJycnbf/AgQNL7doAZal3795Rt27dtH3z5s2LL774oowrAgAACiK8BwAAoEz985//jGXLlqXtO+ecc6JLly6lXkOHDh3iqKOOStv35ZdfxtixY0u9BoDSVqNGjejWrVvG/jFjxpRZLQAAwPoJ7wEAACgzX3/9dbzwwgtp+3baaafo06dPmdXSq1evyM1N/8/iYcOGlVkdAKWpc+fOGfu+++67MqwEAABYn4pJFwAAAJS+2bNnx4cffhgffvhhjB8/PubOnRvz5s2LZcuWRdWqVaNOnTrRvHnz2GmnnWK33XaLPfbYIypVqpR02WUulUrFF198ER9//HF8/vnnMXHixFi4cGEsWLAgUqlUVK1aNerVqxeNGzeOVq1aRdu2baNz585Ro0aNUq1rxYoV8e6778bo0aPj66+/jilTpsT8+fNjxYoVUaVKlahVq1Y0b948dtxxx9hjjz1ir732isqVK5dqTdl65JFHMvZdfPHFUaFChTKrpXnz5rHXXnvFu+++u07fG2+8Ef/3f/9XYteaM2dOPPLII/Hmm2/Gjz/+GKlUKurXrx9t2rSJ/fbbLw455JCsvmfjx4+Pjz/+OL788suYPHlyzJgxIxYsWBDLly+P/Pz8qFy5clSvXj222GKLaNy4cey4447RoUOH6NChQ1SsWHp/Epg5c2a88sorMWbMmBg3blzMmzcvFi9eHJUrV44tttgiWrVqFXvvvXccfPDBsfnmm5daHZl89dVX8f7778fnn38eP/74Y8ycOXPtahCbbbZZNGnSJLbeeuvo2LFjdOnSJRo2bFjmNZZ3K1asiFGjRsWYMWPiiy++iKlTp8aCBQti0aJFUaFChahWrVpsscUW0aRJk9h5552jffv20bFjx03yvae0bbfddhn7fv755xK5xptvvhnPPPNMfP755zF//vyoUaNGtGzZMnbfffc4+uijY+utty6R6xTF6tWr46OPPoqRI0fGuHHj4vvvv4+FCxfG0qVLY7PNNotatWpFgwYNok2bNtG2bdvYZ599onr16qVSy+LFi+Ptt9+Ojz/+OL799tuYOnVqLFq0aO37eJ06daJFixbRunXr6NSpU+y2225l+n4IAMCGIyeVSqWSLgIAAJJy5513Rv/+/dP2vfHGG9G0adNSvf7UqVPjgAMOSNvXq1ev6N27d7HGHz16dNx7773xzjvvFOm82rVrR/fu3aNnz56xxRZbZH390047LT766KN12m+66aY49thjsx43IuLKK6+MIUOGrNOezddtyZIl8eijj8bjjz8e06dPL9K5lSpVii5dukTPnj2jQ4cORTp3faZPnx733XdfPP/887F48eJCn7fm+3f22Wdn3Os4CQsWLIh999037ZL5HTp0iEcffbTMa3rxxRfj4osvXvv/K1asGG3atInOnTvH2Wefvd5A/cMPP4zTTz99nfYHH3ww9txzz4iIeOWVV+Kqq66KJUuWZBynbt26ceGFF8ZJJ5203lB91qxZ8eSTT8bQoUNj6tSpBR6bSZ06deK4446Lnj17Rv369bMaI51vvvkm/vOf/8Q777wTq1evXu/x1atXjx49esSFF1649mud6b58zDHHRL9+/bKubeXKlfHcc8/F4MGDizTbODc3Nzp37hwXXHBBtG/fPuvrF9egQYPipptuWqe9WrVq8f7778dmm21WItdZsGBB7LXXXpGXl7dO31NPPRVt2rQp8Pxp06bFfffdF8OGDSvSfSsiYvPNN48jjzwyevToEc2bNy/SuRu60n6/L8jKlSujdevWafv23HPPePDBBzOem+59vH379vH4449HRMTChQvjkksuWe/vGV26dInLL788ttlmm7VtpfVa//nnn2PQoEHxzDPPxNy5cwt9Xs2aNaNbt25x5plnRuPGjbO+/q9NnDgx7rnnnnjllVdixYoVhT6vQYMGceqpp8app55a6g8IAgCwYbFsPgAAbIQmT54cp5x
ySpx66qlFDu4jIubPnx/3339/HHLIITFw4MBChXDl1ciRI+PQQw+Nf//730UO7iMi8vLy4rXXXotTTjklLrzwwpg5c2axa8rLy4v+/fvHIYccEo8++miRA7A137+DDz44nnzyyWLXU1KGDx+eca/70047rYyr+cW+++4b22yzTfzxj3+M/v37x4cffhiPPfbYb8Lk4nj55ZejT58+BQb3ERFz586NG264IX788ceMxyxbtixuvfXWOOCAA6J///5ZB/cREfPmzYuBAwfGwQcfHI899ljW46yxcuXKuP766+OYY46JN998s9D3jKVLl8bdd98d3bt3j6+//rrYdWQycuTI6Nq1a1x33XVFXiY8Pz8/3nnnnTjppJPi0ksvjXnz5pVSlQU76qij0s5KX7ZsWbz22msldp0XX3wxbXC/zTbbrDe4Hzx4cBx22GHx2GOPFfm+FfHLgwOPPPJIHH744fHPf/4zVq5cWeQxWFdBX8eaNWtmPe6yZcvinHPOKdTvGW+99VbGhyVLyqpVq+K///1v7L///nHvvfcWKbiPiFi0aFE8/PDD0bVr13jssceiOPOdlixZEjfccEMceeSRMWzYsCIF9xG/rJh02223xaGHHhqvv/561nUAAFD+CO8BAGAj8/zzz0e3bt3i448/LvZYS5cujX/9619x5plnFvmP4OXBCy+8EOecc07MmjWrRMZ744034rjjjotvvvkm6zFmzZoVp512Wtx5553FDq4WLVoU1157bVxyySWxfPnyYo1VEv73v/+lba9Ro0bst99+ZVzNLzbbbLN48cUX4/rrr4+DDjqoRGc4Tpw4Ma644orIz88v1PG77rrrb2al/trkyZPjhBNOiHvuuadEA80lS5bE3//+97jxxhuzHmPWrFlx8sknx6OPPlroz/X3xo8fHyeffHKMGTMm6zrSSaVS0b9//zjjjDMKfDCisIYPHx7du3cv1ms8W3Xr1s34Ohk+fHiJXef5559P217QaimpVCpuvPHG+Mc//lHkkDKdvLy8uO++++L000/P6iEAfqug9+86depkPe7f/va3Ir1mTzjhhKyvtT4//fRTnHDCCXH77bcX+/1uzX3xqquuyuqe9t1338Vxxx0XDz/8cNb3xDVmz54dF110UfTr12+jfpASAID/R3gPAAAbkdtuuy0uv/zy9c7yLapRo0bFySefHD/99FOJjpukkSNHxiWXXJJ2hmlxzJo1K84999ys9hGeNm1anHzyyfHZZ5+VaE0jRoyI8847L9EAf9myZTF69Oi0fV26dIkqVaqUcUWla9WqVXHppZcW6WueKdiaNm1anHbaaTF+/PiSKm8dDz30UFazt+fPnx9nnnlmjB07ttg1LF26NM4777yYOHFisceK+CVQvvbaa+POO+8s1gza35s2bVqceuqp8cUXX5TYmIXVvXv3tO0jR46M2bNnF3v8yZMnp73/VKhQIY4++uiM591xxx3x0EMPFfv6v/fZZ5/Fn/70pxL9/m2Kvvzyy4x99erVy2rMV155JYYOHVro45s3bx577LFHVtdanylTpsTJJ58cX331VYmOO2TIkLj88suLdM7XX38dJ598cpFX+FifBx98sEgPgwEAUH4VvJEeAABQbtxzzz0xYMCAAo9p27ZtHHzwwdG+ffto0KBBVK9ePebMmROTJ0+ON954I15//fVYuHBh2nO///77OPPMM+OJJ56IWrVqlcanUGYWLVoUV111VcZAaMcdd4xDDz00dtppp2jevHlsttlmUalSpVi0aFFMnTo1xowZE0OHDs34x/mZM2fGLbfcEjfffHOha1qwYEGceeaZMWXKlIzHNGjQIA455JDYZ599omnTplGvXr1YuHBhzJw5M0aNGhUjRoyIH374Ie25o0aNir/85S9x1113RU5OTqHrKikffvhhxhnjHTt2LONqSt/DDz9cpGXgN9988zjssMPWaV+9enX06dOnwAdn2rVrFwceeGBsv/326/y8zp0
7N77++usYPXr0evdcvvHGG2PvvfeOqlWrFqrmVCoVvXv3jgkTJmQ8pkmTJnHUUUfF3nvvHVtuuWVUrVo1fvrppxg3blwMGzYsRo8e/ZvX4fz58+Pll18u1PXX5+abb46nnnoqY3+FChViv/32iy5dusTOO++8NsScOXNmjB8/Pl588cUYNWpUrFq1ap1zFy1aFGeffXY888wzZbo3+9577x0NGjRYJ6hfvXp1vPDCC9GzZ89ijZ9p1v0+++wTDRo0SNv3xRdfxD333JO2Lzc3Nzp37hz7779//OEPf4jGjRtHtWrVIuKXr+HEiRPjww8/jOeffz7jdgTvv/9+PPfccxkfXGD9ClrWfn1bIaSzZMmSIq/Wcfzxx5fKe8/ixYvj7LPPjmnTpmU8pm7dunHQQQdFly5domnTplG/fv3Iy8uLKVOmxMiRI+PJJ5/M+PDL8OHDo127dnHKKaest5YpU6bEGWecUeDWGk2bNo1DDz00OnfuHI0aNYratWvHvHnz4qeffop33nknXnrppZgxY0bGWmrXrh1XX331emsBAKD8Et4DAMBG4IMPPoj//Oc/Gfu33nrruPbaa6NTp07r9NWtWze22267OPDAA+Oqq66K2267LZ544om0s7smTZoUl19++XofEtjQ3XfffWn/OF6tWrW44YYbomvXrmnPq1WrVjRp0iR23333OPfcc+PRRx+Nfv36pZ29P3z48OjVq1c0a9asUDVdcsklGYP3SpUqxQUXXBBnnnnm2uBrjdq1a0fz5s2jY8eO0bt373j66afj5ptvjkWLFq0zzhtvvBEDBw6Mc889t1A1laSCZn526NChDCspG78Py2rXrh1nnnlmHHTQQdG0adNYuXJljB8/Pt5666145pln4vDDD0+7+sDjjz+ecYZ38+bN46abbsr49Vvz89q6des44YQT4v/+7//i1ltvjSeffDLt8TNmzIh33303DjrooEJ9joMHD46PPvoobV+lSpXivPPOi/PPP3+dfdrr1asXO+20Uxx//PExcuTIuOaaawp8aCUbL7zwQjz44IMZ+zt37hx//etf025TsMUWW0Tr1q2je/fuMWHChPjrX/+a9nuwYMGCuOiii+LZZ5+NypUrl2j9mVSoUCG6desWAwcOXKdv2LBhxQ7vhw0blra9oCXzb7rpprTLeTdr1ixuv/322GmnndKeV6dOnWjevHnsv//+0bt37+jXr188/fTTaY8dMGBAHHPMMZGbawHJopo2bVqMGDEibV+lSpWyenjq22+/Xadt7733jtNOOy3atm0b1apVi9mzZ8cnn3wSzz//fHz44YcF/gwVx3XXXVfge2fPnj3jggsuiM0222yd/oYNG0aHDh3i3HPPjX//+98xePDgtOP885//jH322afA9/OVK1fGRRddlDG4r1mzZvTp0ydOPPHEqFjxt3+OrV27dmy11VbRqVOnuOSSS+L++++P/v37p33g7eGHH462bdtm/D0FAIDyz796AACgnFuxYkVcffXVGZdS3WeffWLIkCFpg/vfq1WrVlx33XXRv3//jGHUm2++Gc8991yxak5SKpXKWP9tt91W6D+I5+TkxKmnnhr/+te/0vavXr260EsKP/PMM/Huu++m7atdu3YMGjQoLrroonWC+3SOP/74GDJkSDRu3Dht/5133lniy/kWRqal0CtWrBgtW7Ys22LK2DbbbBPDhw+P8847L7beeuuoXLly1KhRI9q3bx9/+ctf4rXXXovzzz9/nfNWrlyZcUZz48aN49FHHy3Sgw+1a9eO66+/vsBZm6+++mqhxpo3b17cfvvtafsqVaoUd955Z/Tu3Xud4P73OnXqFM8991zGgDcbc+fOjeuvvz5j/9lnnx333Xdf2uD+9/7whz/EE088kTF4HD9+fNx7771Z15qNTLV89dVXMWnSpKzH/eSTT2Ly5MnrtNepUye6dOmS9pzvvvsuPv3003XaN99883j44YcL/X2tUaNG3HDDDXH
qqaem7Z88eXLGbTfIbPXq1fH3v/894/Ywu+66a1SvXr3Y17nqqqvivvvui3333Tc233zzqFy58tpVN+6///546aWXon79+sW+zu+NHj0644MJ1atXj3vvvTcuvfTStMH9r1WpUiX++te/xhVXXJG2f9myZXHfffcVOMbdd9+d9qGGiF9WIHn88cfj1FNPXSe4/72KFSvGeeedF48//njGVY5uuOGGmDt3boHjAABQfgnvAQAggwMOOCBatWpVqv8dcMABxa5z8ODBGZeL3WeffeK///1voZfB/vXnfvfdd0eFChXS9t96662xbNmyIte6IZg0aVLMmjVrnfbddtst9ttvvyKPd+ihh2bcC/rNN99c7/nLly+P2267LW1f5cqVY+DAgUWemd6sWbMYNGhQ2rBk5cqVBa7SUFoyhfdNmjRZb5hRnm2xxRbx4IMPxhZbbJHxmM022yzt9+qtt95K+7MaEfGPf/yjwDELctppp2X8mfrkk08KNcbgwYNj6dKlGWsrymupVq1a8eCDD8bWW29d6HMK8t///jfmz5+ftq9nz55x2WWXFWn57goVKsSNN96YdluDiF9W8vj555+zKTUrW2+9dbRr1y5tX6aZ84WRacn8I488MuPDXCNHjkzbfvrpp0ejRo2KXMMVV1wRLVq0SNtXmPsp/8/KlSvj2muvjbfffjvjMWeeeWaxr9OrV6/1rvhQ2BVoiurf//532vbc3NwYMGBA7LnnnkUa78wzz8z4Oh86dGjGwHzmzJlx//33p+2rXbt2PPTQQ/GHP/yhSLXsvPPOcd9996V9uGLevHlpV98AAGDjILwHAIByLC8vLx566KG0fXXq1Il+/fqtd+ZrJp07d46zzjorbd/s2bPjmWeeyWrcpGVaXneXXXbJeswzzjgjbfs333yz3occnnrqqZgzZ07avquuuirrulq0aBF///vf0/a9+uqrxZqhm41MD5hkE/CVJ+eff340bNgwq3NfeumltO277bZboVbSKMhxxx2Xtn3mzJm/2YM+nby8vHj88cfT9u27775x1FFHFbmezTffPPr27VvsPbF//vnnjNsCtG/fPi677LKsxs3NzY0bbrghttxyy3X6li1bFoMGDcpq3Gxl2v99+PDh6/3+pbNy5cp4+eWX0/YVtNx5pvtp69ati1xDxC8PLJ188slp+8aMGZPVmJuaVCoVI0eOjO7duxf4Pr3TTjvFvvvuW6xrNWzYMM4777xijZGtr776Kj777LO0fT169Ijdd989q3H/7//+L+3DKsuXL4///e9/ac954IEHYsWKFWn7brnllmjatGlWtbRp0yb+/Oc/p+17/PHHY8GCBVmNCwDAhk14DwAA5dibb74Zs2fPTtt35ZVXRr169Yo1fu/evTPOgnziiSeKNXZSlixZkra9ODNnd9hhh2jcuHFss802sd9++0WPHj3immuuiQEDBmRcvWCNTHs8b7vttnHiiSdmXVNExIEHHhjt27dfpz2VSsWzzz5brLGLIj8/P+Ms7fUtZ1ye1a5dO2NIvj6pVCpGjRqVtu+kk04qTlkR8UsolM6qVasyvkbWeP/99zPObL/qqquyrqlDhw7F3sd56NChafeJjvhlVndxVnmoUaNG9OrVK23fkCFD0u77XloOO+ywtNtoTJs2rdCrJ/zaW2+9lTYI3GGHHWKHHXbIeF6m13Vx7qedO3eOmjVrxk477RSHHnponHfeeXHDDTfE5ZdfnvWYG6P8/PxYvHhxzJw5M7766qsYMmRI3HjjjXHAAQdEz549Y/z48RnPrVChQlx55ZXFruH000/PuCpDacu0LU3t2rWjT58+WY/boEGDjPehDz74YJ22lStXZqylc+fOxX5A4rTTTku7Dc6yZcsybhkAAED5tvGuTQgAAJuA1157LW17nTp14vDDDy/2+JUrV46TTjop+vXrt07fxIkTY9KkSYXaN3pDkiksfvXVV+Oyyy7L+oGHbJZ0Hj9+fMaA5dxzz43
c3OI/b33iiSem3ZP6pZdeKrMwbPHixRn70gWQG4u99torqlSpkvX5DzzwQEyePDkmT54cU6ZMicmTJ8eMGTNin332KXZtDRo0yNi3fPnyqFGjRsb+V155JW17u3btYquttipWXSeccEIMHz486/NfeOGFtO2dOnWKtm3bZj3uGkcccUT069dvnZ/p2bNnx+jRo2OPPfYo9jUKo0aNGnHIIYekDQ2HDRtW5K02MoWPBc26j8h8P33yySfjmGOOyWolhW233TY+/vjjIp9XnvXv3z/69+9fZte76KKLYrfddiv2OPvvv38JVJOd119/PW17165di7xV0O8deuih8dxzz0XEL6tuNG7cOLbeeuu0S98X9DDTBRdcUKw6In550OL444+P22+/fZ2+l156KU455ZRiXwMAgA2L8B4AAMqxdLPAIiKOPvroEpsNd8wxx8Stt96adjbru+++W+7C+0z7ai9ZsiTOPvvsuOeee7LeS7yo3n333bTtlSpVKrFQZJ999omcnJx1ltKePn16fPfddyW2z3hBMs3OLUkff/xxiYcYTZo0ybhMcmEUNUD9tZycnPXOei6OgsL5VatWFXju6NGj07YfccQRxaopIqJjx47RqFGjmDFjRpHPnTt3bowbNy5t30EHHVTc0iIionr16tGxY8e0D+u8//77ZRbeR/wSrKcL3V955ZW4+uqrC/0eMG/evHjnnXfWaa9UqVIceeSRBZ6b6f4xZsyYuOqqq+L6669PbGY26e23334lEirXq1evTN4/0pk6dWpMnz49bd/RRx9d7PH32GOPuO2222LrrbeOrbbaqsCHsDK9j9evXz923XXXYtcS8ct2JOnC+zFjxsSSJUs26hVsAAA2RZbNBwCAcmr69OkZ90rfa6+9Suw6tWvXjp133jlt39ixY0vsOmVlq622SrsEbUTEuHHj4pBDDonbbrstvvvuu1KvJd2M+IiItm3bRs2aNUvkGnXq1Ek7WzAiMu4XXNIKCu+WLVtWJjUkoVWrVkmXkFFBM6ILWv599uzZMWXKlLR97dq1K5G69txzz6zO/fTTTzPu914SqxWskWnGclm9nn5dR7NmzdZpnz9/ftowPpMXX3wx8vLy1mnff//9o06dOgWeW9D3asiQIXHooYfGY489Vqxl9Ck5Rx55ZNxxxx0lsqpLkve3TO+dVatWjZ122qnY41epUiUOP/zw2H777de7ekqmWjp37pzVyhPp7LDDDml/J8jLyyuXv4cBAFAwM+8BACCDN954I5o2bVqq15g6dWoccMABWZ377bffZuzLFLZnq3Xr1mn/QF1QDRuqnJycOO644+KOO+5I27906dIYMGBADBgwIFq0aBH77LNP7LPPPrHbbrsVeyne3/v666/Ttpf0bMatt9467fL8ZfX9K2hW4MYc3hd3CfmSNn/+/Pj8889j1KhRBa4okCkAj4iYMGFC2vbc3NzYdttti11jRMR2222X1XnffPNN2vYqVaqU6L0802ojBe0xXhpycnLimGOOSXsvGzZsWBx44IGFGuf5559P276+JfMjIpo3bx577LFHjBo1Km3/tGnT4u9//3v07ds3dt5559h3331j7733jtatW5dIgEzhNGnSJC655JISWR1jjZYtW5bYWEU1adKktO3bbbddVKhQoczqyMvLi4kTJ6btK8lViXJzc6Nly5Zpg/pvv/22TFf8AACg9AnvAQCgnMq0ZOyWW24ZdevWLdFr7bjjjmnbp02bVqLXKSs9evSIp556Kn766acCj/vxxx/j4YcfjocffjgqV64cHTt2jL322is6d+5c7FmHeXl5Gb+HDRs2jCVLlhRr/F/bcsst07ZPnjy5xK5RkCpVqkSlSpXSzu6dN29emdRQ1ipVqrTeWculZeHChTFlypT48ccfY9KkSTFhwoT49ttv48cffywwmC+MTLPumzVrVmIPt2T72vrhhx/Stjdq1KhEt26oX79+2vYFCxbEvHnzyvT7fuyxx0b
//v0jPz//N+1vvvlmLFq0aL0rePzwww/x+eefr9PeoEGD2HvvvQtVwyWXXBInnXRSgdst5OfnxxdffBFffPFF3HnnnVG7du3o1KlTdO7cOTp37pzxHkX2KlasGLvttlscffTRccQRR0SlSpVKdPyy2l4mnUz3oWwf/MnW9OnT076vRfzyGirJ9/HGjRunDe/L6n0cAICyI7wHAIByaubMmWnbMwVLxVGvXr207UuXLi2X+63WqFEjbr311jjzzDNj+fLlhTpn5cqV8f7778f7778ft9xySzRq1Cj23Xff2H///WPPPfcscjAya9asjEHqHXfckXFlgJI0a9asUr/GGnXr1k37M/vjjz+WWQ1lqUaNGiW2ZHImU6ZMiS+//DImTJgQ33//fUyePDmmTp0a8+fPL7VrZtqLviQD60zbWqxPpnviDz/8EO3bty9OSYU2a9asMg3vGzVqFJ06dYr333//N+0rV66Ml19+OY4//vgCz8806/7oo48u9AzmXXbZJS677LK46aabCld0/LIKxEsvvRQvvfRSRERsv/320aVLlzjggANil112KfQ4m7KcnJyoXLlyVKlSJTbffPOoX79+NGnSJLbZZpvYaaedYtddd40aNWqU2vVr1apVamOvT6YH72rXrl2mdWS650REXHnllXHllVeWeg1l+T4OAEDZEN4DAEA5lWkmaWn8sb6g2ZvLly8vd+F9RMSuu+4aAwcOjD59+mS1H/OMGTPiiSeeiCeeeCI233zzOOigg6J79+6FDgkXLFhQ5GuWtNIMeX9v2223TRt0LF26NGbOnBkNGzYss1rKQklvsbDGpEmTYtiwYTF8+PBEVr7INJO0evXqJXaNbO8nm9prao3u3buvE95H/LJ0fkHhfSqVimHDhqXtK8yS+b/Ws2fPyM3NjVtuuSXjTOSCfPPNN/HNN9/EgAEDomnTptG1a9c47rjjolmzZkUeqzzq1atX9O7dO+kyimR9e8GXprL8/acgm+o9BwCA0iW8BwCAcirTjPH1LZOcjYLGXLlyZYlfr6zstttuMWzYsLjtttti6NChBS77XJAFCxbEM888E88880y0bds2Lr/88th1110LPKewM/5L04oVK8rsWn/4wx/SBowREePGjSt2eN+hQ4f49ttvszr3yiuvjCFDhhTr+r9X0vt5//TTT3HrrbfGsGHDir30fcQvIdfixYuLfF6mn9uSfIAn27HK8ud5Q6rhwAMPjFq1asXChQt/0z569OiYMWNGNGrUKO15n3zySUydOnWd9rZt22a1X/fpp58eHTp0iH79+sWHH35Y5PPXmDp1agwYMCDuvffeOOyww+Kyyy7L+DmQnJK+xxXFsmXL0raX9YOEm9r7OAAAZSO537Q3Yddcc020atUqbrvttjK75oQJE6Jv375x2GGHRbt27aJNmzZx+OGHx0033ZRx2UMAADZsmQK8kgj2fq+gmZRJzr4rCfXr148bb7wxXn/99ejdu3e0bNmyWOONGTMmTj311BgwYECBx/1+j+oklOWDFwXtY/7BBx+UWR3l0dtvvx2HHXZYPP/888V6fTdu3DiOPfbYuPPOO2PkyJFZjVEa95ffq1atWlZbDqxevboUqimaJB5mqlKlShxxxBHrtKdSqRg+fHjG8zItmX/MMcdkXcuOO+4YDz30UDzzzDNx4oknFmsZ8/z8/HjhhRfiyCOPjHfeeSfrcdj4bAiv9YgNo47y/AAlAADpmXlfxl577bV46qmnyvSa//3vf+Ouu+5aZxbRpEmTYtKkSfHMM8/ErbfeGvvuu2+Z1gUAQPFkCs0XLVpU4tcqaIZuaS0PXtYaNWoUvXr1il69esU333wTr7/+erz99tvx5ZdfFjloz8/Pj9tuuy1SqVRccMEFaY+pXLlySZRdLGURxK6x1157ZezLNCOfX5Y+v+qqq4q0KkRubm40btw4ttlmm2jVqlXstNNO0bp162jSpEmx66lWrVra9mxm8WeybNmyrH4
2N4TXVFK6d+8ejz/++Drtw4cPj3PPPXed9pUrV8bLL7+8TnvVqlXTPghQVK1bt47WrVvHNddcE6NHj4433ngj3nnnnfjxxx+LPNaiRYviwgsvjPvvvz923333YtdG+ZfptZ5pW4+yrqMsleX7OAAAZUN4X4befvvtuPjii8v0mv37948777wzIiLq1KkTZ555ZrRr1y5WrVoVL7/8cjz11FOxePHi+NOf/hTPPfdcVkvjAQCQjEx7u5ZkiLZGpgcCcnJyMoZ5pa00Z5ttv/32sf3220evXr1i3rx5MXLkyBg1alSMGjWqSOHTHXfcEXvssUe0a9dunb6Cvm7/+9//SiRo3ZA0bNgwtt9++/jmm2/W6Zs0aVJ88cUXscsuuyRQ2YZr0qRJcc011xQY3FeqVCl22WWXaNOmTey4446x7bbbxlZbbbXeh2qy3SIi07LUJXnfyXasTK+p3r17R69evYpT0gavdevWsd1228X48eN/0z5+/Pj45ptvYvvtt/9N+xtvvLHOMvsRvyzBX5Jbr1SsWDE6deoUnTp1ioiIKVOmxAcffLD2njpv3rxCjZOXlxd/+ctf4pVXXinzfc3Z8FSvXj1te1mH95nqqFixYnzxxRdRoUKFMq0HAICNg/C+jAwaNCj+9a9/FbjcaEn7+uuv4+67746IiCZNmsTgwYOjWbNma/s7deoUO+64Y1x77bWxfPnyuP322+OOO+4os/oAACieBg0apG2fPXt2iV/rp59+Sttev379xJa3Lqt9XuvUqROHH354HH744RHxS/j03nvvxf/+978YNWpUgQ8R5Ofnx6233hoPP/zwOn2Zvn8REdOmTdvowvuIiP322y9teB8R8eijjwrvfyWVSsXll1+ecU/lmjVrRq9eveLYY4+NWrVqFXn8bF8/jRs3Ttv+888/ZzVeOtmuHpLpNTVt2rTilFNudO/ePW666aZ12ocNG7ZOeJ9pyfzu3buXSm1rNGvWLP74xz/GH//4x0ilUvHVV1/Fe++9F2+88UaMHTu2wFnEc+bMiUGDBm30D2KwfvXr10/bPn/+/DKtI9M9Z9WqVfHTTz9tlO/jAACUPnvel7Iffvghzj///LjpppsiLy+vTJ+6veOOO2LVqlWRk5MT//nPf34T3K/xxz/+MbbbbruI+GV2T6Y/DAEAsOHJ9EfhWbNmlWiQFhExbty4tO1NmzbNaryS2O+9sDM2S1qzZs3ipJNOioEDB8b7778f1157bcZAMyJi9OjRMX369HXaa9eunXHW3tSpU0us3g1J9+7dIzc3/T9DX3jhhayW1N5YvfHGG/Hll1+m7WvatGm88MIL0bNnz6yC+4jsXz8tWrRI2z59+vRYsGBBVmP+3nfffZfVeZnuiRvr6+n3jjrqqKhUqdI67b9fHn/evHnx3nvvrXNc48aNY4899ii1+n4vJycndt555zj//PPj6aefjtdeey3OPffcjPfFiMwPHbBpyfSeO2HChBK7xpIlS9b7oGFB7/2byn0HAICSJ7wvRY8++mh07do13nzzzYiI2HbbbePvf/97mVx73rx58e6770ZExCGHHFLgDJazzjorTjjhhDjzzDNj6dKlZVIfAADF16pVq4x9mUK/bH311Vdp25s3b17geZmC2mXLlhW7plmzZhV7jOKqVatWnHLKKfH8889HmzZt0h6TSqVizJgxafsyfQ9Hjx5dUiVuUJo1axb7779/2r68vLy4/vrry7iiDdewYcPStufm5sbdd98dDRs2LNb4M2bMyNhX0OznP/zhDxlX28j0kE9Rff3111mdt+bB9N/74osvymyljiTVrVs3unTpsk77tGnTfnMPevHFF9OuCtitW7eM9+yy0KxZs7jkkkviySefjNq1a6c9ZvLkyTFnzpyyLYwNzh/+8Ie07ePHjy+xPeCvu+66aNOmTRx00EFxxhlnxLXXXhv33ntvTJw4ce0xderUyTj7fmN9HwcAoPQJ70vR2LFjIy8vLypXrhz
nnXdePPfcc+v94+bvrVy5Mh599NHo2bNn7LnnnrHzzjtHp06d4vTTT4+HH34440z5Dz74YO0/xrt27VrgNbp16xZ9+/aNv/zlL1G3bt0i1QcAQHKaNGkS9erVS9uXblZltmbNmhXffvtt2r62bdsWeG66WaARxd+XdunSpcVaCjs/Pz+mTJkSb7/9djz44IPx5JNPFqueWrVqxT//+c+MoWamGXg777xz2vaS/P5F/LJ3+sSJE0vkoYni6tmzZ8a+9957L5566qmyK2YDlin46dKlS8aQuigKesCnoJUx6tatmzE4GzVqVLHrioiMD7usT+vWrdO2L1++vESDtAULFsQXX3wRc+fOLbExS8qxxx6btv2ll15a+/ELL7ywTn9OTk4cc8wxWV1z5cqVMWHChHjllVfiv//9b4wcOTKrcdbYbrvt4rLLLsvYb0YzO+20U9r2pUuXxvjx40vkGl9++WXk5eXF5MmT44MPPognn3wy/v3vf6/zu0em+05Jv49//fXX8cMPPxS4VQ8AABsHe96XoipVqsTxxx8fF1xwQVb7XH3zzTdx0UUXrfMP07lz58aHH34YH374YTz44INxxx13rPNHv1/vI/nrWff5+fkxe/bsWLJkSTRs2DA222yzItcFAMCGY88994zhw4ev0z5s2LC47LLLonLlysW+xtChQ2PVqlVp+3bfffcCz820/HFxl7z//PPPs1p6/5NPPonrr78+fvjhh988CLtmH+biaNGiRWy77bZpl+3NtMJV586d4+GHH16nfdasWTFy5Mjo1KlTsWqKiFi9enWcc845awOHevXqRZMmTaJp06axzz77ZB3YZatjx46x//77x//+97+0/ddff300a9asRD738mrhwoUZg+H1PTBTWG+99VbGvkyv9zX23HPPtAHZ0KFD489//nOxZm/Pnj0764cAdthhh6hXr17abUOef/756Ny5c9Z1/doDDzwQAwYMiIhf7nGNGzeOpk2bRtOmTeOKK64okftutvbZZ59o0KBBzJ49+zftr732Wlx11VXx008/xaeffrrOeR07dizyZINHH300HnrooZgyZcpvlhfv3r17sV+/mVboiCiZlVso33beeefYfPPN027VMWLEiAJXJiqM6dOnxw8//LBOe05Ozjr34M6dO6d9P/v888/j+++/j6222qpYtURELFq0KE4++eRYunRp5OTkRIMGDda+jx9++OEFvl4AACh/zLwvRdddd13ccMMNWQX33333XZxyyikxderUqFSpUpx00klxzz33xNNPPx333HNP/PGPf4xKlSrFtGnTokePHjFp0qTfnL/mD4aVKlWKLbbYIubMmRPXXXdd7LHHHrHPPvvEYYcdFh07dowePXrExx9/XCKfLwAAZe/ggw9O2z5//vy0oX5RrVy5MuNM6KZNm8Y222xT4PmZlj5O90fxonjllVeyOq9Ro0bxzTffrLOC1ZQpU7LeZ/vXqlatmra9Zs2aadv32GOPjH3//e9/i11PxC9fq1/PFPz555/jiy++iBdffDGxbbOuueaajA925OXlxUUXXRRvv/12mdTy4osvxquvvlom1yqsgsLJzTffvNjjjxs3Lj788MOM/emWVP+1I488Mm37Tz/9tHbbuGwNGTJkvQ8PZJKTkxMHHnhg2r4XXnghfvzxx+KUFhG/rBry63vi0qVLY+LEifHWW2/F559/nmhwHxFRsWLFOOqoo9ZpnzZtWowfPz5effXVtMuKZ5qxX5DNNtssfvjhh3X2BX///ffXu1f4+lSrVi1jX40aNYo1NuVfbm5u7Lvvvmn7RowYkfU9ZI0XXngh7etku+22W+cefMABB6R9YCk/Pz/uueeeYtWxxhNPPLH2/TqVSsWsWbPis88+i+HDh0eFChVK5BoAAGw4hPelqDizDS677LJYvHhx1KxZMx577LH429/+Fl26dIlddtklunTpEtdff3089NBDUbly5Vi8eHFcffXVvzl//vz5EfH
LP2rHjBkTRxxxRDzxxBO/eSp59erVMWrUqDj11FPjgQceyLpWAACSs99++2Xc+uiWW24p9t7At99+e0yZMiVtX2Fmqjdu3Dht+yeffJJxC6j1mT17dgwdOjSrcxs3bpzx4drHH388qzHXWLlyZcaHElq0aJG2vWrVqhmD0I8++ijt8tZFsWLFivj3v/+dtq9ixYpx2GGHFWv8bDVu3Dj+/Oc/Z+xfsmRJXHDBBfHAAw9ktcJCYUydOjV69eoVF198cbG3cShptWrVytiX6fVYWHl5efH3v/+9wH2h17cs88477xw77rhj2r4bb7wx64dCZs2aFQMHDszq3DWOO+64tO2rV6+OG264odj7YQ8YMCDjqgjr27KurHTv3j1t+1tvvRWvvfbaOu3Vq1ePQw45pMjX2W233dK2l8RDHJmWPs/Jycl4P2XTkumBk+nTp8egQYOyHnfp0qXx0EMPpe1Lt1LNlltuGXvvvXfa44cNG1bsCTM///xzxocAateuHXvttVexxgcAYMMjvN8AjRw5cu3+gxdeeOFvlr3/tfbt20ePHj0iIuLTTz+NL774Ym3fmj8+rVixIs4///xYsGBBnHbaafHCCy/E2LFj4+23344rr7wyqlevHqlUKm6++eZ48cUXS/kzAwCgpFWqVCnOOOOMtH3z58+PK664Iuv9Ud9///148MEH0/ZVr149Y0j2a5n2gl26dGk8/fTTRa4pPz8//vrXvxZr2eRDDz00bfvjjz+edsn7who2bFgsWrRonfaKFStG+/btM5535plnRqVKldL2XX311cXav/fGG2/MuD/0EUcckfHBj7LQs2fPOProozP2r169Om6++ebo3r17fPLJJyV23e+//z6uvfbaOPTQQ9MGmRuCatWqZVyR4dVXX816VmkqlYq//e1v691TfsWKFesd69xzz03bPm3atLjhhhuKXFt+fn5ce+21sXDhwiKf+2u77LJL7LHHHmn73nnnnbjzzjuzHvvjjz/OeE+sWbNmdOvWLeuxS9I222yTdnuF4cOHp10y/9BDD824EkZBGjdunPEef/PNN2f9gFbEL0vyp7P99tsX+HALm4499tgjtt9++7R9/fv3X2eFysK68847Y9asWeu0V6lSJeM2M5nuh6tXr44+ffrEzJkzs6olPz8/rrzyyrS/W0REnHzyyVGxoh1RAQA2NsL7DdCvn1Bf3xO0v14m7Nf7Aq75Y+bSpUtj/vz50bdv37j66qtj2223jcqVK8eWW24ZZ5xxRjz44INr/1DYr1+/Qv2RBgCADcupp54ajRo1Stv33nvvxYUXXljkEOXNN9+M888/P+PSx3/6058KFfx27NgxNttss7R9//nPf4q0VP3KlSvjiiuuiHfeeafQ56Rz8sknp11mNi8vLy644IKs/sg+YcKEuOWWW9L27bPPPgV+rZo1axYnnnhi2r6lS5fGaaedltXMvbvuuiuefPLJtH2VKlWKP/3pT0Ues6TdcMMN0bFjxwKPGTduXJx88slx4oknxogRI7J6cGPJkiUxfPjwOOuss+Kwww6LJ598cr1Lw+fm5sYJJ5xQ5GuVlA4dOqRtnzx58tr91oti0aJF0adPn3jmmWfWe2xhViI47LDDMj5o/uyzzxYpwF+9enVcfvnlxZ6tvcall16acSnpu+66K/71r38VeQb+l19+Gb179874c3POOedk3CYkCelmJY8fPz7tgx+ZZuoXxqmnnpq2ffLkydGnT5/1vs7SGT58eDz//PNp+wr7gMRpp50WrVq1SvtfQVtGUH7k5OTExRdfnLZv2bJl0aNHj5g4cWKRxnz66aczrkx5wgknZHyNd+jQIeO+87Nnz46TTz65yA8Hrl69Oq677rqMv/PUqVMnzjrrrCKNCQBA+SC83wCNGzdu7cdHHXVUxn9wtmrV6jf/UJ48efLaj3+912anTp3i+OOPT3uttm3brp0xNXPmzPjggw9K+tMBAKCUVa9ePfr27Rs5OTlp+99
9993o1q1boX7XW7RoUfTt2zcuuuiijDP2d9lllzjttNMKVVvlypWjS5cuafsWL14cp59+eqGC6U8++SSOP/74GDZsWKGuW5CmTZvG4YcfnrZvypQp0b1793jjjTcKPd7rr78ep5122m+2qFojJycnLrjggvWOcfHFF0ezZs3S9s2fPz969uwZt912W8bZd782a9as+POf/xx33HFHxmPOOeecaNq06XrHKm2VK1eOu+++u1DL/n722WdxySWXxO677x5nn3123HvvvfHWW2/F5MmTY968eZGXlxcrV66MefPmxYQJE+L111+P/v37R8+ePWPPPfeMSy+9NN57771ChbYNGjSIBx98MM4///yS+DSzkmnv9ohfZob279+/UPuK5+XlxXPPPRdHHHFEvPzyy4W69ppt2Nanb9++Gfd4f/jhh+Oss86K6dOnFzjGDz/8ED179ozhw4cX6pqF0bp164wrkkREDBw4MHr27Blff/31esdavXp1PPLII3HaaadlXC5/q622itNPPz3rekvDEUccUeC+8Wu0aNEi44MihXH44YdnvJe8+eabcfLJJxd69ZBVq1bF3XffHVdccUXa/gYNGmT82wabpi5dumRcTWf27Nlx0kknxWOPPbbe7VcWL14cN9544zpbUq6x5ZZbRp8+fQoc429/+1tsvvnmafumTp0aJ554Ytx///2FmjTz/fffx5lnnhlPPfVUxmMuvfTSqFGjxnrHAgCg/LG20gZo3rx5WZ336z8W/voX+IMPPrjA8/bff/+1+3uOGTMm9ttvv6yuDwBAcvbee+/405/+FLfffnva/u+//z7OOOOMaNu2bRxyyCHRvn372GKLLaJatWoxZ86cmDx5crzxxhvx+uuvpw2h19hyyy3jrrvuKtIyrRdeeGG8/PLLaYPG2bNnx6mnnhpdunSJrl27xk477RR169aNvLy8+Omnn2LMmDHx4osvpl02/YADDoi33nqrUAHm7/31r3+N999/P20YN3v27Ljwwgtj2223jSOOOCLatGkTzZs3X/s79uLFi2Pq1KkxduzYePHFFwsMAE844YSMs5N/bbPNNou77rorTjrppLSznvPy8mLAgAHxxBNPxAEHHBD77rtvbLXVVlGvXr3IycmJefPmxbhx49buaV1QOLDnnntG796911tTWalZs2bce++9cf3112dcKeDXVqxYEe+++268++67pVLP0UcfHVdddVXUqVOnVMYvrKOOOioGDBiQcY/7O++8M55//vk49thjY9ddd41mzZpFjRo1YsWKFWsfYPjoo4/i9ddfj9mzZ6cdo3bt2mmD+p9++qlQNW6//fZx5ZVXxvXXX5+2/7333ovDDz88Dj744DjssMOiZcuWUadOnfj555/ju+++i5deeilee+21dR4UqlGjRixevLhQNWRy8cUXx1dffRUjR45M2z9q1Kg45phjYs8994wuXbpE27Zto379+lGjRo1YtGhR/PDDDzF69Oh4/vnnC3wAoXr16tG/f/9CBeVlqUaNGnHQQQet94GnTMuAF1blypXjH//4R/To0SPtgzFffPFFHHXUUdGpU6fYf//9o3Xr1rHFFltEjRo1Ii8vLxYtWhQTJ06MTz/9NEaMGFHgyidXXXVVxpVc2HTdcMMN8e2338b333+/Tt/ChQvj73//ewwePDgOO+yw2GuvvWLLLbeMWrVqxcKFC+PHH3+M999/P5577rmMDy1VqFAh+vbtu96gvGHDhnHbbbfFueeem3aFi8WLF8ctt9wSgwYNioMPPjj23nvvaNasWdStWzfy8/Njzpw58eWXX8brr78e77zzToHboxx77LGF2roIAIDySXi/AVrzC3rlypULfMr29379D4kGDRqs/XjLLbcs8LzGjRuv/TjbBwcAAEjehRdeGAsXLsy4J3PELw9rrm+/60yaNGkSAwcOjC222KJI52277bbxxz/+MR577LG0/alUKt58880iLZm9/fbbxy233BK77bZ
bkWpZo27dutGvX7+46KKLMi7rPHHixIwPQxRG27ZtM87iS6dVq1YxYMCAOO+882Lp0qVpj5k/f348++yz8eyzz2ZV04477hi33npr5OZuWIuwVaxYMa6//vpo37593HzzzRlnOJemrbfeOq655prYc889y/za6VSuXDmuvvrqOP/88zOuFjB58uT4z3/+U+Sxc3Nzo0+fPtG4ceO49NJL1+kvyj3ilFNOiVmzZmVcyn/ZsmXx/PPPZ1wG/fdycnLimmuuyTj7urAqVqwY/fv3j3POOSftPu8Rv9x73n///Xj//fezukbVqlXjtttui2233bY4pZaa7t27Fxje5+bmFnoZ+oLsvvvucd5552X8GUilUvHBBx8Ua6W/Hj16xBFHHJH1+Wy8atasGQMHDowePXrEtGnT0h7zww8/xN133x133313kcbOycmJG2+8MfbZZ59CHb/XXnvFP//5z7jssssyhu+zZs2KRx55JB555JEi1bJG586d429/+1tW5wIAUD5sWH+xKWUTJkyIvn37xmGHHRbt2rWLNm3axOGHHx433XRTzJgxI+ny1lqzh9bKlSujYcOGscMOOxTqv18vs9mqVau1Hxc0c2rNddaoVatWyX4yAACUqSuvvDKuuOKKIs2ML4x27drFE088Edtss03WdWUbtP/ejjvuGA8++GCxl4vdd9994z//+U9UqlSpROr6tT322CPuv//+jEuKZ7LbbrvFww8/HE2aNCnxmvbaa694+OGHE59RXpBu3brFSy+9FCeccEKZPWDQqFGjuOGGG2LEiBEbTHC/RpcuXeLKK68s0TGbNGkSDz30UJx33nmx/fbbpz3mo48+KnDW5+9dfPHF8ec//7lEvmdXX311obZRKIwaNWrEAw88UCqhb+3atWPQoEEZtwXZEOy+++4Fbo+x5557RqNGjUrkWhdffHGp7b991llnxV//+tdSGZuNQ7NmzeLRRx+NnXbaqcTGrFy5cvTt27fIq1Mcfvjhcc8996z9215J6tatWwwYMCCqVKlS4mMDALDh2GTC+//+97/RrVu3eOSRR+K7776LpUuXxvLly2PSpEkxaNCg6Nq1a7z99ttJlxkREdttt93aj0eNGlXgsZMmTYq77747hg0bFj/88MPa9rZt2679ON0So782YcKEtR9vCPteAgBQPGeeeWY89thjscMOOxR7rBo1asRVV10Vjz32WJFn3P9alSpV4v77749TTjklcnJysh7nmGOOicceeyzq1q2b9Ri/duCBB8Zjjz2WMcQsqmrVqsUll1xSrIcLdt555xgyZEiJBdg1atSIv/71rzFw4MBysT9u7dq1o2/fvvHKK6/EaaedFtWrVy+V67Rp0yb+9a9/xWuvvRbHH398VKhQoVSuU1w9e/aMf/zjH8X+OlSuXDnOPvvsGD58eHTs2DEiIrbZZpto2LDhOscuWbIk/ve//xVp/AsvvDDuvffe9a78lkmlSpXiH//4R5x66qlZnZ9JtWrV4tZbb41+/fpFvXr1SmTMQw89NIYNGxbt2rUrkfFKS05OToHB47HHHlui17v88svjn//8Z4ndn7fccssYMGBAXH755SUyHhu3Ro0axeOPPx5nnnlmsR9g3GabbeLpp5+O448/PqvzO3fuHMOGDVvvNpaFVb9+/bjlllvi5ptvLpUHDgEA2LBsEuF9//794/bbb49Vq1ZFnTp14pJLLolHHnkkBg0aFCeeeGLk5ubG4sWL409/+lNMmjQp6XJj7733XvvxQw89lHGJxIhfPrf//Oc/cdlll/1macNOnTqtXTr/pZdeijlz5mQcY8iQIRHxyz5e+++/fzGrBwBgQ9CmTZt49tln49///ne0bt26yOdvueWW8ec//zneeuut6NmzZ4mEyJUrV45rr702nnrqqdh3332LFOK3a9cuBg8eHP369SvxvaV32WWXePbZZ6Nv377Rpk2brMaoV69enHXWWfH666/HueeeW+yv1+abbx59+/Z
du6d5Np9zgwYN4txzz42XXnopevToscGG05k0b948rr766nj77bfjhhtuiP3337/YAfZ2220XvXr1ihEjRsRTTz0VRx55ZLkIQrp37x5DhgyJo446qsj11q5dO84444x49dVX47LLLvvNnuEFLZv+wAMPFLnOvffeO15++eXo3bt3kYLyzp07x/PPPx/du3cv8jUL65hjjln7NcjmofVKlSrFIYccEg899FDcfvvtaR962BAdc8wxae9HtWrVigMPPLDEr3fUUUfFSy+9FL169cp6csC2224bV199dbz22mux3377lXCFbMyqVKkSV1xxRQwfPjyOPfbYqFq1apHOb9myZfTt2zeGDh1a7If6GjZsGHfeeWc8/vjjcfDBB2f1XtO8efPo06dPvPzyy3H00UcXqx4AAMqPnFRByfBG4Ouvv47jjjsuVq1aFU2aNInBgwf/Znn5iIgnn3wyrr322oiIOOSQQ+KOO+4otXo+/PDDOP300yMi4vzzz4+LL754nWNWr14dhx9++NqZ9GeffXZcdtll6xw3bNiwuPzyyyOVSsUWW2wRr7/++m+WznrwwQejX79+EfFLmH/XXXf95g81ERGDBw+Of/zjHxHxy+yB4uzpCQDAhuuHH36Id999Nz777LP4/vvvY8aMGbFkyZLIz8+PGjVqxOabbx7bbLNN7LjjjrH33ntHmzZtijVDvjCmTZsWr732WowePTomTpwYs2bNiuXLl0fVqlWjTp06sdVWW0X79u3jgAMOKLGZ8YXx/fffx0cffRTjxo2Lb7/9NubMmROLFy+OxYsXR8QvM3nr1q0bLVu2jB133DF222236NixY4lvVfBry5Yti/fffz8++eST+Prrr2Pq1Kkxd+7cWLFiRVSoUCFq1aoVtWrVihYtWkTr1q2jXbt2sdtuu5W7wH59Vq5cGZ9++ml8++23MWHChJg4cWLMmzdv7fcnLy8vqlatGtWrV4/69etH8+bNo2XLltG6devYddddS2xGcJJmz54db731Vnz22Wcxbty4mDt3bixatChWrFgRVatWjXr16kXz5s1j5513jt122y123333An82ly5dGrNnz16nPScnJ5o1a5b1fSAvLy/efvvtGDlyZHz++ecxY8aMWLBgQeTk5MTmm28eW2+9dXTo0CEOOeSQ32z7VlbGjh0bI0eOjC+//DJ++OGHmDlzZixZsiRSqVTUqFEjatasGVtssUXstNNO0bp16+jcufNG8fNTllKpVHz22Wfx+eefx7hx42LixImxYMGCWLRoUSxZsiQqVKgQ1atXjy233DJatmwZbdq0iT333LNM7/dlaeHChTF48OC0fWteq5SsxYsXxzvvvBOjR4+Ob775JqZOnRoLFy6MvLy8qFmzZtSuXTuaNGkSHTp0iN133z3atWtXalu2LFy4MN5999349NNPY/z48TFt2rSYN29erFixIipXrhw1a9Zce2/cZZddon379tGuXbtS/10MAIANz0Yf3l9wwQXxv//9L3JycuKpp56KXXbZJe1xRx55ZIwfPz4qVaoUH3/8cZGfzi2swoT3ERFffPFFnHLKKWv3o+/YsWOccMIJ0aJFi5gzZ068/vrr8fzzz8fq1asjJycnBgwYsM5ee/n5+XHGGWesXXq/RYsW0aNHj9hhhx1i0aJFMWzYsBgxYkRERNStWzdGjBhRYssIAgAAAAAAAFB4pTc1ZAMwb968ePfddyPilxn1mYL7iIizzjorPvnkk6hTp04sXbq01ML7wtpll13iwQcfjD59+sTs2bNj9OjRMXr06HWOq169elx//fXrBPcRvyyBeM8998QVV1wRL7/8cvz4449x/fXXr3Ncy5Yt46677hLcAwAAAAAAACRkow7vP/jgg8jLy4uIiK5duxZ4bLdu3TLu9ZeUDh06xKuvvhpPP/10/O9//1u7xFzlypWjRYsW0blz5zj55JOjUaNGGceoWrVq3H777fHBBx/Es88+G59++mnMmTM
natasGS1atIiuXbtGt27d1llOHwAAAAAAAICys1Evm//vf/877r333oiIeOedd6Jhw4YR8cty8rNnz44lS5ZEw4YNBdcAAAAAAAAAJGqjnnk/YcKEiIioVKlSbLHFFjFnzpy4884746WXXooFCxZERESFChWiY8eO0bt37+jQoUOS5QIAAAAAAACwicpNuoDSNH/+/IiIqFGjRowZMyaOOOKIeOKJJ9YG9xERq1evjlGjRsWpp54aDzzwQEKVAgAAAAAAALAp26jD+yVLlkRExIoVK+L888+PBQsWxGmnnRYvvPBCjB07Nt5+++248soro3r16pFKpeLmm2+OF198MeGqAQAAAAAAANjUbNR73h944IExZcqUtf//hhtuiOOPP36d48aMGROnnnpq5OXlRcOGDeO1116LKlWqZHXNVCoVOTk5WdcMAAAAAAAAwKZno97zvmrVqms/7tSpU9rgPiKibdu2cdxxx8Xjjz8eM2fOjA8++CD222+/rK6Zn5+KhQuXZnUuAAAAAAAAABuPWrWqRYUKhVsQf6MO72vUqLH244MPPrjAY/fff/94/PHHI+KXmfjZhvcREatW5Wd9LgAAAAAAAACbno16z/sGDRqs/XjLLbcs8NjGjRuv/XjevHmlVhMAAAAAAAAA/N5GHd63atVq7ccLFiwo8NiVK1eu/bhWrVqlVhMAAAAAAAAA/N5GHd63bdt27ceffPJJgcdOmDBh7cdNmzYtrZIAAAAAAAAAYB0bdXjfqVOntUvnv/TSSzFnzpyMxw4ZMiQiIipUqBD7779/mdQHAAAAAAAAABEbeXhfoUKFOOussyIiYvHixXHppZfGkiVL1jlu8ODBMXLkyIiIOOigg2KLLbYo0zoBAAAAAAAA2LTlpFKpVNJFlKb8/Pw444wzYtSoURER0aJFi+jRo0fssMMOsWjRohg2bFiMGDEiIiLq1q0bI0aMiHr16mV9vdWr82Pu3HUfEAAAAAAAAABg01K37mZRoULh5tRv9OF9RMTy5cvjiiuuiJdffjnjMS1btoy77rortt1222JdS3gPAAAAAAAAQITwPqMPPvggnn322fj0009jzpw5UbNmzWjRokV07do1unXrFptttlmxryG8BwAAAAAAACBCeJ8o4T0AAAAAAAAAEUUL7wt3FAAAAAAAAABQaoT3AAAAAAAAAJAw4T0AAAAAAAAAJEx4DwAAAAAAAAAJE94DAAAAAAAAQMKE9wAAAAAAAACQMOE9AAAAAAAAACRMeA8AAAAAAAAACRPeAwAAAAAAAEDChPcAAAAAAAAAkDDhPQAAAAAAAAAkTHgPAAAAAAAAAAkT3gMAAAAAAABAwoT3AAAAAAAAAJAw4T0AAAAAAAAAJEx4DwAAAAAAAAAJE94DAAAAAAAAQMKE9wAAAAAAAACQMOE9AAAAAAAAACRMeA8AAAAAAAAACRPeAwAAAAAAAEDChPcAAAAAAAAAkDDhPQAAAAAAAAAkTHgPAAAAAAAAAAkT3gMAAAAAAABAwoT3AAAAAAAAAJAw4T0AAAAAAAAAJEx4DwAAAAAAAAAJE94DAAAAAAAAQMKE9wAAAAAAAACQMOE9AAAAAAAAACRMeA8AAAAAAAAACRPeAwAAAAAAAEDChPcAAAAAAAAAkDDhPQAAAAAAAAAkTHgPAAAAAAAAAAkT3gMAAAAAAABAwoT3AAAAAAAAAJAw4T0AAAAAAAAAJEx4DwAAAAAAAAAJE94DAAAAAAAAQMKE9wAAAAAAAACQMOE9AAAAAAAAACRMeA8AAAAAAAAACRPeAwAAAAAAAEDChPcAAAAAAAAAkDDhPQAAAAAAAAAkTHgPAAAAAAAAAAkT3gMAAAAAAABAwoT3AAAAAAAAAJAw4T0AAAAAAAAAJEx4DwAAAAAAAAAJE94DAAAAAAAAQMKE9wAAAAAAAACQMOE9AAAAAAAAACRMeA8AAAAAAAAACRPeAwAAAAAAAEDChPcAAAAAAAAAkDDhPQA
AAAAAAAAkTHgPAAAAAAAAAAkT3gMAAAAAAABAwoT3AAAAAAAAAJAw4T0AAAAAAAAAJEx4DwAAAAAAAAAJE94DAAAAAAAAQMKE9wAAAAAAAACQMOE9AAAAAAAAACRMeA8AAAAAAAAACRPeAwAAAAAAAEDCKiZdAAAAAAAAAMCmJJVKxYoVK5Iu4/+X+v//NyfRKtaoUqVK5ORsGLWUNeE9AAAAAAAAQBlJpVLRt+81MWHCt0mXskHabrtWcfXVfTfJAN+y+QAAAAAAAABlaBPMpSmEnFQqlVr/YRTW6tX5MXfukqTLAAAAAAAAADZQG8qy+StWLI9evc6JiIj+/QdGlSpVE65o41s2v27dzaJChcLNqbdsPgAAAAAAAEAZysnJiapVkw/Kf61KlaobXE2bGsvmAwAAAAAAAEDChPcAAAAAAAAAkDDhPQAAAAAAAAAkTHgPAAAAAAAAAAkT3gMAAAAAAABAwoT3AAAAAAAAAJAw4T0AAAAAAAAAJEx4DwAAAAAAAAAJE94DAAAAAAAAQMKE9wAAAAAAAACQMOE9AAAAAAAAACRMeA8AAAAAAAAACRPeAwAAAAAAAEDChPcAAAAAAAAAkDDhPQAAAAAAAAAkTHgPAAAAAAAAAAkT3gMAAAAAAABAwoT3AAAAAAAAAJAw4T0AAAAAAAAAJEx4DwAAAAAAAAAJE94DAAAAAAAAQMKE9wAAAAAAAACQMOE9AAAAAAAAACRMeA8AAAAAAAAACRPeAwAAAAAAAEDChPcAAAAAAAAAkDDhPQAAAAAAAAAkTHgPAAAAAAAAAAkT3gMAAAAAAABAwoT3AAAAAAAAAJAw4T0AAAAAAAAAJEx4DwAAAAAAAAAJE94DAAAAAAAAQMKE9wAAAAAAAACQMOE9AAAAAAAAFMOnn34cffpcEJ9++nHSpQDlmPAeAAAAAAAAsrRixYoYNGhg/PzznBg0aGCsWLEi6ZKAckp4DwAAAAAAAFkaPnxIzJ8/LyIi5s+fFyNGDEm4IqC8Et4DAAAAAABAFmbOnBEjRgyNVCoVERGpVCqGDx8aM2fOSLgyoDwS3gMAAAAAAEARpVKpGDz4/rXB/fraAdZHeA8AAAAAAABFNH36tBg79vPIz8//TXt+fn6MHft5TJ8+LaHKgPJKeA8AAAAAAABF1Lhxk2jduk3k5v42bsvNzY3WrdtG48ZNEqoMKK+E9wAAAAAAAFBEOTk50aPHWZGTk1OodoD1Ed4DAAAAAABAFho2bBRdu3ZbG9Tn5OTEkUd2i4YNt0y4MqA8Et4DAAAAAABAlo488pioXbtORETUqVM3unY9JuGKgPJKeA8AAAAAAABZqlKlSuy9936Rm5sbnTt3iSpVqiRdElBOCe8BAAAAAAAgSytWrIh3330z8vPz491334wVK1YkXRJQTgnvAQAAAAAAIEvDhw+J+fPnRUTE/PnzYsSIIQlXBJRXwnsAAAAAAADIwsyZM2LEiKGRSqUiIiKVSsXw4UNj5swZCVcGlEfCewAAAAAAACiiVCoVgwffvza4X187wPoI7wEAAAAAAKCIpk+fFmPHfh75+fm/ac/Pz4+xYz+P6dOnJVQZUF4J7wEAAAAAAKCIGjduEq1bt4nc3N/Gbbm5udG6ddto3LhJQpUB5ZXwHgAAAAAAAIooJycnevQ4K3JycgrVDrA+wnsAAAAAAADIQsOGjaJr125rg/qcnJw48shu0bDhlglXBpRHwnsAAAAAAADI0pFHHhO1a9eJiIg6depG167HJFwRUF4J7wEAAAAAACBLVapUiZ49z4l69epHjx5nR5UqVZIuCSinKiZdAAAAAAAAAJRn7dt3iPbtOyRdBlDOmXkPAAAAAAAAAAkT3gMAAAAAAABAwoT3AAAAAAAAAJAw4T0AAAAAAAAAJEx4DwAAAAAAAAAJE94DAAAAAAAAQMKE9wAAAAAAAACQMOE9AAAAAAAAACRMeA8AAAAAAAAACRPeAwAAAAAAAEDChPcAAAA
AAAAAkDDhPQAAAAAAAAAkTHgPAAAAAAAAAAkT3gMAAAAAAABAwoT3AAAAAAAAAJAw4T0AAAAAAAAAJEx4DwAAAAAAAAAJE94DAAAAAAAAQMKE9wAAAAAAAACQMOE9AAAAAAAAACRMeA8AAAAAAAAACRPeAwAAAAAAAEDChPcAAAAAAAAAkDDhPQAAAAAAAAAkTHgPAAAAAAAAAAkT3gMAAAAAAEAxfPrpx9GnzwXx6acfJ10KUI4J7wEAAAAAACBLK1asiEGDBsbPP8+JQYMGxooVK5IuCSinhPcAAAAAAACQpeHDh8T8+fMiImL+/HkxYsSQhCsCyivhPQAAAAAAAGRh5swZMWLE0EilUhERkUqlYvjwoTFz5oyEKwPKI+E9AAAAAAAAFFEqlYrBg+9fG9yvrx1gfYT3AAAAAAAAUETTp0+LsWM/j/z8/N+05+fnx9ixn8f06dMSqgwor4T3AAAAAAAAUESNGzeJ1q3bRG7ub+O23NzcaN26bTRu3CShyoDySngPAAAAAAAARZSTkxM9epwVOTk5hWoHWB/hPQAAAAAAAGShYcNG0bVrt7VBfU5OThx5ZLdo2HDLhCsDyiPhPQAAAAAAAGTpyCOPidq160RERJ06daNr12MSrggor4T3AAAAAAAAkKUqVapEz57nRL169aNHj7OjSpUqSZcElFMVky4AAAAAAAAAyrP27TtE+/Ydki4DKOfMvAcAAAAAAACAhAnvAQAAAAAAACBhwnsAAAAAAAAASNgmsef922+/Heeee26hju3cuXPcf//9pVwRAAAAAAAAAPw/m8TM+2+++SbpEgAAAAAAAAAgo01i5v24ceMiIqJ+/fpx3333FXhsjRo1yqIkAAAAAAAAAFhrkwjvv/7664iI2HHHHWOHHXZIuBoAAAAAAAAA+K2Nftn8xYsXx+TJkyPil/AeAAAAAAAAADY0G314/80330QqlYqIMOseAAAAAAAAgA3SRh/er9nvPiJip512SrASAAAAAAAAAEhvo9/zfs1+9zVr1ozVq1fHTTfdFO+9915MmTIlKlasGC1atIgDDjggTj/99KhVq1bC1QIAAAAAAACwKdrow/s1M+/z8vKia9eukZeXt7ZvxYoVMW7cuBg3blw88sgjceedd0bHjh2TKhUAAAAAAACATdRGHd6vXLkyJk2aFBERy5cvj5o1a0bPnj1j9913j1q1asX3338fzz77bHz00Ucxb968OOuss+Kxxx6LnXfeOeHKAQAAAAAAANiUbNTh/YQJE9bOtG/ZsmXcf//90bRp07X9bdq0iW7dusW///3vuPfee2PFihVx+eWXx4gRIyI3Nzfr61asmP25AAAAAAAAAGXh17lmxYq5cs6E5aRSqVTSRZSW1atXx7Rp02Lq1KnRvHnz3wT3v5ZKpeKkk06Kzz77LCIi7r777th///2zumYqlYqcnJysawYAAAAAAAAoC8uXL49u3bpFRMTQoUOjatWqyRa0iduoZ95XqFAhmjdvHs2bNy/wuJycnPjjH/+4Nrz/4IMPsg7v8/NTsXDh0qzOBQAAAAAAACgry5cvX/vxvHlLomrV1QlWs3GqVataVKhQuBUNNurwvih22GGHtR9PmzatWGOtWpVf3HIAAAAAAAAAStWvc81Vq/LlnAmzacH/79dLQKxcuTLBSgAAAAAAAADY1GzUM+/HjRsXU6dOjZ9//jm6desW1apVy3jszz//vPbj+vXrl0V5AAAAAAAAABARG3l4P3DgwHjxxRcjIqJly5bRqVOnjMd+8sknaz/eZZddSr02AAAAAAAAAFhjo142f4899lj78dChQzMet2zZsnjiiSciIqJSpUpx8MEHl3ZpAAAAAAAAALDWRh3eH3744VG7du2IiBg2bFi8/vrr6xyTl5cXV1xxRUybNi0iIk4++eRo0KBBWZYJAAAAAAAAwCZuo142v2bNmnHdddfFX/7yl8jPz48//elPcfzxx8fBBx8cNWrUiPHjx8d
DDz0U48ePj4hflsu/+OKLE64aAAAAAAAAgE3NRh3eR/wy+37lypXxt7/9be3y+GuWyP+1zp07x2233RbVqlVLoEoAAAAAAAAANmUbfXgfEdGtW7fYfffd49FHH4333nsvJk+eHCtXroz69evHLrvsEkcffXQccMABSZcJAAAAAAAAwCYqJ5VKpZIuYmOyenV+zJ27JOkyAAAAAAAAAAq0fPnyOOec0yIiYuDAh6Nq1aoJV7TxqVt3s6hQIbdQxxbuKAAAAAAAAACg1AjvAQAAAAAAACBhwnsAAAAAAAAASJjwHgAAAAAAAAASJrwHAAAAAAAAgIQJ7wEAAAAAAAAgYcJ7AAAAAAAAAEiY8B4AAAAAAAAAEia8BwAAAAAAAICECe8BAAAAAAAAIGHCewAAAAAAAABImPAeAAAAAAAAABImvAcAAAAAAACAhAnvAQAAAAAAACBhwnsAAAAAAAAASJjwHgAAAAAAAAASJrwHAAAAAAAAgIQJ7wEAAAAAAAAgYcJ7AAAAAAAAAEiY8B4AAAAAAAAAEia8BwAAAAAAAICECe8BAAAAAAAAIGHCewAAAAAAAABImPAeAAAAAAAAABImvAcAAAAAAACAhAnvAQAAAAAAACBhwnsAAAAAAAAASJjwHgAAAAAAAAASJrwHAAAAAAAAgIQJ7wEAAAAAAAAgYcJ7AAAAAAAAAEiY8B4AAAAAAAAAEia8BwAAAAAAAICECe8BAAAAAAAAIGHCewAAAAAAAABImPAeAAAAAAAAABImvAcAAAAAAACAhAnvAQAAAAAAACBhwnsAAAAAAAAASJjwHgAAAAAAAAASJrwHAAAAAAAAgIQJ7wEAAAAAAAAgYcJ7AAAAAAAAAEiY8B4AAAAAAAAAEia8BwAAAAAAAICECe8BAAAAAAAAIGHCewAAAAAAACiGTz/9OPr0uSA+/fTjpEsByjHhPQAAAAAAAGRpxYoVMWjQwPj55zkxaNDAWLFiRdIlAeWU8B4AAAAAAACyNHz4kJg/f15ERMyfPy9GjBiScEVAeSW8BwAAAAAAgCzMnDkjRowYGqlUKiIiUqlUDB8+NGbOnJFwZUB5JLwHAAAAAACAIkqlUjF48P1rg/v1tQOsj/AeAAAAAAAAimj69GkxduznkZ+f/5v2/Pz8GDv285g+fVpClQHllfAeAAAAAAAAiqhx4ybRunWbyM39bdyWm5sbrVu3jcaNmyRUGVBeCe8BAAAAAACgiHJycqJHj7MiJyenUO0A6yO8BwAAAAAAgCw0bNgounbttjaoz8nJiSOP7BYNG26ZcGVAeSS8BwAAAAAAgCwdeeQxUbt2nYiIqFOnbnTtekzCFQHllfAeAAAAAAAAslSlSpXo2fOcqFevfvTocXZUqVIl6ZKAcqpi0gUAAAAAAABAeda+fYdo375D0mUA5ZyZ9wAAAAAAAACQMOE9AAAAAAAAACRMeA8AAAAAAAAACRPeAwAAAAAAAEDChPcAAAAAAAAAkDDhPQAAAAAAAAAkTHgPAAAAAAAAAAkT3gMAAAAAAABAwoT3AAAAAAAAAJAw4T0AAAAAAAAAJEx4DwAAAAAAAAAJE94DAAAAAAAAQMKE9wAAAAAAAACQMOE9AAAAAAAAACRMeA8AAAAAAAAACRPeAwAAAAAAAEDChPcAAAAAAAAAkDDhPQAAAAAAAAAkTHgPAAAAAAAAAAkT3gMAAAAAAABAwoT3AAAAAAAAAJAw4T0AAAAAAAAUw6effhx9+lwQn376cdKlAOWY8B4AAAAAAACytGLFihg0aGD8/POcGDRoYKxYsSLpkoBySngPAAAAAAAAWRo+fEjMnz8vIiLmz58XI0YMSbgioLwS3gMAAAAAAEAWZs6cESNGDI1UKhUREalUKoYPHxozZ85IuDKgPBLeAwAAAAAAQBGlUqkYPPj+tcH9+toB1qdi0gUAAAAAAABAUaVSqUT3l58xY1qMHfv5Ou35+fkxduzn8cMP30WjRk0SqCyiSpU
qkZOTk8i1gewJ7wEAAAAAAChXUqlU9O17TUyY8G3SpWR07bVXJnbt7bZrFVdf3VeAD+WMZfMBAAAAAAAod+TSwMbGzHsAAAAAAADKlZycnLj66r6JLpu/xrPPPhEvv/xCRPxS1xFHHB1HH9090Zosmw/lk/AeAAAAAACAcicnJyeqVq2adBnRtesxa8P72rXrRLdux0WVKlUSrgoojyybDwAAAAAAAFn6dVB/yik9BPdA1oT3AAAAAAAAUALatGmfdAlAOSa8BwAAAAAAAICECe8BAAAAAAAAIGHCewAAAAAAAABImPAeAAAAAAAAABImvAcAAAAAAACAhAnvAQAAAAAAACBhwnsAAAAAAAAASJjwHgAAAAAAAAASJrwHAAAAAAAAgIQJ7wEAAAAAAAAgYcJ7AAAAAAAAAEiY8B4AAAAAAAAAEia8BwAAAAAAAICECe8BAAAAAAAAIGHCewAAAAAAAABImPAeAAAAAAAAABImvAcAAAAAAACAhAnvAQAAAAAAACBhwnsAAAAAAAAASJjwHgAAAAAAAAASJrwHAAAAAAAAgIQJ7wEAAAAAAAAgYcJ7AAAAAAAAAEiY8B4AAAAAAAAAEia8BwAAAAAAAICECe8BAAAAAAAAIGHCewAAAAAAAABImPAeAAAAAAAAABImvAcAAAAAAACAhAnvAQAAAAAAACBhwnsAAAAAAAAASJjwHgAAAAAAAAASJrwHAAAAAAAAgIQJ7wEAAAAAAAAgYcJ7AAAAAAAAAEiY8B4AAAAAAAAAEia8BwAAAAAAAICECe8BAAAAAAAAIGHCewAAAAAAAABImPAeAAAAAAAAABImvAcAAAAAAACAhAnvAQAAAAAAACBhwnsAAAAAAAAASJjwHgAAAAAAAAASJrwHAAAAAAAAgIQJ7wEAAAAAAAAgYcJ7AAAAAAAAAEiY8B4AAAAAAAAAEia8BwAAAAAAAICECe8BAAAAAAAAIGHCewAAAAAAAABImPAeAAAAAAAAABImvAcAAAAAAACAhAnvAQAAAAAAACBhwnsAAAAAAAAASJjwHgAAAAAAAAASJrwHAAAAAAAAgIQJ7wEAAAAAAAAgYcJ7AAAAAAAAAEiY8B4AAAAAAAAAEia8BwAAAAAAAICECe8BAAAAAAAAIGHCewAAAAAAAABImPAeAAAAAAAAABImvAcAAAAAAACAhAnvAQAAAAAAACBhwnsAAAAAAAAASJjwHgAAAAAAAAASJrwHAAAAAAAAgIQJ7wEAAAAAAAAgYcJ7AAAAAAAAAEiY8B4AAAAAAAAAEia8BwAAAAAAAICECe8BAAAAAAAAIGHCewAAAAAAAABImPAeAAAAAAAAABImvAcAAAAAAACAhAnvAQAAAAAAACBhm3R4v3Dhwth7772jVatWcemllyZdDgAAAAAAAACbqE06vO/bt2/MmjUr6TIAAAAAAAAA2MRtsuH966+/HsOGDUu6DAAAAAAAAADYNMP7uXPnxnXXXZd0GQAAAAAAAAAQEZtoeP/3v/895syZE3Xr1k26FAAAAAAAAADY9ML7F198MV5++eXIzc2Nq6++OulyAAAAAAAAAGDTCu/nzJkT119/fURE9OzZM9q0aZNwRQAAAAAAAACwiYX31113XcybNy+22mqr6NOnT9LlAAAAAAAAAEBEbELh/dChQ+P111+P3NzcuOmmm6JKlSpJlwQAAAAAAAAAEbGJhPczZ86MG2+8MSJ+WS6/Xbt2CVcEAAAAAAAAAP9PxaQLKAtXX311LFy4MFq2bBl//vOfS/16FStuEs9EAAAAAAAAbPJ+nQtVrJgrJ6Jc8fO7Ydnow/unn3463nnnnbXL5VetWrVUr5ebmxN16mxWqtcAAAAAAABgw7B8eYW1H9eps1mpZ1FQkvz8blg26vB++vTp0a9fv4iI6NGjR7Rv377Ur5mfn4qFC5eW+nUAAAAAAABI3vLly9d+PG/ekqhadXWC1UDR+PktfbVqVYsKFQq3osFGG96
nUqn4v//7v1i8eHG0bNky+vTpU2bXXrUqv8yuBQAAAAAAQHJ+nQutWpUvJ6Jc8fO7Ydlow/snn3wyPvjgg4iIOP300+P7779f55hZs2at/XjhwoXx9ddfR0RE/fr1o0GDBmVTKAAAAAAAAACbvI02vB8zZszaj6+//vr1Hv/222/H22+/HRERvXr1it69e5dWaQAAAAAAAADwG4VbXB8AAAAAAAAAKDUb7cz7fv36Rb9+/Qo8ZurUqXHAAQdERMSRRx4Z//rXv8qiNAAAAAAAAAD4DTPvAQAAAAAAACBhwnsAAAAAAAAASJjwHgAAAAAAAAASJrwHAAAAAAAAgIRVTLqAJDVt2jS+/fbbpMsAAAAAAAAAYBNn5j0AAAAAAAAAJEx4DwAAAAAAAAAJE94DAAAAAAAAQMKE9wAAAAAAAACQMOE9AAAAAAAAACRMeA8AAAAAAAAACRPeAwAAAAAAAEDChPcAAAAAAAAAkDDhPQAAAAAAAAAkTHgPAAAAAAAAAAkT3gMAAAAAAABAwoT3AAAAAAAAAJAw4T0AAAAAAJC4Tz/9OPr0uSA+/fTjpEsBgEQI7wEAAAAAgEStWLEiBg0aGD//PCcGDRoYK1asSLokAChzwnsAAAAAACBRw4cPifnz50VExPz582LEiCEJVwQAZU94DwAAAAAAJGbmzBkxYsTQSKVSERGRSqVi+PChMXPmjIQrA4CyJbwHAAAAAAASkUqlYvDg+9cG9+trB4CNmfAeAAAAAABIxPTp02Ls2M8jPz//N+35+fkxduznMX36tIQqA4CyJ7wHAAAAAAAS0bhxk2jduk3k5v42rsjNzY3WrdtG48ZNEqoMAMqe8B4AAAAAAEhETk5O9OhxVuTk5BSqHQA2ZsJ7AAAAAAAgMQ0bNoquXbutDepzcv4/9v4/TM66vhf/X/dscKIsmF+62Vm//akBGya/mqrVIKRS9cguJlKOqNWlTdBSIwavg1qF9lhs1WoFMR6tITarothYErMbT7VoQDnHqjG4WaLyo7Uf291lJcnmhAUZyM79/QOyss0ku5udzD2bfTyuK5cz79fszBNzJ5md597vO4m2tlXR1DQ/42QAUFvKewAAAAAAIFNtbatj1qzZERExe/acaG1dnXEiAKg95T0AAAAAAJCpfD4fl112ecydOy/a29dGPp/POhIA1NyMrAMAAAAAAAAsW7Y8li1bnnUMAMiMM+8BAAAAAAAAIGPOvAcAAAAAAABOaWmaRqlUyjpG3SmVHq14m9Hy+XwkSXLSX0d5DwAAAAAAAJzSSqVSXH75G7OOUdfWrbs86wh1a+PGz8XMmTNP+uvYNh8AAAAAAAAAMubMewAAAAAAAGDa+PjKd0a+4WlZx6gbaZpGRNRkW/ippDT8WLxt59/U9DWV9wAAAAAAAMC0kW94WuRnKO+pP7bNBwAAAAAAAICMKe8BAAAAAAAAIGPKewAAAAAAAADImPIeAAAAAAAAADKmvAcAAAAAAACAjCnvAQAAAAAAACBjynsAAAAAAAAAyJjyHgAAAAAAAAAyprwHAAAAAAAAgIwp7wEAAAAAAAAgY8p7AAAAAAAAAMiY8h4AAAAAAAAAMqa8BwAAAAAAAICMKe8BAAAAAAAAIGPKewAAAAAAAADImPIeAAAAAAAAADKmvAcAAAAAAACAjCnvAQAAAAAAACBjynsAAAAAAAAAyJjyHgAAAAAAAAAyprwHAAAAAAAAgIwp7wEAAAAAgMzt3r0r1q+/Inbv3pV1FADIhPIeAAAAAADIVKlUis2bN8b+/fti8+aNUSqVso4EADWnvAcAAAAAADLV2bk1BgcPRETE4OCB6OramnEiAKg95T0AAAAAAJCZgYH+6OwcXdZv3741Bgb6M0oEANlQ3gMAAAAAAJlI0zQ6OjZFuVwetV4ul6OjY1OkaZpRMgCoPeU9AAAAAACQib6+3ujp6a446+npjr6+3honAoDsKO8BAAAAAIBMNDcXorGxseKssfG
MaG4u1DgRAGRHeQ8AAAAAAGSiv78vhoaGKs6Ghh6K/v6+GicCgOwo7wEAAAAAgEwUCi1RLC6OJElGrSdJEsXikigUWjJKBgC1p7wHAAAAADhF7N69K9avvyJ2796VdRQYlyRJor19TSTJ6Loil8s9uZ4c4ysB4NSjvAcAAAAAOAWUSqXYvHlj7N+/LzZv3hilUinrSDAuTU3N0da2atRaW9uqaGqan00gAMiI8h4AAAAA4BTQ2bk1Dh4cjIiIgwcHo6tra8aJYPza2lbH7NlzIiJizpy50dq6OuNEAFB7ynsAAAAAgCluYKA/urq2RZqmERGRpml0dm6LgYH+jJPB+OTz+bjssstj7tx50d6+NvL5fNaRAKDmlPcAAAAAAFNYmqbR0bFppLgfax3q1bJly+OGGz4Zy5YtzzoKAGRCeQ8AAAAAMIX19fVGT093lMvlUevlcjl6erqjr683o2QAAEyE8h4AAAAAYAorFFqiWFwcudzoj3tzuVwUi0uiUGjJKBkAABOhvAcAAAAAmMKSJIn29jUVZ+3tayJJkhonAgDgRCjvAQAAAACmuKam5njucxeMWnve8xZEU9P8jBIBADBRynsAAAAAgCluYKA/7r//vlFr999/XwwM9GeUCACAiVLeAwAAAABMYWmaRkfHpohIK66naVr5CwEAqCvKewAAAACAKayvrzd6erqjXC6PWi+Xy9HT0x19fb0ZJQMAYCKU9wAAAAAAU1ih0BLF4uLI5UZ/3JvL5aJYXBKFQktGyQAAmAjlPQAAAADAFJYkSbS3r4kkSca1DgBAfVLeAwAAAABMcU1NzdHaumqkqE+SJNraVkVT0/yMkwEAMF7KewAAAACAU0Bb2+qYNWt2RETMnj0nWltXZ5wIAICJUN4DAAAAAJwC8vl8XHbZ5TF37rxob18b+Xw+60gAAEzAjKwDAAAAAABQHcuWLY9ly5ZnHQMAgBPgzHsAAAAAAAAAyJjyHgAAAAAAAAAyprwHAAAAAAAAgIwp7wEAAAAAAAAgY8p7AAAAAAAAAMiY8h4AAAAAAAAAMqa8BwAAAAAAAICMKe8BAAAAAAAAIGPKewAAAAAAAADImPIeAAAAAAAAADKmvAcAAAAAAACAjCnvAQAAAAAAACBjynsAAAAAAAAAyJjyHgAAAAAAAAAyprwHAAAAAAAAgIwp7wEAAAAAAAAgY8p7AAAAAAAAAMiY8h4AAAAAAMjc7t27Yv36K2L37l1ZRwGATCjvAQAAAACATJVKpdi8eWPs378vNm/eGKVSKetIAFBzynsAAAAAACBTnZ1b4+DBwYiIOHhwMLq6tmacCABqT3kPAAAAAABkZmCgP7q6tkWaphERkaZpdHZui4GB/oyTAUBtKe8BAAAAAE4RW7bcEu3tr40tW27JOgqMS5qm0dGxaaS4H2sdAE5lynsAAAAAgFPAoUOHorPz1iiXy9HZeWscOnQo60gwpr6+3ujp6Y5yuTxqvVwuR09Pd/T19WaUDABqT3kPAAAAAHAK+NjHPjxq2/Ebb/xwxolgbIVCSxSLiyvOisUlUSi01DgRAGRHeQ8AAAAAMMXt3bsn7r33J6PW7rnnJ7F3756MEsH4JEkSF154UcXZhRdeFEmS1DgRAGRHeQ8AAAAAMIWVy+XYsOH6irMNG64/ajtyqCdpmsaOHduPKumTJIkdO77imvcATCvKewAAAACAKay7+64YGhqqOBsaGoru7rtqnAjG78g17/9rSZ+mqWveAzDtKO8BAAAAAKawxYuXRmNjY8VZY+MZsXjx0hongvE7cs37XG50XZHL5VzzHoBpR3kPAAAAADCF5XK5WLfuqoqzdeuuOqoUhXqSJEm0t6+puG1+pXUAOJV51wYAAAAAMMUtXLgoFiw4e9TaWWedHQsXFjNKBOPX1NQcra2rRor6JEmirW1VNDXNzzgZANSW8h4AAAAA4BTw9rdf/ZTyMxdXXnl1xolg/NraVscznnF6REScfnpjtLauzjg
RANSe8h4AAAAA4BRw5plnRlvbayKXy0Vb2+o488wzs44EE/LLHfLTLGMAQGZmZB0AAAAAAIDquOSSS+OSSy7NOgZMWGfn1nj44YcjIuLhhx+Orq6tcfHFjmUAphdn3gMAAAAAAJkZGOiPrq5tkaZPnHGfpml0dm6LgYH+jJMBQG0p7wEAAAAAgEykaRodHZtGivux1gHgVKa8BwAAAAAAMtHX1xs9Pd1RLpdHrZfL5ejp6Y6+vt6MkgFA7SnvAQAAAACATBQKLVEsLo5cbnRdkcvlolhcEoVCS0bJAKD2lPcAAAAAAEAmkiSJ9vY1kSTJuNYB4FSmvAcAAAAAADLT1NQcra2rRor6JEmirW1VNDXNzzgZANSW8h4AAAAAAMhUW9vqmDVrdkREzJ49J1pbV2ecCABqT3kPAAAAAABkKp/Px7x5z4qIiLlz50U+n884EQDUnvIeAAAAAADI1L59D8Z9990TERH33XdP7Nv3YMaJAKD2lPcAAAAAAECmrrvu2lH33//+a4/xSAA4dSnvAQAAAACAzNx55+1x4MD+UWv79++PO++8PZtAAJAR5T0AAAAAAJCJ4eHhuOmmT1Wc3XTTp2J4eLjGiQAgO8p7AAAAAAAgEzt33nbMgn54eDh27rytxokAIDvKewAAAAAAIBMrV14QDQ0NFWcNDQ2xcuUFNU4EANlR3gMAAAAAAJloaGiISy65tOLskkted8xiHwBORcp7AAAAAAAgE2maxt69d1ec7d3bE2ma1jgRAGRHeQ8AAAAAAGSir683enq6K856erqjr6+3xokAIDvKewAAAAAAIBOFQksUi4sjlxtdV+RyuSgWl0Sh0JJRMgCoPeU9AAAAAACQiSRJor19TSRJMq51ADiVKe8BAAAAAIDMNDU1R2vrqpGiPkmSaGtbFU1N8zNOBgC1pbwHAAAAAAAy1da2OmbNmh0REbNnz4nW1tUZJwKA2lPeAwAAAAAAmcrn83HZZZfH3Lnzor19beTz+awjAUDNzcg6AAAAAAAAwLJly2PZsuVZxwCAzDjzHgAAAAAAAAAyprwHAAAAAAAAgIwp7wEAAAAAAAAgY8p7AAAAAAAgcx//+EfjjW+8JD7+8Y9mHQUAMqG8BwAAAAAAMrVv34Pxve99JyIivve978S+fQ9mnAgAai+T8r5cLseBAweiv78/i5cHAAAAAADqyHXXXTvq/vvff+0xHgkAp64ZtXiRQ4cOxa233hrf/e53o7u7OwYHByMiIkmS+NGPfjTyuDe+8Y1xzjnnxB/90R/Fs5/97FpEAwAAAAAAMnTnnbfHgQP7R63t378/7rzz9lix4vxsQgFABk5qeX/48OG44YYb4uabb45HH300IiLSND3m43t6emLXrl1xyy23xLve9a649NJLT2Y8AAAAAAAgQ8PDw3HTTZ+qOLvppk/F7/7uudHQ0FDjVACQjZO2bf6DDz4Yr3vd62LTpk3xi1/8YtQsSZKjHn/gwIGRgv8Xv/hFvO9974vrr7/+ZMUDAAAAAAAytnPnbTE8PFxxNjw8HDt33lbjRACQnZNS3v/iF7+It7zlLdHT0zOylqZpPP3pT49isVjx7Pv/9//+X8yYMSPSNI0kSSJN0/j0pz8d//t//++TEREAAAAAAMjYypUXHPPM+oaGhli58oIaJwKA7JyU8v7DH/5w/OhHPxop4ZcsWRKbNm2K73//+7Fly5aKX/Prv/7r8c1vfjNaW1tHFfh//dd/HY899tjJiAkAAAAAAGSooaEh1q79k4qztWuvsGU+ANNK1cv7n/3sZ/EP//API1vjX3jhhfGFL3whXvKSl4z5j+yzn/3s+MhHPhLveMc7Rs7O37dvX3z961+vdkwAAAAAAKAOrFhxfsyZM3fU2ty5c2PFivMySgQA2ah6ed/V1RWHDx+OiIhf+7Vfiw996EORy03sZd785jfH0qVLR+7ffvvt1YwIAAAAAADUkWuvvW7U/Wuuue4YjwSAU1fVy/s777x
z5PbatWtjxowZJ/Q8a9asGbm9d+/eSecCAAAAAADq07x5z4oXvOB3IyLiBS/43Zg371kZJwKA2juxZv04ent7R26/5CUvOeHnef7znx8REWmaxr59+yadCwAAAAAAqF9ve9s7so4AAJmq+pn3Bw4cGLk9b968E36eZz/72SO3f/GLX0wqEwAAAAAAAADUs6qX9894xjNGbg8NDZ3w8xw6dCgiIpIkiTPOOGPSuQAAAAAAAACgXlW9vH/q2fY9PT0n/Dw/+MEPKj4nAAAAAAAAAJxqql7eL1myZOT2P/zDP5zw83zhC18Yub148eLJRAIAAAAAAACAulb18v7888+PiIg0TeO2226Lr371qxN+ji9+8YvxL//yLyP3V6xYUa14AAAAAAAAAFB3ql7ev+xlL4vf+I3fiCRJIk3TeOc73xlf/OIXx/W15XI5Pv3pT8f73//+SJIkIiJaWlri93//96sdEwAAAAAAAADqxoxqP2Eul4s/+7M/i7e85S0REXH48OH4y7/8y7j55pvjwgsvjLPPPnvU4w8cOBA/+9nP4jvf+U5s3bo1/uM//iPSNI2IiCRJ4h3veEc0NDRUOyYAAAAAAAAA1I2ql/cREeeee25cffXV8aEPfWjkDPz7778/brzxxlGPS9M0XvKSl4y6HxEjX3PZZZfFq171qpMREQAAAAAAAADqRtW3zT/ij/7oj+L9739/5PP5iIiRbfDTNI0kSUZ+pWk6qrQ/8r/r1q2Ld73rXScrHgAAAAAAUEd2794V69dfEbt378o6CgBk4qSV9xERf/AHfxBf+cpX4uKLL47TTjttpKQ/UtgfuX9kLSLi/PPPjy984Quxbt26kxkNAAAAAACoE6VSKTZv3hj79++LzZs3RqlUyjoSANTcSdk2/6l+9Vd/Nf7qr/4q3v3ud8cPfvCD2L17dzzwwANx6NChePzxx+OZz3xmzJkzJxYtWhQveMELYv78+Sc7EgAAAAAAUEc6O7fGwYODERFx8OBgdHVtjYsvvjTjVABQWye9vD/ijDPOiPPPPz/OP//8Wr0kAAAAAMC0snv3rvjsZzfFm960JpYtW551HBiXgYH+6OraNmr33s7ObbFixXnR1NSccToAqJ2Tum3+RNxzzz3x6KOPZh0DAAAAAGBKsu04U1GaptHRsWnUZXaPtw4Ap7KTXt7fdddd8e53vzuuvPLK4z7uyiuvjOXLl8e6deviBz/4wcmOBQAAAJxEu3fvivXrr4jdu3dlHQVg2qi07TjUu76+3ujp6Y5yuTxqvVwuR09Pd/T19WaUDABq76SV94cOHYorr7wyXv/618dXvvKV2LXr+N+s9/f3x/DwcHzjG9+IP/zDP4w///M/j8OHD5+seAAAAMBJ4sxPgNo71rbjAwP9GSeD4ysUWqJYXBxJkoxaT5IkisUlUSi0ZJQMAGrvpJT3Dz30UFx22WXxz//8z5GmaaRpGoODgzE0NFTx8T//+c/jscceG7mfpmls2bIl1q1bd9RP2wEAAAD1zZmfALVl23GmsiRJor19TcVZe/uao0p9ADiVnZTy/rrrrosf/ehHEREj/7CeffbZ8fDDD1d8/DOe8Yx473vfG+eee27kcrlIkiTSNI077rgjPvGJT5yMiAAAAMBJ4MxPgNqz7TinLj94AsD0UvXy/p577ont27ePlPYtLS3x2c9+NrZu3RpNTU0Vv6axsTHe+MY3xqc//en4x3/8x/j1X//1kQL/05/+dPz85z+vdkwAAACgypz5CZCNI9uO53KjP+7N5XK2HafuHXmfUGnbfO8fAJhuql7eb9u2LSKe+Ad37ty58cUvfjFe8IIXjPvrzz777PjsZz8bc+fOjYiIw4cPx5e//OVqxwQAAACqzJmfANk4su14pfLTtuPUO+8fAOCXql7e/+AHP4iIJ94Yvu1tb4tnPetZE36OefPmxeWXXz5y/zvf+U7V8gEAAAAnhzM/AbLT1NQcra2rRor6JEmirW1VNDXNzzg
ZHJ/3DwDwS1Uv73/2s5+N3D7//PNP+Hl+7/d+b+T2v/3bv00mEgAAAFADzvwEyFZb2+qYNWt2RETMnj0nWltXZ5wIxub9AwD8UtXL+6GhoZHb8+bNO+HnaW5ujogntt8/dOjQpHMNDAzERz7ykWhra4ulS5fG0qVL45WvfGW8//3vj3vuuWfSzw8AAAA48xMgS/l8Pi677PKYO3detLevjXw+n3UkGBfvHwDgCTOq/YT5fD4OHz4cEREHDx4cuXb9RD388MMjt5/xjGdMKtNtt90W73rXu0b9YEFExE9/+tP46U9/Gl/84hfjT/7kT+Jtb3vbpF4HAAAAeOLMz299a2cMDh5w5idAjS1btjyWLVuedQyYMO8fAOAknHlfKBRGbu/evfuEn+fuu++OiCd+wu5EfwAgIuKuu+6K9evXx9DQUDQ0NMTrXve6+OQnPxmf+9zn4s/+7M9i3rx5cfjw4diwYUN85jOfOeHXAQAAAJ7gzE8AYKK8fwCAk3DmfbFYjPvuuy8iIm6++eb4/d///RN6ni9+8YsjtxcvXnzCef7yL/8yHn/88YiIuPHGG+OCCy4Ymb3gBS+Itra2ePWrXx0PPvhgfPzjH4+LL744nvnMZ57w6wEAAADO/AQAJs77BwCmu6qfef+KV7xi5PZ3v/vd+Pu///sJP8eXv/zluO2220bun3/++SeU5e67744f/ehHI7meWtwfMXfu3FizZk1ERDzyyCNx++23n9BrAQAAAAAAAMCJqnp5/9KXvjR+9Vd/NSIi0jSNv/mbv4nrrrsuDh06NObXPvzww/G3f/u38Rd/8ReRJEkkSRK/9mu/dsJn7z/22GNxwQUXxK/8yq8c9zl+4zd+Y+R2f3//Cb0WAAAAAAAAAJyoqm+bnyRJXHvttbF27dpIkiTSNI0vfOEL8eUvfznOPffcWLRoUTznOc+JxsbGiHiisP/P//zPuPvuu+Nb3/pWPProo5GmaURENDQ0xHve857I5U7sZwyWLVsWy5YtG/Nxvb29I7ef/exnn9BrAQAAAABkbcuWW6Kra2u0tq6OSy65NOs4MCEf//hH43vf+0684AW/G2972zuyjgMANVf18j4iYsWKFfGOd7wjPvrRj0aSJBERUSqV4hvf+EZ84xvfOObXpWk6csZ9mqbxzne+M84999yTEXHEgQMH4jOf+UxERDzjGc+IlStXntTXAwAAAAA4GQ4dOhSdnbdGmqbR2XlrvOIVr4ozzzwz61gwLvv2PRjf+953IiLie9/7Tuzb92DMm/esjFMBQG1Vfdv8I9785jfHhz70oTjjjDNGSvkj0jQd9euII6X96aefHjfccEO0t7eflGylUin+9V//NT796U/HRRddFP/xH/8xsmPA7NmzT8prAgAAAACcTB/72IdHPm9N0zRuvPHDGSeC8bvuumtH3X//+689xiMB4NR1Us68P+LVr351nH/++fH5z38+tm/fHv/f//f/VXzckTeUhUIhLr744nj9619/0kr0np6e+IM/+INRa/Pnz4//+T//p7PuAQAAAIApae/ePXHvvT8ZtXbPPT+JvXv3xMKFizJKBeNz5523x4ED+0et7d+/P+688/ZYseL8bEIBQAZOankfEfHMZz4z3vrWt8Zb3/rWeOCBB+LHP/5x9PX1xdDQUCRJEmeeeWbMnTs3zjnnnGhubj7ZcaKvr++otQcffDC+9KUvxbOe9aw455xzJv0aM2actA0NAAAAAABGKZfLsWHDDRVnGzbcEH/3d5+JXM5nltSn4eHhuOmmT1Wc3XTTp+Lcc8+LhoaGGqeCiXlqLzRjRk5PVKf8vjAZtfqzfdLL+6eaP39+zJ8/v5YveZRf+7Vfi7/7u7+LOXPmxM9//vPYsWNHfPWrX42dO3fGv/zLv8THP/7xOPfcc0/4+XO5JGbPPr2KiQEAAAAAju273/1uDA09VHE2NPRQ3H//j+KFL3xhjVPB+HR1dcXw8HDF2fDwcPzLv3wrWltba5w
KJubRR3/5AyazZ58eM2fOzDANx/LU3yeYqFr92a5peV8PzjrrrDjrrLNG7l9wwQWxYsWKeM973hO/+MUv4n/8j/8R3/jGN6KxsfGEnr9cTuPQoUeqFRcAAAAA4Lie+9zfisbGMyoW+GeccUY897m/FYODD2eQDMb2ohe9ND75yU9WLPAbGhriRS96qeOXuvfoo4+O3B4cfDhmzqz8Aylk66m/TzBRk/mzfeaZT4+GhvGdtT/tyvtKLr744rjjjjvia1/7Whw8eDC+9rWvxcUXX3zCz3f4cLmK6QAAAAAAjm/duvXxwQ9ed9T6W996VZTLT2ytD/UpibVr/yT+7u8+cdRk7dorIk0Tn7lT9556jB4+XHbM1qmn/r6Uhh/LMAlTxVOPk1r92VbeP+nlL395fO1rX4uIiB//+McZpwEAAAAAGL+FCxfFggVnx733/mRk7ayzzo6FC4sZpoLxWbHi/Niy5ZY4cGD/yNrcuXNjxYrzMkwFnMretvNvso4AFU24vH/+858/cjtJkvjRj350zHm1VHqd8XjooYfiZz/7Wfznf/5nvPzlL48kSY752FmzZo3cfvzxx08kJgAAAABAZt7+9qvjrW9dM3L/yiuvzjANTMy1114XV131pyP3r7nm6J0kAOBUN+HyPk3TSJIk0jQ95rxe/OVf/mVs3749IiK2bdt23B8s+NnPfjZye/78+Sc9GwAAAAAA8ISnPS1/3PsA1fTxle+MfMPTso5BnSsNP1bzXRpOyrb5xyv3a+l3fud3Rsr7L3/5y3HttddWfFy5XI4vf/nLI/dXrFhRk3wAAAAAANXysY99eNT9G2/8sLOXmTIcv0At5RueFvkZynvqz4TL+9WrVx93vmrVquNuT19Lr3rVq+KjH/1oDA4Oxpe+9KW44IIL4nd/93dHPSZN0/jrv/7r2Lt3b0REvOQlL4li0XWgAAAAAICpY+/ePaOudx8Rcc89P4m9e/fEwoWLMkoF4+P4BYAnTLi8/8AHPnDc+Qc/+METDlNtjY2N8b73vS/Wr18fjz/+ePzxH/9xXHLJJXHeeefFvHnz4qc//Wnccsstcdddd0XEE9vl//Vf/3XGqQEAAAAAxq9cLseGDddXnG3YcH184hObIpfL1TgVjI/jFwB+qerb5j/22GMREfG0p9XHVhOveMUr4sMf/nBce+218cgjj8SXvvSl+NKXvnTU484555z42Mc+5nr3AAAAAMCU0t19VwwNDVWcDQ0NRXf3XbF06W/XOBWMj+MXAH6p6j+utn379njRi14UV111VXz1q1+Nhx9+uNovMWGtra3xT//0T/GWt7wlnv/858fpp58ep512Wjz72c+OCy64IK6//vrYsmVLPOc5z8k6KgAAAADAhCxevDQaGxsrzhobz4jFi5fWOBGMn+MXAH6p6mfef/3rX49HHnkk/umf/in+6Z/+KV7/+tfHtddeW+2XmbCmpqZ4xzveEe94xzuyjgIAAAAAUDW5XC7WrbsqPvjB646arVt3lS3HqWuOXwD4par/q3f//fdHkiSRpmlERLzmNa+p9ksAAAAAAPAUCxcuijlz5o5amzt3bixcWMwoEYzfwoWLYsGCs0etnXXW2Y5fAKadqpf3+/fvH3X/N3/zN6v9EgAAAAAAPMXAQH8MDh4YtXbgwIEYGOjPKBFMzNvffnUkSRIREUmSiyuvvDrjRABQe1Uv7wuFwqj7g4OD1X4JAAAAAACelKZpdHRsGik+j0iSJDo6No3skgr17Mwzz4z585/oF+bPb44zzzwz40QAUHtVL+9f+9rXRpqmI28U//Ef/7HaLwEAAAAAwJP6+nqjp6c7yuXyqPVyuRw9Pd3R19ebUTIYv337Hoz+/ieO1f7+3ti378GMEwFA7VW9vH/Tm94Ura2tkaZppGkan/rUp2Lr1q3VfhkAAAAAACKiUGiJYnFxxTPvi8UlUSi0ZJQMxu+6664ddf/977/2GI8EgFPXjGo/YS6Xi4985CPx4he
/OD7ykY/EgQMH4j3veU985jOfiXPPPTeWLl0aTU1N8cxnPjNOO+20cT/vf92OHwAAAACAJ0r6Cy+8KHp6uketp2kaF17YdlSpD/XmzjtvjwMH9o9a279/f9x55+2xYsX52YQCgAxUvbx/wxveMHJ73rx5ceDAgUjTNO677764//774+///u8n/JxJksSPfvSjasYEAAAAADglpGkaO3ZsrzjbsWN7/NZvFRX41K3h4eG46aZPVZzddNOn4nd/99xoaGiocSoAyEbVy/sf/OAHo94I/tc3hWmaVvslAQAAAACmrSPXvK/kyDXvW1qeU+NUMD47d94Ww8PDFWfDw8Oxc+dtccEFr6hxKgDIRtWveR8RI9e7r/QLAAAAAIDqaW4uRGNjY8VZY+MZ0dzskqTUr/PPf9mk5gBwKqn6mffr1q2r9lMCAAAAAHAM/f19MTQ0VHE2NPRQ9Pf3OfOeujUw8MCYc8cvANOF8h4AAAAAYAorFFqiWFwcd9+9Z9Tup0mSxDnnLI5CoSXDdHB8R47fSpd+KBaXOH4BmFZOyrb5AAAAAADURpIk0d6+JpJk9Me9uVzuyfUko2QwtiPHb0RScd3xC8B0orwHAAAAAJjimpqao61t1ai1trZV0dQ0P5tAMAFNTc3x6le/ZtTaq1/9GscvANPOpLfNHxoaiq997Wvxta99Lf793/89fv7zn8fTnva0mDdvXixdujQuuOCCWLlyZTWyAgAAAABwDC9/+ati+/ZbI03TSJIkfv/3X5V1JBi3trbV8a1v7YzBwQMxZ87caG1dnXUkAKi5SZX327Ztiw996ENx8ODBiIiR6yk9+uijcejQofjpT38at956azzvec+Lv/qrv4pisTjpwAAAAAAAHO3rX//qyGe0aZrGP//zV+Piiy/NOBWMTz6fj8suuzw++9lN8aY3rYl8Pp91JACouRPeNv9v//Zv48/+7M9icHBw5A1hkiSjfkU88Sbx3nvvjde//vXR2dlZndQAAAAAAIwYGOiPrq5to9Y6O7fFwEB/NoHgBCxbtjxuuOGTsWzZ8qyjAEAmTqi837ZtW2zcuHFk+6UkSSJN04q/jswff/zxeO973xt79uyp9n8DAAAAAMC0laZpdHRsGjnJaqx1AADq04S3zX/sscfiYx/7WETESGn/zGc+My6++OJ44QtfGIVCISIi+vr64tvf/nbceuut8cgjj0SSJPHYY4/FBz/4wfjCF75Q3f8KAAAAAIBpqq+vN3p6uo9aL5fL0dPTHX19vdHS8pwMkgEAMBETLu+/9a1vRX9//8i2+C9/+cvjAx/4QJx++umjHve85z0vzjvvvLj88svjiiuuiB//+McREXHXXXfFj3/843j+859fhfgAAAAAANNbodASxeLi2Lu3J8rl8sh6LpeLhQsXRaHQkmE6AADGa8Lb5v+f//N/Rm6fc845ccMNNxxV3D9VU1NTfPKTn4wzzjhjZO3b3/72RF8WAAAAAIAKkiSJ9vY1IydcjbUOAEB9mnB5f/fdd4/cXrt2beRyYz9FU1NTXHzxxSP3XfceAAAAAKB6mpqao7V11UhRnyRJtLWtiqam+RknAwBgvCZc3g8MDIzcLhaL4/66c889d+T2v/7rv070ZQEAAAAAOI62ttUxa9bsiIiYPXtOtLauzjgRTMzu3bti/forYvfuXVlHAYBMTLi8f+ihh0Zuz5kzZ9xft2DBgorPAQAAAADA5OXz+bjssstj7tx50d6+NvL5fNaRYNxKpVJs3rwx9u/fF5s3b4xSqZR1JACouRkT/YLHHnts5PZE3vydeeaZERGRpmkMDQ1N9GUBAAAAABjDsmXLY9my5VnHgAnr7NwaBw8ORkTEwYOD0dW1NS6++NKMUwFAbU34zPvh4eGR20eunzQeT3va00Zu+4k5AAAAAAAgImJgoD+6urZFmqYR8cRJgJ2d22JgoD/jZABQWxMu7wEAAAAAAKohTdPo6Ng0UtyPtQ4ApzLlPQAAAAA
AkIm+vt7o6emOcrk8ar1cLkdPT3f09fVmlAwAak95DwAAAAAAZKJQaIlicXHkcqPrilwuF8XikigUWjJKBgC1p7wHAAAAAAAykSRJtLeviSRJxrUOAKcy5T0AAAAAAJCZpqbmaG1dNVLUJ0kSbW2roqlpfsbJAKC2lPcAAAAAAECm2tpWx6xZsyMiYvbsOdHaujrjRABQezMm88W7du2KNE1r8rW/8zu/c0KvAwAAAAAwXWzZckt0dW2N1tbVcckll2YdB8Ytn8/HZZddHp/97KZ405vWRD6fzzoSANTcCZX3SZJEmqbxxje+8YRedKJfmyRJ/OhHPzqh1wIAAAAAmA4OHToUnZ23Rpqm0dl5a7ziFa+KM888M+tYMG7Lli2PZcuWZx0DADJzwtvmHynwJ/IrSZKRXxP9WgAAAAAAju1jH/vwyGepaZrGjTd+OONEAABMxAmX9ydSqCvjAQAAAACqb+/ePXHvvT8ZtXbPPT+JvXv3ZJQIAICJmvC2+a49DwAAAABQP8rlcmzYcH3F2YYN18cnPrEpcrkTPo8LAIAamXB5/7nPfe5k5AAAAAAA4AR0d98VQ0NDFWdDQ0PR3X1XLF362zVOBQDARPlxSwAAAACAKWzx4qXR2NhYcdbYeEYsXry0xongxOzevSvWr78idu/elXUUAMiE8h4AAAAAYArL5XKxbt1VFWfr1l1ly3ymhFKpFJs3b4z9+/fF5s0bo1QqZR0JAGrOuzYAAAAAgClu4cJFsWDB2aPWzjrr7Fi4sJhRIpiYzs6tcfDgYEREHDw4GF1dWzNOBAC1p7wHAAAAADgFvP3tV0eSJBERkSS5uPLKqzNOBOMzMNAfXV3bIk3TiIhI0zQ6O7fFwEB/xskAoLaU9wAAAEDVuWYtU5njl6nqzDPPjLa210Qul4u2ttVx5plnZh0JxpSmaXR0bBop7sdaB4BT2YysAwAAAACnliPXrB0cPBCbN2+MhQuLkc/ns44F4+L4Zaq75JJL45JLLs06BoxbX19v9PR0H7VeLpejp6c7+vp6o6XlORkko5I0TaNUKmUdo+6USo9WvM1o+Xx+ZIcYoDLlPQAAAFBVla5Ze/HFiiSmBscvQG0VCi1RLC6uWOAXi0uiUGjJIBXHUiqV4vLL35h1jLq2bt3lWUeoWxs3fi5mzpyZdQyoa7bNBwAAAKrGNWuZyhy/ALWXJEm8+MUrKs5e/OKXOEsXgGnFmfcAAABAVYx1zdqrr36vD+CpW45fgGyUy+W4+eaOirObb+6IF7/4pZHLOQ+xHl33io/H0xpcWuaII+8hvF8Y7bHhUlz7tbdlHQOmDOU9AAAAUBWuWctU5vgFyEZ3910xNDRUcTY0NBTd3XfF0qW/XeNUjMfTGvKRn6G8B6gm5T0AAABQFUeuWbt3b0+Uy+WR9VwuFwsXLnLNWuqa4xeYrtI0jVKplNnrn3XW8+P00xvj4YePLvBPP70xzjrr+fHoo49mkCwin887ixqAmlLeAwAAAFWRJEm0t6+Jd73rqorrPvymnjl+gekoTdO47rpr47777sk6SkUPPzwUb3lLe2avv2DBWXHNNdf5NwCAmnGhGAAAAKBqmpqao7V11ciH3EmSRFvbqmhqmp9xMhib4xeYjvTSAFA/nHkPAAAAVFVb2+r41rd2xuDggZg9e060tq7OOhKMm+MXmE6SJIlrrrku023zj9i//8F497vfERFP5Prbv90QZ5xxZqaZbJsPQK0p7wEAAICqyufzcdlll8dnP7sp3vSmNZHP57OOBOPm+AWmmyRJYubMmVnHiLlznzVy+1WvaotnPevZGaYBgGwo7wEAAICqW7ZseSxbtjzrGHBCHL8A2Vq16pKsIwBAJlzzHgAAAAAAAAAyprwHAAAAAAAAgIwp7wEAAAAAAAAgY8p7AAAAAAAAAMiY8h4AAAAAAAAAMqa8BwAAAAAAAICMKe8BAAAAAAAAIGPKewAAAACAU8Tu3bti/forYvfuXVlHAQB
ggpT3AAAAAACngFKpFJs3b4z9+/fF5s0bo1QqZR0JAIAJUN4DAAAAAJwCOju3xsGDgxERcfDgYHR1bc04EQAAE6G8BwAAAACY4gYG+qOra1ukaRoREWmaRmfnthgY6M84GQAA46W8BwAAAACYwtI0jY6OTSPF/VjrAADUJ+U9AAAAAMAU1tfXGz093VEul0etl8vl6Onpjr6+3oySAQAwEcp7AAAAAIAprFBoiWJxcSRJMmo9SZIoFpdEodCSUTIAACZCeQ8AAAAAMIUlSRIXXnhRxW3zL7yw7ahSHwCA+qS8BwAAAACYwtI0jR07tlec7dix3TXvAQCmCOU9AAAAADzF7t27Yv36K2L37l1ZR4FxOXLN+0pc8x4AYOpQ3gMAAADAk0qlUmzevDH2798XmzdvjFKplHUkGFOh0BILFpxdcXbWWc93zXsAgClCeQ8AAAAAT+rs3BoHDw5GRMTBg4PR1bU140QwObbMBwCYOpT3AAAAABARAwP90dW1baTsTNM0Oju3xcBAf8bJ4Pj6+nrj3nt/UnF2770/sW0+AMAUobwHAAAAYNpL0zQ6OjYddZbysdahnhQKLVEsLq44KxaX2DYfAGCKUN4DAAAAMO319fVGT093lMvlUevlcjl6erqduUxdS5IkfuVXfrXi7Fd+5VcjSZIaJwIA4EQo7wEAAACY9o6cuZzLjf64LJfLOXOZunf48OHYsWN7xdmOHV+Jw4cP1zgRAAAnQnkPAAAAwLSXJEm0t6856gzlY61DPdm27cuTmgMAUB+U9wAAAAAQEU1NzdHaumqkqE+SJNraVkVT0/yMk8HxvfrVF09qDgBAfVDeAwAAAMCT2tpWx6xZsyMiYvbsOdHaujrjRDC2n/98YFJzAADqg/IeAAAAAJ6Uz+fj3HNXRi6XixUrzo98Pp91JBhTodASxeLiirNicUkUCi01TgQAwIlQ3gMAAADAk0qlUnz72zujXC7Ht7+9M0qlUtaRYExJkkR7+5qKs/b2NSOXggAAoL4p7wEAAADgSZ2dW+PgwcGIiDh4cDC6urZmnAjGp6mpORYsOHvU2llnnR1NTfMzSgQAwEQp7wEAAAAgIgYG+qOra1ukaRoREWmaRmfnthgY6M84GYxtYKA/7r//3lFr999/n+MXAGAKUd4DAAAAMO2laRodHZtGivux1qGeHOs4LZfLjl8AgClEeQ8AAADAtNfX1xs9Pd1RLpdHrZfL5ejp6Y6+vt6MksHYjhy/lX74xPELADB1KO8BAAAAmPYKhZYoFhdHLjf647JcLhfF4pIoFFoySgZja24uRGNjY8VZY+MZ0dxcqHEiAABOhPIeAAAAgGkvSZJob18TSZKMax3qSX9/XwwNDVWcDQ09FP39fTVOBADAiVDeAwAAAEBENDU1R2vrqpGiPkmSaGtbFU1N8zNOBsd3ZOeISuwcAQAwdSjvAQAAAOBJbW2rY9as2RERMXv2nGhtXZ1xIhhbkiRx4YUXVZxdeOFFdo4AAJgilPcAAAAA8KR8Ph+XXXZ5zJ07L9rb10Y+n886EowpTdPYsWN7xcs+7NjxlUjTNKNkAABMxIysAwAAAABAPVm2bHksW7Y86xgwbn19vdHT033Uepqm0dPTHX19vdHS8pwMkgEAMBHOvAcAAAAAmMJc8x4A4NSgvAcAAAAAmMKSJImFC8+pOFu48BzXvAcAmCKU9wAAAAAAU9jw8HBs2XJLxdmWLV+M4eHhGicCAOBEKO8BAAAAAKawnTtvO2ZBPzw8HDt33lbjRAAAnAjlPQAAAADAFLZy5QXR0NBQcdbQ0BArV15Q40QAAJwI5T0AAAAAPMXu3bti/forYvfuXVlHgXFpaGiItWv/pOJs7dorjlnsAwBQX2ZkHQAAAAAA6kWpVIrNmzfG4OCB2Lx5YyxcWIx8Pp91LBjTihXnx5Ytt8SBA/tH1ubOnRsrVpyXYSoAqE+l4ceyjlBX0jSNiIgkSTJOUl+yOE6U9wAAAADwpM7
OrTE4eCAiIgYHD0RX19a4+OJLM04F43PttdfFVVf96cj9a665LsM0AFC/3rbzb7KOABXZNh8AAAAAImJgoD+2b7911Nr27VtjYKA/o0QAAMB04sx7AAAAAKa9NE2jo2PTyJahR5TL5ejo2BRXX/1e24hS96677tpR99///mvjhhs+lVEaAKgv+Xw+Nm78XNYx6k6p9GisW3d5RERs2LAx8vmZGSeqT7W6lJbyHgAAAIBpr6+vN3p6uivOenq6o6+vN1panlPjVDB+d955+6jr3UdE7N+/P+688/ZYseL8bEIBQB1JkiRmzlRMH08+P9P/RxmzbT4AAAAA015T0/xJzSFLw8PDcdNNlc+wv+mmT8Xw8HCNEwEAcCKU9wAAAABMe7ff/o1JzSFLO3fedsyCfnh4OHbuvK3GiQAAOBHKewAAAACmvZUrL4iGhoaKs4aGhli58oIaJ4Lxc/wCAJwalPcAAAAATHsNDQ2xdu2fVJytXXvFMYtRqAcNDQ3xyldeWHH2ylde6PgFAJgilPcAAAAAEBErVpwfM2c+fdTa05/+9Fix4ryMEsH4lMvluOOOb1ac3XHHN6NcLtc4EQAAJ0J5DwAAAAARMTDQH489Vhq19thjj8XAQH9GiWB8urvviqGhoYqzoaGh6O6+q8aJAAA4Ecp7AAAAAKa9NE2jo2PTMdfTNM0gFYzP4sVLo7GxseKssfGMWLx4aY0TAQBwIpT3AAAAAEx7fX290dPTfdT24uVyOXp6uqOvrzejZDC2XC4X69ZdVXG2bt1Vkcv5GBgAYCrwrg0AAACAaa9QaIlicfFRJWcul4ticUkUCi0ZJYPxWbhwUcyZM3fU2ty5c2PhwmJGiQAAmCjlPQAAAADTXpIk0d6+JpIkGdc61JuBgf4YHBwctTY4OBgDA/0ZJQIAYKKU9wAAAAAQEU1NzdHaumqkqE+SJNraVkVT0/yMk8HxpWkaHR2botLPmHR0bIo0TWsfCgCACVPeAwAAAMCT2tpWx6xZsyMiYvbsOdHaujrjRDC2vr7e6OnpjnK5PGq9XC5HT0939PX1ZpQMAICJUN4DAAAAwJPy+XxcdtnlMXfuvGhvXxv5fD7rSDCmQqElisXFkcuN/rg3l8tFsbgkCoWWjJIBADARynsAAAAAeIply5bHDTd8MpYtW551FBiXJEmivX3NyCUfxloHAKA+Ke8BAAAAAKa4pqbmaG1dNVLUJ0kSbW2roqlpfsbJAAAYL+U9AAAAAMApoK1tdcyaNTsiImbPnhOtraszTgQAwEQo7wEAAADgKXbv3hXr118Ru3fvyjoKTEg+nx+5nabpqPsAANQ/5T0AAAAAPKlUKsXmzRtj//59sXnzxiiVSllHgnH793//aQwOHoiIiMHBA/Hv//7TjBMBADARynsAAAAAeFJn59Y4eHAwIiIOHhyMrq6tGSeC8fuf//M9o+6/733vOcYjAQCoR8p7AAAAAIiIgYH+6OraFmmaRsQT2453dm6LgYH+jJPB2G655XMxPHx41Nrhw4fjlls+l1EiAAAmSnkPAAAAwLSXpml0dGwaKe7HWod68vjjj8eOHdsrznbs2B6PP/54jRMBAHAilPcAAAAATHt9fb3R09Md5XJ51Hq5XI6enu7o6+vNKBmM7fOf//tJzQEAqA/KewAAAACmvUKhJYrFxZEkyaj1JEmiWFwShUJLRslgbH/4h380qTkAAPVBeQ8AAADAtJckSbS3r6k4a29fc1SpD/XktNNOiwsvvKji7MILV8Vpp51W40QAAJwI5T0AAAAAHJfr3VP/Lr30jdHQ0DBqraFhRlx66RsySgQAwEQp7wEAAACY9tI0jY6OTRW3ze/o2BRpqsCn/q1Ycd6o++ee+9KMkgAAcCKU9wAAAABMe319vdHT0x3lcnnUerlcjp6e7ujr680oGYzPwEB/fOtbt49a+9a3bo+Bgf5sAgEAMGHKewAAAACmvUKhJYrFxRVnxeKSKBRaapwIxu/IzhF
pevQPn9g5AgBg6lDeAwAAADDtJUkSCxeeU3G2cOE5R22nD/XkyM4Rldg5AgBg6lDeAwAAADDtDQ8Px5Ytt1ScbdnyxRgeHq5xIhi/+fObo6GhoeKsoaEh5s9vrnEiAABOhPIeAAAAgGlv587bjlnQDw8Px86dt9U4EYzfnj0/PO7xu2fPD2sbCACAE6K8BwAAAGDaW7nyguOeubxy5QU1TgTjt3jx0mhsbKw4a2w8IxYvXlrjRAAAnAjlPQAAAADTXkNDQ1xyyaUVZ5dc8rpjFvtQD3K5XLzhDe0VZ294Q3vkcj4GBgCYCrxrAwAAAGDaS9M09u69u+Js796eSNO0xolg/NI0jf/7f++sOPu///fbjl8AgClCeQ8AAADAtNfX1xs9Pd0VZz093dHX11vjRDB+jl8AgFOD8h4AAACoui1bbon29tfGli23ZB0FxqVQaIlicfFR24vncrkoFpdEodCSUTIY25HjtxLHLwDA1KG8BwAAAKrq0KFD0dl5a5TL5ejsvDUOHTqUdSQYU5Ik0d6+JpIkGdc61JMkSeLCCy+qOLvwwoscvwAAU4TyHgAAAKiqj33swyPXV07TNG688cMZJ4LxaWpqjtbWVSNFZ5Ik0da2Kpqa5mecDI4vTdPYsWN7xdmOHV9xzXsAgClCeQ8AAABUzd69e+Lee38yau2ee34Se/fuySgRTExb2+qYNWt2RETMnj0nWltXZ5wIxuaa9wAApwblPQAAAFAV5XI5Nmy4vuJsw4bro1wu1zgRTFw+n4/LLrs85s6dF+3tayOfz2cdCcb07Gc3TWoOAEB9UN4DAAAAVdHdfVcMDQ1VnA0NDUV39101TgQnZtmy5XHDDZ+MZcuWZx0FxuUrX/nHSc0BAKgPynsAAACgKhYvXhqNjY0VZ42NZ8TixUtrnAhgerjootdMag4AQH1Q3gMAAABVkcvlYt26qyrO1q27KnI5H0MAnAx79/ZMag4AQH3wXTMAAABQNQsXLooFC84etXbWWWfHwoXFjBIBnPoWLVoSDQ0NFWcNDQ2xaNGS2gYCAOCEKO8BAACAqnr726+OJEkiIiJJkrjyyqszTgRwanvggf4YHh6uOBseHo4HHuivcSIAAE6E8h4AAACoqnw+H/l8/qjbAJwchUJLFIuLK86KxSVRKLTUOBEAACdCeQ8AAABUVWfn1iiVShERUSqVoqtra8aJAE5tSZJEe/uairP29jUju6EAAFDflPcAAABA1QwM9EdX17ZI0zQiItI0jc7ObTEwYMtmgJNp374Hj7H+8xonAQDgRCnvAQAAgKpI0zQ6OjaNFPdjrQNQHeVyOTZsuL7ibMOG66NcLtc4EQAAJ0J5DwAAAFRFX19v9PR0H1USlcvl6Onpjr6+3oySAZzaurvviqGhoYqzoaGh6O6+q8aJAAA4Ecp7AAAAoCoKhZYoFhdHLjf644ZcLhfF4pIoFFoySgZwalu0aEk0NDRUnDU0NMSiRUtqGwgAgBOivAcAAACqIkmSaG9fE0mSjGsdgOp44IH+GB4erjgbHh6OBx7or3EiAABOhPIeAAAAqJqmpuZobV01UtQnSRJtbauiqWl+xslg/Hbv3hXr118Ru3fvyjoKjEtzcyEaGxsrzhobz4jm5kKNEwEAcCKU9wAAAEBVtbWtjlmzZkdExOzZc6K1dXXGiWD8SqVSbN68Mfbv3xebN2+MUqmUdSQYU39/33Guef9Q9Pf31TgRAAAnQnkPAAAAVFU+n4/LLrs85s6dF+3tayOfz2cdCcats3NrDA4eiIiIwcED0dW1NeNEMLZCoSWKxcUVZ8XikigUWmqcCACAE6G8BwAAAKpu2bLlccMNn4xly5ZnHQXGbWCgPzo7t41a6+zcFgMDrhdOfUuSJNrb10SSjP64N5fLPbmeZJQMAICJUN4DAAAAMO2laRodHZsiTcuj1svl8pPraUbJYHyamprjec9bMGrtec9bEE1N8zNKBADARCnvAQAAAJj2+vp6o6e
n+6iSPk3T6Onpjr6+3oySwfgMDPTH/fffO2rtvvvutXMEAMAUorwHAAAAYNprbi5EY2NjxVlj4xnR3FyocSIYvyM7R1Ri5wgAgKlDeQ8AAADAtNff3xdDQ0MVZ0NDD0V/f1+NE8H4Hdk5olw++rIPdo4AAJg6lPcAAAAATHuFQksUi4srzorFJVEotNQ4EYzfkeM3lxv9cW8ul3P8AgBMIcp7AAAAAKa9JEmivX1NxfKzvX1NJEmSUTIY25HjtxLHLwDA1KG8BwAAAICIaGpqjra21aPWLrpodTQ1zc8oEYxfU1NzPPe5C0atPe95Cxy/AABTiPIeAAAAAJ7U1rY6Zs+eExERc+bMjdbW1WN8BdSHgYH+uP/++0at3X//fTEw0J9RIgAAJkp5DwAAUId2794V69dfEbt378o6CsC0ks/n43nPOysiIp773AWRz+czTgRjS9M0Ojo2RURacT1N08pfCABAXVHeAwAA1JlSqRSbN2+M/fv3xebNG6NUKmUdCWDaOHToUHz/+/8SERHf//6/xKFDhzJOBGPr6+uNnp7uKJfLo9bL5XL09HRHX19vRskAAJgI5T0AAECd6ezcGgcPDkZExMGDg9HVtTXjRADTx8c+9uGRs5TTNI0bb/xwxolgbIVCSxSLiyOXG/1xby6Xi2JxSRQKLRklAwBgIqZNeb9v3774+Mc/Hq997WvjBS94QZxzzjnxohe9KN74xjfG5s2b45FHHsk6IgAAQAwM9EdX17ZRxVFn5zbXqwWogb1798S99/5k1No99/wk9u7dk1EiGJ8kSaK9fU0kSTKudQAA6tO0KO9vu+22eOUrXxkbNmyIH/7wh/H//t//i8cffzwGBwfje9/7XnzgAx+I1tbW+PGPf5x1VAAAYBo71nVpXa8W4OQrl8uxYcP1FWcbNlx/1HbkUG+ampqjtXXVSFGfJEm0ta2Kpqb5GScDAGC8Tvny/nvf+16sX78+HnrooTjttNPiDW94Q2zcuDG2bNkSN954Y6xcuTIiInp7e+OP//iPo7/f2SwAAEA2XK8WIDvd3XfF0NBQxdnQ0FB0d99V40QwcW1tq2PWrNkRETF79pxobV2dcSIAACbilC7v0zSN973vffH444/HaaedFps2bYo///M/j5e+9KWxaNGieMUrXhGf+tSn4sorr4yIiAMHDsRHPvKRjFMDAADTlevVAmRn8eKl0djYWHHW2HhGLF68tMaJYOLy+XxcdtnlMXfuvGhvXxv5fD7rSAAATMApXd7/8Ic/jPvvvz8iIi699NJ44QtfWPFxf/qnfxoLFiyIiIivf/3r8cgjj9QsIwAAwBGuVwuQnVwuF+vWXVVxtm7dVUf9YBUAAEC1ndLfdXz/+98fuf2yl73smI9LkiRe8pKXRETEY489Fv/2b/920rMBAABU4nq1ANlZuHBRLFhw9qi1s846OxYuLGaUCCamVCrF5s0bY//+fbF588YolUpZRwIAYAJO6fJ+0aJF8Sd/8iexevXq+PVf//XjPjZN05Hb3tQCAABZcr1agOy8/e1Xj7p/5ZVXH+ORUH86O7fG4OCBiIgYHDwQXV1bM04EAMBEzMg6wMn0ohe9KF70oheN67Hf/e53R263tLiOJAAAkJ0j16v97Gc3xZvetMb1agGAMQ0M9Edn5+iyvrNzW6xYcV40NTVnlAoAgIk4pc+8H6877rgjfvzjH0dExIIFC2L+fNtRAgAA2Vq2bHnccMMnY9my5VlHgROye/euWL/+iti9e1fWUWBCPvaxD4+6f+ONHz7GI6F+pGkaHR2bolwuj1ofHh6Ojo5No3YdBQCgfk378v7AgQPxF3/xFyP316xZk2EaAAAAmPpcc5mpau/ePXHvvT8ZtXbPPT+JvXv3ZJQIxqevrzd6erorznp6uqOvr7fGiQAAOBGn9Lb5Y3n44YfjiiuuiP7+/oiIeMELXhAXXXTRpJ93xoxp/zMRAAAATGO33rotDh4cjIiIgwcH46tf3Ra
XXPK6jFPB8ZXL5diw4YaKsw0bboi/+7vPRC7nMx/qU0tLYcy5zyypd089RmfMyDlm65TfFybDn+365O/f+jJty/uHHnoo3vzmN8cPf/jDiIiYP39+fPSjH530N2G5XBKzZ59ehYQAAAAw9fT19cX27VtHtmhO0zS2b98Wra2vikLh+OUSZOm73/1uDA09VHE2NPRQ3H//j+KFL3xhjVPB+HR1dR13/t3vfjtaW1trlAZOzKOPNozcnj379Jg5c2aGaTiWp/4+wUT5s12f/P1bX6Zlef/zn/883vzmN49c537evHnxmc98Jp71rGdN+rnL5TQOHXpk0s8DAAAAU02apnHDDTcedW3lcrkcN9xwY7z73ddEkiQZpYPje+5zfysaG8+oWOCfccYZ8dzn/lYMDj6cQTIY24te9NL45Cc/GcPDw0fNGhoa4kUveqnjl7r36KOPjtweHHw4Zs48+ngme0/9fYKJ8me7Pvn79+Q788ynR0PD+E4gn3bl/U9+8pN4y1veEg888EBEPHHG/Wc+85n4zd/8zaq9xuHD5ao9FwAAAEwVvb3/GXv2/PCo9XK5HHv2/DB+9rP/iJaW59Q+GIzTunXr44MfvO6o9be+9aool584lqE+JfHKV14YO3ZsP2ryyle2RpomPrOk7j31GD18uOyYrVN+X5gMf7brk79/68u0umjBHXfcEa973etGivvf+I3fiC984QtVLe4BAABguioUWqJYXHzUJelyuVwUi0uiUGjJKBmMz8KFi2LOnLmj1ubOnRsLFxYzSgTjUy6X4447vllxdscd3/CDJwAAU8S0OfN+69atcc0118Thw4cjImLZsmXxyU9+MmbNmpVtMAAAADhFJEkS7e1r4p3vXF9x3Zb51LuBgf4YHBwctTY4OBgDA/3R1NScUSqmgjRNo1QqZfb6e/bcFUNDQxVnQ0NDsWvXd2PRoqU1TvWEfD7v738AgHGaFuX9rbfeGu95z3tGrrn33/7bf4u/+Zu/iac97WkZJwMAAIBTS1NTc8yaNTsOHNg/sjZr1qxoapqfYSoYW5qm0dGxKZIk4smPkEZ0dGyKq69+rwKSitI0jeuuuzbuu++erKMc08c//tHMXnvBgrPimmuu8+cHAGAcTvlt87///e/HNddcM1Lc/+Ef/mFcf/31insAAAA4Cfbu3TOquI+I2L9/f+zduyejRDA+fX290dPTfdT24uVyOXp6uqOvrzejZEwFemkAAKrhlD7zfmhoKK6++uoYHh6OiIiLL744rr322oxTAQAAwKmpXC7Hhg3XV5xt2HB9fOITmyKXO+XPI2CKKhRaolhcHD093UfNisUlUSi0ZJCKqSBJkrjmmusy3Tb/iA9+8H3xr/96/8j95z3vrHjnO6/JMJFt8wEAJuKULu8///nPR39/f0REPOtZz4r//t//e/z4xz8e8+uam5tj1qxZJzkdAAAAnFq6u49/zeXu7rti6dLfrnEqGJ8kSeLCCy+qWN5feOFFykeOK0mSmDlzZtYx4q1vvSre8Y63RsQTmdavf2dd5AIAYHxO6fL+lltuGbn94IMPxmtf+9pxfd0HPvCBeM1rXnOyYgEAAMApadGiJdHQ0DCyA95TNTQ0xKJFS2ofCsYpTdPYsWN7JEkycvnFiCcK0B07vhK/9VvnKPCpe2eccebI7Ve9qi3OPPPM4zwaAIB6c8ruVXfgwIGRs+4BAACAk++BB/orFvcREcPDw/HAA75Pp34dueb9U4v7iCdKfde8ZypateqSrCMAADBBp+yZ93PmzIl77rkn6xgAAAAwbbhmOFOZ4xcAAMjaKXvmPQAAAFBbSZJEe/uayOVGf9zQ0NAQ7e1rbDlOXTtyzftKXPMeAACoBeU9AAAAUDVNTc3R1rZ61Fpb26poapqfUSIYnyPXvK9kx46vHLWdPgAAQLUp7wEAAICqamtbHbNnz4mIiDlz5kZr6+oxvgKyd+Sa95W45j0AAFALynsAAACgqvL5fJx77sr
I5XKxYsX5kc/ns44EY2puLkRjY2PFWWPjGdHcXKhxIgAAYLpR3gMAAABVVSqV4tvf3hnlcjm+/e2dUSqVso4EY+rv74uhoaGKs6Ghh6K/v6/GiQAAgOlGeQ8AAABUVWfn1jh4cDAiIg4eHIyurq0ZJ4KxFQotUSwurjgrFpdEodBS40QAAMB0o7wHAAAAqmZgoD+6urZFmqYREZGmaXR2bouBgf6Mk8HxJUkSCxeeU3G2cOE5kSRJjRMBAADTjfIeAAAAqIo0TaOjY9NIcT/WOtST4eHh2LLlloqzLVu+GMPDwzVOBAAATDfKewAAAKAq+vp6o6enO8rl8qj1crkcPT3d0dfXm1EyGNvOnbcds6AfHh6OnTtvq3EiAABgulHeAwAAAFVx5JrhudzojxtyuZxrhlP3Vq684Jhb4ydJEitXXlDjRAAAwHSjvAcAAACqIkmSaG9fc1QBeqx1AAAA4JeU9wAAAEDVNDU1R2vrqpGiPkmSaGtbFU1N8zNOBse3c+dtkaZpxVmaprbNBwAATjrlPQAAAFBVbW2rI5/PR0REPj8zWltXZ5wIxnb++S+b1BwAAGCylPcAAABAVZVKpSiVSk/efnTkNtSzgYEHJjUHAACYLOU9AAAAUFUf+9iHR7YfT9M0brzxwxkngrE1NxeisbGx4qyx8Yxobi7UOBEAADDdKO8BAACAqtm7d0/ce+9PRq3dc89PYu/ePRklgvHp7++LoaGhirOhoYeiv7+vxokAAIDpRnkPAAAAVEW5XI4NG66vONuw4fool8s1TgTj19Q0f1JzAACAyVLeAwAAAFXR3X3Xcc5cHoru7rtqnAjG7/bbvzGpOQAAwGQp7wEAAICqWLx46XGvGb548dIaJ4LxW7nygmhoaKg4a2hoiJUrL6hxIgAAYLpR3gMAAABVkcvlYt26qyrO1q27KnI5H0NQvxoaGmLt2j+pOFu79opjFvsAAADV4rtmAAAAoGoWLlwUCxacPWrtrLPOjoULixklgvFbseL8mDnz6aPWnv70p8eKFedllAgAAJhOZmQdAAAAADi1vP3tV8e6dWsjTdNIklxceeXVWUeCcRkY6I9HH/3FqLVf/OIXMTDQH01NzRmlAoD69NjhUtYRmAIcJzAxynsAAACgqs4888z4nd95UXzve9+J3/mdF8aZZ56ZdSQYU5qm0dGxqeKso2NTXH31eyNJkhqnAoD6de3X35Z1BIBTjm3zAQAAgKoqlUpx3333RETEfffdE6WSs22of319vdHT011x1tPTHX19vTVOBAAATDfOvAcAAACqqrNzaxw8OBgREQcPDkZX19a4+OJLM04Fxzd/fnM0NDTE8PDwUbOGhoaYP9+2+QDwVNe9/OPxtBn5rGNQ5x47XLJLA0yA8h4AAAComoGB/ujq2hZpmkbEE1uRd3ZuixUrznPNcOranj0/rFjcR0QMDw/Hnj0/jKVLf7vGqQCgfj1tRj7yynuAqrJtPgAAAFAVR64ZfqS4H2sd6sk55yya1BwAAGCylPcAAABAVRy5Zni5XB61Xi6XXTOcunfHHd+c1BwAAGCylPcAAABAVRQKLVEsLo5cbvTHDblcLorFJVEotGSUDMa2cuUF0dDQUHHW0NAQK1deUONEAADAdOOa9wAAAEBVJEkS7e1r4l3vuqriepIkGSWDsTU0NMTatX8Sf/d3nzhqtnbtFccs9gEA4ESkaRqlUinrGFEqPVrxdpby+fy0/f5ReQ8AAABUTVNTc7S2rort22+NNE0jSZJoa1sVTU3zs44GY1qx4vzo6NgUjz76yw8tZ858eqxYcV6GqQAAONWkaRrXXXdt3HffPVlHGWXdusuzjhAREQsWnBXXXHPdtCzwbZsPAAAAVFVb2+o4/fTTIyLi9NMbo7V1dcaJYHwGBvpHFfcREY8++osYGOjPKBEAAKeqadhLMw7OvAcAAACqLk2P/G+abRAYpzRN49Of/l8VZ5/+9P+Ka675y2l55g8AANWXJEl
cc811dbFt/hOOfN9WH+93bZsPAAAAUCWdnVvjkUcejoiIRx55OLq6tsbFF1+acSo4vt7e/4x77/1Jxdm99/4kenv/M57znP9fjVMBAHCqSpIkZs6cmXUM6oxt8wEAAICqGRjoj66ubSNn3KdpGp2d22w7DgAAAGNQ3gMAAABVkaZpdHRsOmqr/GOtQz1paXlOLFhwdsXZggXPj5aW59Q4EQAAMN0o7wEAAICq6OvrjZ6e7iiXy6PWy+Vy9PR0R19fb0bJYGxJksSyZb9dcbZs2bJpe81NAACgdpT3AAAAQFUUCi1RLC6OXG70xw25XC6KxSVRKLRklAzGNjw8HFu23FJxtmXLLTE8PFzjRAAAwHSjvAcAAACqIkmSaG9fc9QZysdah3qyc+dtxyzoh4eHY+fO22qcCAAAmG6U9wAAAEDVNDU1R2vrqpGiPkmSaGtbFU1N8zNOBsd3/vkvm9QcAABgspT3AAAAQFW1ta2OWbNmR0TE7NlzorV1dcaJYGwDAw9Mag4AADBZynsAAACgqvL5fJx77srI5XKxYsX5kc/ns44EYyoUWqJYXFxxViwuiUKhpcaJAACA6UZ5DwAAAFRVqVSKb397Z5TL5fj2t3dGqVTKOhKMKUmSePGLV1ScvfjFK0YuBQEAAHCyKO8BAACAqurs3BoHDw5GRMTBg4PR1bU140QwtnK5HDff3FFxdvPNm6NcLtc4EQAAMN3MyDoAAAAAcOoYGOiPrq5tkaZpRESkaRqdndtixYrzoqmpOeN0cGzd3XfF0NBQxdnQ0FB0d98VS5f+do1TAaeaNE3tSHMMpdKjFW/zS/l83k4wAKc45T0AAABQFWmaRkfHppHi/r+uX331e33gTN0655xFk5oDjEepVIrLL39j1jHq3rp1l2cdoS5t3Pi5mDlzZtYxADiJbJsPAAAAVEVfX2/09HQftb14uVyOnp7u6OvrzSgZjO2OO745qTkAAMBkOfMeAAAAqIpCoSWKxcWxd2/PqAI/l8vFwoWLolBoyTAdHN955/1edHTcdNw5QDW97g8/ETNm5LOOUVeO7N5jp55fOny4FF/8/FuzjgFAjSjvAQAAgKpIkiTa29fEu951VcV1H8RTz+6+e8+Yc9e8B6ppxox8nHaaLdABgF+ybT4AAABQNU1NzdHaumrUWlvbqmhqmp9NIBgn17wHAACyprwHAAAAqurlL3/VyFn2SZLE7//+qzJOBGNzzXsAACBrynsAAIA6tHv3rli//orYvXtX1lFgwr7+9a+OXLM2TdP453/+asaJYGwrV14QDQ0NFWcNDQ2xcuUFNU4EAABMN8p7AACAOlMqlWLz5o2xf/++2Lx5Y5RKpawjwbgNDPTH9u1bR61t3741Bgb6M0oE49PQ0BCXXHJpxdkll7zumMU+AABAtSjvAQAA6kxn59Y4eHAwIiIOHhyMrq6tY3wF1Ic0TaOjY1OkaXnUerlcfnI9zSgZjC1N09i79+6Ks717exy/AADASae8BwAAqCMDA/3R1bVt1JbjnZ3bnLXMlNDX1xs9Pd0VZz093dHX11vjRDB+jl8AACBrynsAAIA68cuzltNxrUO9mT+/+bjXDJ8/v7nGiWD8nv3spknNAQAAJkt5DwAAUCeOnPVZLh+95bizPpkK9uz5YQwPD1ecDQ8Px549P6xtIJiAr3zlHyc1BwAAmCzlPQAAQJ0oFFqiWFwcudzob9VyuVwUi0uiUGjJKBmMz+LFS6OxsbHirLHxjFi8eGmNE8H4vfrVF09qDgAAMFnKewAAgDqRJEm0t6+JJEnGtQ71JpfLxbp1V1WcrVt31VE/mAL15IEH+ic1BwAAmCzfNQMAANSRpqbmaG1dNVLUJ0kSbW2roqlpfsbJYHwWLlwUCxacPWrtrLPOjoULixklgvF58MGfT2oOAAAwWcp7AACAOtPWtjpmzZodERGzZ8+J1tbVGSeCiXn7269+yr0krrzy6mM+FupFsbh4UnMAAIDJUt4DAAD
UmXw+H5dddnnMnTsv2tvXRj6fzzoSTEg+n4+ZM2dGRMTMmXnHMFPCHXd8c1JzAACAyVLeAwAA1KFly5bHDTd8MpYtW551FJiwzs6tUSqVIiKiVCpFV9fWjBPB2FauvCAaGhoqzhoaGmLlygtqnAgAAJhulPcAAABA1QwM9EdX17ZI0zQiItI0jc7ObTEw0J9xMji+hoaGWLiwWHF2zjnFYxb7AAAA1aK8BwAAAKoiTdPo6Ng0UtyPtQ715PHHH489e35Ycdbd/cN4/PHHaxsIAACYdpT3AAAAQFX09fVGT093lMvlUevlcjl6erqjr683o2Qwts9//u8nNQcAAJgs5T0AAEAd2r17V6xff0Xs3r0r6ygwboVCSxSLiyvOisUlUSi01DgRjN8b3nDZpOYAAACTpbwHAACoM6VSKTZv3hj79++LzZs3RqlUyjoSjEuSJLFw4TkVZwsXLowkSWqcCMbvwQd/Pqk5AADAZCnvAQAA6kxn59Y4eHAwIiIOHhyMrq6tGSeC8RkeHo4tW26pONuy5ZYYHh6ucSIYv/nzm6OhoaHirKGhIebPb65xIgAAYLpR3gMAANSRgYH+6OraFmmaRkREmqbR2bktBgb6M04GY9u587ZjFvTDw8Oxc+dtNU4E47dnzw+Pe/zu2fPD2gYCAACmHeU9AABAnUjTNDo6No0U92OtQ71ZufKC4565vHLlBTVOBON3zjmLJjUHAACYLOU9AABAnejr642enu4ol8uj1svlcvT0dEdfX29GyWB8Ghoa4pJLLq04u+SS1x2z2Id6cMcd35zUHAAAYLKU9wAAAHWiUGiJYnFxJEkyaj1JkigWl0Sh0JJRMhifNE1j7967K8727u2xewR17bzzfm9ScwAAgMlS3gMAANSJJEniwgsvqrht/oUXth1V6kO9ObJ7RCV2j6De3X33nknNAQAAJkt5DwAAUCfSNI0dO7ZXnO3Ysd1Zy9S95uZCNDY2Vpw1Np4Rzc2FGieC8XPNewAAIGvKewAAgDrhrGWmuv7+vhgaGqo4Gxp6KPr7+2qcCMbPNe8BAICsKe8BAADqxJFr3udyo79Vy+VyrnnPlHDkGP6vl3hIksQxTN176UtXTmoOAAAwWcp7AACAOpEkSbS3r6lYfFZah3pzrGM1l8s5hql73/rWzknNAQAAJkt5DwAAUEeampqjtXXVSMmZJEm0ta2Kpqb5GSeD8Wlqao7nPnfBqLXnPvd5jmHq3vnnv2xScwAAgMlS3gMAANSZtrbVMWvW7IiImD17TrS2rs44EYzfwEB/3H//vaPW7rvv3hgY6M8oEYzPwMADk5oDAABM1oysAwAAADBaPp+Pyy67PD772U3xpjetiXw+n3UkGJc0TaOjY1PFWUfHprj66vfaOp+61dxciMbGxhgaGjpq1th4RjQ3FzJIBQD167HhUtYR6kqaphER3u/+F44TmBjlPQAAQB1atmx5LFu2POsYMCF9fb3R09N91Hq5XI6enu7o6+uNlpbnZJAMxtbf31exuI+IGBp6KPr7+xy/APAU137tbVlHADjl2DYfAAAAqIpCoSWKxcUVZ8XikigUWmqcCMZv/vzmaGhoqDhraGiI+fOba5wIAACYbpx5DwAAAFRFkiTx4hevqHj2/YtfvMIWotS1PXt+GMPDwxVnw8PDsWfPD2Pp0t+ucSoAqC/5fD42bvxc1jHqTqn0aKxbd3lERGzYsDHy+ZkZJ6pPLgkHY1PeAwAAAFVRLpfj5ps7Ks5uvnlzvPjF50YuZxNA6tPzn79wUnMAmA6SJImZMxXTx5PPz/T/EXDCfMcMAAAAVEV3913HuWb4UHR331XjRDB+Gzf+r0nNAQAAJkt5DwAAAFTF4sVLo7GxseKssfGMWLx4aY0Twfi1ta2e1BwAAGCylPcAAABAVeRyuVi37qqKs3XrrrJlPnWtpeU5k5oDAABMlu+aAQAAgKqZN+9Zx1ifV+MkMDFf+co/TmoOAAAwWcp7AAAAoCr
SNI2Ojk0VZx0dmyJN0xongvG76KLXTGoOAAAwWcp7AAAAoCr6+nqjp6e74qynpzv6+nprnAjGb+/enknNAQAAJkt5DwAAAFRFodASxeLiSJJk1HqSJFEsLolCoSWjZDC2RYuWRENDQ8VZQ0NDLFq0pLaBAACAaUd5DwAAAFRFkiRx4YUXHbU9fpqmceGFbUeV+lBPHnigP4aHhyvOhoeH44EH+mucCAAAmG6U9wAAAEBVpGkaO3ZsrzjbsWO7a95T1+bOnTepOQAAwGQp7wEAAICqcM17prKPf/wjk5oDAABMlvIeAAAAqIqmpvmTmkOW3va2/zGpOQAAwGQp7wEAAICquP32b0xqDlnav3/fpOYAAACTpbwHAAAAquL88182qTlkyc4RAABA1pT3AAAAQFUMDDwwqTlkyc4RAABA1pT3AAAAdWj37l2xfv0VsXv3rqyjwLgVCi1RLC6uOCsWl0Sh0FLjRDB+do4AAACyprwHAACoM6VSKTZv3hj79++LzZs3RqlUyjoSjEuSJNHevqbirL19TSRJUuNEMH52jgAAALKmvAcAAKgznZ1b4+DBwYiIOHhwMLq6tmacCMavqak5GhpmjFqbMWOG64VT91zzHgAAyJryHgAAoI4MDPRHV9e2SNM0IiLSNI3Ozm0xMNCfcTIYnx07tsXw8OFRa4cPH44dO7ZlEwjGyTXvAQCArCnvAQAA6kSaptHRsWmkuB9rHerN4cOH45Zbbq44u+WWm+Pw4cMVZ1APXvSil0xqDgAAMFnKewAAgDrR19cbPT3dUS6XR62Xy+Xo6emOvr7ejJLB+Gzb9uVJzSFLH/zg+yY1BwAAmCzlPQAAQJ0oFFqiWFwcudzob9VyuVwUi0uiUGjJKBmMz0UXvWZSc8jSO9957aTmAAAAk6W8BwAAqBNJkkR7+5pIkmRc61Bv9u7tmdQcsvSP/3jLpOYAAACTpbwHAACoI01NzdHaumqkqE+SJNraVkVT0/yMk8HYFi1aEg0NDRVnDQ0NsWjRktoGggl47Wv/cFJzAACAyVLeAwAA1Jm2ttXxjGecHhERp5/eGK2tqzNOBOPzwAP9MTw8XHE2PDwcDzzQX+NEMH5f+tLnJzUHAACYLOU9AABAHfrlDvlpljFgQpqbCzFz5syKs5kznx7NzYUaJ4Lxe93r3jSpOQAAwGQp7wEAAOpMZ+fWGBoaioiIoaGh6OramnEiGJ/e3v+MRx99tOLs0Ud/Eb29/1njRDB+3/727ZOaAwAATJbyHgAAoI4MDPRHZ+e2UWudndtiYMB249S/Bx/8+aTmkKWzznr+pOYAAACTpbwHAACoE2maRkfHpkjT8qj1crn85Lot9KlvixYtmdQcsvTsZzdNag4AADBZM7IOAAAAwBP6+nqjp6f7qPU0TaOnpzv6+nqjpeU5GSSD8al0/P7X+dKlv12jNEwlaZpGqVTKNMMNN3x4zPn69VfXKM1o+Xw+kiTJ5LUBAIDaUd4DAADUiebmQjQ2No5c7/6pGhvPiObmQgapYPwWL1563GN48eKlGaSi3qVpGtddd23cd989WUc5rr1798Tll78xk9desOCsuOaa6xT4AABwirNtPgAAQJ3o7++rWHpGRAwNPRT9/X01TgQTk8vlYvnyF1ScLV/+O5HL+RiCynTSAAAAzrwHAACoG4VCSxSLiytuPV4sLolCoSWDVDB+hw8fjttv/2bF2e23fzPa2y+PGTN8FMFoSZLENddcl/m2+UdUOrt+48bPZZDkl2ybDwAA04PvmAEAAOpEkiTR3r4m3vnO9VEul0fWc7lctLevUdxQ97Zt+/KY8z/4g0trlIapJEmSmDlzZtYxIiLiHe94d3z0ox8cuf/ud/953WQDAABObfarAwAAqCNNTc3R1rZ61NpFF62Opqb5GSWC8bvootdMag714PnPXzhy+/TTT4+FC4sZpgEAAKYT5T0AAECdaWtbHbNnz4mIiDlz5kZr6+oxvgLqw969PZOaQ72
54YZPZR0BAACYRpT3AAAAdSafz8dll10ec+fOi/b2tZHP57OOBONyzjmLJjUHAACA6cw17wEAAOrQsmXLY9my5VnHgAm5445vjjm/4IJX1CgNAAAATC3OvAcAAACq4oUvfPGk5gAAADCdKe8BAACAqnj/+6+Z1BwAAACmM+U9AABAHdq9e1esX39F7N69K+soMG7XXvtXk5oDAADAdOaa9wAAAHWmVCrF5s0bY3DwQGzevDEWLixGPp/POhaM6b777hlzvnTpb9coDQDUt8cfL2UdgSnAcQIwvSjvAQAA6kxn59Y4eHAwIiIOHhyMrq6tcfHFl2acCsZ2zjmLJjUHgOnklpvfmnUEAKDO2DYfAACgjgwM9EdX17ZI0zQiItI0jc7ObTEw0J9xMhjbHXd8c1JzAAAAmM6ceQ8AAFAn0jSNjo5NI8X9f12/+ur3RpIkGaWDsb30pSujo+Om484BgCdc+oZPxGmnuTQSx/f44yW7NABMI8p7AACAOtHX1xs9Pd1HrZfL5ejp6Y6+vt5oaXlOBslgfL71rZ1jzi+44BU1SgMA9e200/Jx2mkzs44BANQR2+YDAADUiUKhJYrFxZHLjf5WLZfLRbG4JAqFloySwficd97vTWoOAAAA05kz7wEAAOpEkiTR3r4m3vWuqyqu2zKfenf33XvGnC9d+ts1SgOcqtI0jVKplHWMulQqPVrxNr+Uz+e9pwIA6pbyHgAAoI40NTVHa+uq2L791kjTNJIkiba2VdHUND/raDCm5z9/4aTmAONRKpXi8svfmHWMurdu3eVZR6hLGzd+LmbOtFU9AFCfbJsPAABQZ9raVsesWbMjImL27DnR2ro640QwPjfd9L8mNQcAAIDpzJn3AAAAdSafz8dll10en/3spnjTm9ZEPp/POhKMy9q1fxrf/e53jjsHqKbT//AvIpnxtKxj1JU0TSMibA3/FOnhx+Lhz78v6xgAAGNS3gMAANShZcuWx7Jly7OOARMynmveL1/+ghqlAaaDZMbTIjnND7k9lcoeAGDqsm0+AAAAUBWDgwcmNQcAAIDpTHkPAMApa/fuXbF+/RWxe/eurKPAhDl+mYrOP/9lk5oDAADAdKa8BwDglFQqlWLz5o2xf/++2Lx5Y5RKpawjwbg5fpmqxrNtPgAAAFCZ8h4AgFNSZ+fWOHhwMCIiDh4cjK6urRkngvFz/DJVLVxYnNQcAAAApjPlPQAAp5yBgf7o6toWaZpGRESaptHZuS0GBvozTgZjc/wylW3ffuuk5gAAADCdKe8BADilpGkaHR2bRorPsdahnjh+meouvPDVk5oDAADAdKa8BwDglNLX1xs9Pd1RLpdHrZfL5ejp6Y6+vt6MksHYHL9Mdbfc8rlJzQEAAGA6U94DAHBKKRRaolhcHLnc6Le6uVwuisUlUSi0ZJQMxub4Zaq79NI3TmoOAAAA05nyHgCAU0qSJNHeviaSJBnXOtQTxy9T3Y4dX5nUHAAAAKYz5T0AAKecpqbmaG1dNVJ0JkkSbW2roqlpfsbJYGyOX6ayV73qoknNAQAAYDpT3gMAcEpqa1sds2bNjoiI2bPnRGvr6owTwfg5fpmqPvOZT01qDgAAANOZ8h4AgFNSPp+Pyy67PObOnRft7Wsjn89nHQnGLZ/Px7x5z4qIiLlz5zl+mTIuv/ytk5oDAADAdDYj6wAAAHCyLFu2PJYtW551DJiwffsejPvuuyciIu67757Yt+/BkTIf6tm+fQ+OOW9peU6N0gAAAPD/b+/O46yq6/+Bv++w3EkGFGQRUCMLzIVFcy13S1NBUdM0F0zFNNGfWqkVRkpfpa+VG2ZFJIvmLgiYmgtumZE7qSwafIEBEZRtRC4Dc39/EDfGWWGYOXdmns/Hw4fnnM/n3vua4cPl3vM+n8+hcTHzHgAAIM8MH35Nuf1f/OKaKnpCfunSZYc6tQMAAEBzpngPAACQR1588dn4+OOPyh376KOP4sUXn00mEGyGZ599uk7
tAAAA0Jwp3gMAAOSJ9evXxx//+LtK2/74x9/F+vXrGzgRbJ6DDjq0Tu0AAADQnCneAwAA5ImpU5+qskC/fv36mDr1qQZOBJvnnnvG1akdAAAAmjPFewAAgDxx+OFfjxYtWlTa1qJFizj88K83cCLYPKeffnad2gEAAKA5U7wHAADIEy1atIjzz7+w0rbzz7+oysI+5IsXX3yuTu0AAADQnCneAwAA5JGDDjosCgs/V+7Y5z73OfcKp1E45JDD69QOAAAAzZniPQAAQB5ZvHhRZDJryh3LZDKxePGihBJB7T3//NQ6tQMAAEBzpngPAACQJ7LZbIwdO7rK49lsNoFUUHuHHnpEndoBAACgOVO8BwAAyBMLFxbH9OlvVijSZ7PZmD79zVi4sDihZFA7//rXW3VqBwAAgOZM8R4AACBPdO3aLYqKiiptKypqG127dmvgRLB5evfuW6d2AAAAaM4U7wEAAPLEokULo6SkpNK2kpJVsWjRwgZOBJtn+vQ369QOAAAAzZniPQAAQJ7o1KlzndohaT177lqndgAAAGjOFO8BAADyxN13j6lTOyRt+PCf1qkdAAAAmjPFewAAgDxx5pnfrVM7JO1rXzusTu0AAADQnCneAwAA5IlWrVrFcccdX2nbcccNjFatWjVwItg8Rx55VJ3aAQAAoDlTvAcAAMgjp512VrRo0bLcsZYtW8Zpp52RUCKovd/85oY6tQMAAEBzpngPAECT9cAD98agQd+OBx64N+kosFl+/vPry+0PG3Z9FT0hv1x22VV1agcAAIDmTPEeAIAmaeXKlTF58sNRVlYWkyc/HCtXrkw6EtRahw7bV7sP+erBB++pUzsAAAA0Z4r3AAA0SbfccmNks9mIiMhms3HrrTcmnAhq75Zbyo9X45fG4tRTq7+9Q03tAAAA0Jwp3gMA0OS8/fZbMWvWjHLHZs6cEW+//VZCiaD2jF8aszvv/H2d2gEAAKA5a9bF+2uuuSZ23XXXuOmmm5KOAgDAVlJWVhYjR1b++W7kyJuirKysgRNB7Rm/NHannz6oTu0AAADQnLVMOkBSnnzyybj//vuTjgEAwFb25puvR0lJSaVtJSUl8eabr8dee32lgVNB7Ri/NHY/+9mVNbbffvvoBkoDAPlt3bpM0hHyzsZbn6VSqYST5A/jBKB5aZbF++eeey4uv/zypGMAAFAP+vbdK4qKiiotgBYVtY2+ffdKIBXUjvFLY/eLX/wqLr30gmrbAYAN7rnr4qQjAAB5ptktmz9mzJi4+OKLo7S0NOkoAADUg4KCghgypPILNYcMuTwKCprdR2AakYKCgjjjjMqXFT/jjEHGL3nv6aefqFM7AAAANGfNZub93LlzY8SIETF16tSIiGjRokWsX78+4VQAANSHPfboE716fTlmzZqRO7brrl+OPfbonWAqqFk2m42XXnqx0raXXnohvva1QywhSl479tjj45FHHqq2HQCas3Q6HaNGjU86Rl7KZNbEkCGDIyJi5MhRkU4XJpwo/6TT6aQjAFDPmkXx/u67744bbrghN9v+S1/6UpxzzjkxdOjQhJMBAFBfzjvvwrjqqsty++eee2FyYaCWFi4sjunT36y0bfr0N2PhwuLo3n3HBk4FtfenP/2uxvYhQ65ooDQAkH9SqVQUFipK1ySdLvR7AqBZahZrLk6fPj1KS0ujdevW8b3vfS8efvjh2HnnnZOOBQBAPRo9unwBqaaCEuSDrl27RVFRUaVtRUVto2vXbg2cCDbPoEGD69QOAAAAzVmzmHmfTqfjlFNOiYsuuii6d++edBwAAOrZ22+/VW7J/IiImTNnxNtvvxV77NEnoVRQs0WLFkZJSUmlbSUlq2LRooVm3lOlbDYbmUwm0Qy3335zje2XXfajhgnzGel02m0nAAAAyGvNong/bNiwKChoFosMAAA0e2VlZTFy5E2Vto0ceVPcfvtonw3JW926dY/evftWunR+7979ols3FyNTuWw2G8OHXxO
zZ89MOkq13n77rRg8+KxEXrtXr11j6NDhCvgAAADkrWZx1tLJWQCA5uPNN1+vZuZySbz55usNnAhqL5VKxc47f77Stp13/ryiI9UyPAAAAKBxaxYz7xtay5YuFgAASMpXvvKVKCpqGyUlqyq0tW3bNr7yla+4uJO8tW7dunj00UmVtj366CNx2mnfiZYtfY2jcj//+f8kvmz+Rt/97hkVjt15590JJPkvy+ZTW5ue12nZssB5njzkz4S68Pc6f3n/pTEzfoGtxVmfraygIBXt27dJOgYAQLP205/+JH784x9Xcvynsf32bRNIBLUzduzYatv/8peJMWjQoAZKQ+NUlHSAiIi47rrr4mc/+1luf8SIEdG16/YJJoLaW7OmRW67ffs2UVhYmGAaKrPpnxFsLn+v85f3Xxoz4xfYWhTvt7KysmysXLk66RgAAM1ajx69omfPXjF79qzcsV69do3Pf75nLFv2SYLJoHp9+nwl7rnnnmrbjWEag512+mJuu02bIu+/NCpr1qzJbS9b9kkUFq5PMA2V2fTPCDaXv9f5y/svjZnxC1SnXbvPRYsWtVuRQ/G+HqxbV5Z0BACAZq9Hjy+WK9736LGLz2nkvbIahmhZme8bNA6bjtObb77DuKVR2XS8rltXZvzmIX8m1IW/1/nL+y+NmfELbC1uugEAQJOzePGieOqpx8sde/LJx2Px4kUJJYLaWbLkwzq1AwAAANB4Kd4DANCkZLPZ+MMffhvZbLZWxyGfdO7cuU7tAAAAADReivcAADQpxcULYtasGZW2zZo1I4qLFzRwItgcqTq2AwAAANBYKd4DAADkiVQNtfma2gEAAABovBTvAQBoUrp33zF69fpypW29eu0W3bvv2MCJYHOYeQ8AAADQXCneAwDQpKRSqbjggu9H6jNTlKs6DvmkU6fq72lfUzsAAAAAjZfiPQAATU6XLl3j2GOPL3fsuOOOjy5ddkgoEdTOH/5we53aAQAAAGi8FO8BAGiSTjzxlGjTpigiIoqK2sbAgacknAhq1r//wDq1AwAAANB4tUw6QFL233//mDlzZtIxAACoJ+l0Oi644OIYN250nH32eZFOp5OOBDUqKKj+tg41tQMAAADQeDXb4j0AAE3f3nvvE3vvvU/SMaDW/vGPv9fY/vnPf6GB0gAAAADQkCybDwBAk/Xaa6/EZZddFK+99krSUaBWNt7qYUvbAQAAAGi8zLwHAKBJymQyMWbMqFi27OMYM2ZU7LFHb0vnk/dWrlxRp3YAaG6ypWuTjkAjYJwAAI2F4j0AAE3S5MkTYvnyZRERsXz5spgyZUKcfPJpCacCAGBr+uTua5OOAAAAW41l8wEAaHIWL14UU6ZMjGw2GxER2Ww2Jk+eGIsXL0o4GVSvoKD6r2g1tQMAAADQeJl5DwBAk5LNZmPs2NFRVlZW7nhZWVmMHTs6fvSjn0YqlUooHVTvxBNPiSlTJlbbDgD8V5szhkWqVeukY5DnsqVrrdIAADQKivcAADQpCxcWx/Tpb1Y4ns1mY/r0N2PhwuLo3n3HBJJBzV5//ZUa2/ff/6sNlAYA8l+qVetItUonHQMAALYKay4CANCkdOvWPXr1+nKlbb167RbdunVv4ERQe88++0yd2gEAAABovBTvAQBoRrJJB4BqXXLJFXVqBwAAAKDxsmw+AABNysKFxTFr1oxK22bNmmHZfPLaCy88W2P70UcfW/9BgCYvm81GJpNJOkZeymTWVLrNf6XT6UilUknHAACAJkfxHgCAJqVbt+7Ru3ffePvt6VFWVpY7XlBQEHvs0cey+eS1lStX1KkdoLYymUwMHnxW0jHy3pAhg5OOkJdGjRofhYWFSccAAIAmx7L5AAA0KalUKgYNOq/CbLCqjkM+2XPPvnVqBwAAAKDxMvMeAIAmp0uXrtG//8CYNOnhyGazkUqlYsCAgdGlyw5JR4NqjR79uxrbf/WrWxsoDdBctDpjUETLVknHyCvZbDYiwkV/m1p
XGqV3j006BQAANGmK9wAANEkDBpwYzz8/NZYt+zjat+8Q/fufmHQkqNGhhx4Z999/V7XtAFtdy1aRaqV4vykl+4qySQcAAIBmwLL5AAA0Sel0Os45Z3Bsv33HGDTo/Ein00lHghq1aFH9V7Sa2gEAAABovJz5AQAAyBMrViyvUzsAAAAAjZfiPQAATVImk4lRo26Pjz5aGqNG3R6ZTCbpSAAAAAAAVVK8BwCgSZo48YEoKSmJiIiSkpKYOPHBhBNBzVasWFGndgAAAAAar5ZJBwAAgK1t8eJF8eijk8ode/TRR+Kww46ILl26JpQKarbddtvVqR0AAKA5yWazebHSXiazptLtJKXT6UilUknHADaT4j0AAE1KNpuNP/zht5HNZis9PnTodb68kre+8IVd6tQOAADQXGSz2Rg+/JqYPXtm0lHKGTJkcNIRIiKiV69dY+jQ4c6BQCNj2XwAAJqU4uIFMWvWjErbZs2aEcXFCxo4EdTeq6++Uqd2AACA5kRdGmhqzLwHAADIE3Pn/rtO7QAAAM1FKpWKoUOH58Wy+RtsXAEwP64osGw+NE6K9wBAtV577ZUYN250nH32ebH33vskHQdq1L37jtGr15crnX3fq9du0b37jgmkgtrp3HmHWLSouNp2AAAANkilUlFYWJh0DICtxrL5AECVMplMjBkzKj76aGmMGTMqj65khqqlUqm44ILvV7i6vKrjkE/atSuqUzsAAAAAjZfiPQBQpcmTJ8Ty5csiImL58mUxZcqEhBNB7XTp0jWOPfb4cseOO+746NLFrGXyW2HhNnVqBwAAAKDxUrwHACq1ePGimDJlYmSzG+7Xlc1mY/LkibF48aKEk0HtfLZ4f8wxx1fRE/LHO++8Xad2AAAAABovxXsAoIJsNhtjx47OFe5rOg756C9/mVRu/7HHJlXRE/JHTRdIuYAKAAAAoOlqmXQAACD/LFxYHNOnv1nheFlZWUyf/mYsXFgc3bvvmEAyqJ3FixfFo4+WL9Y/+uikOOywI6NLl64JpaIxyGazkclkEnv9008/O8aPH11t+5o1axowUXnpdDpSqVRirw8AAADQlCneAwAVdOvWPXr37ltpAb93737RrVv3BFJB7WSz2fjDH34bERVXjvjDH34bQ4dep/hIpbLZbAwffk3Mnj0z6ShVGj9+dLXF/frWq9euMXTocH+HAAAAAOqBZfMBgApSqVQcd1zl9wc/7rjjFW3Ia8XFC2LWrBmVts2aNSOKixc0cCIaE29vAAAAACTFzHsAoIJsNhuPPjopUqlUufvbp1KpePTRR2L33fdUwCePZevYTnOVSqVi6NDhiS6bX1ZWFt///rmxfv36Cm0tWrSI3/72T1FQkNw12JbNByDfZNetTTpC3tn4Hc6/2f9lnAAAjYXiPQBQQVX3vM9ms+55TyNQ00lKJzGpWiqVisLCwkQz/OhHP4kRI4ZXcvynsc022ySQCADy1yd3XZt0BAAA2Gosmw8AVLDxnvefnd1ZUFDgnvfkvW7duldZ4NxmmzbGL3lvjz36xA47dCt3rFu3HWOPPXonlAgAAACAhmDmPQBQQSqVikGDzourrrq80uOWXySfLVq0MFavXl1p2+rVn8SiRQutHEHeu/LKn8YVV1yc2//pT80qBICN0ul0jBo1PukYeSmTWRNDhgyOiIiRI0dFOp3sikL5KJ1OJx0BAKBKivcAQKW6dOka/fsPjEmTHo5sNhupVCoGDBgYXbrskHQ0qNbGlSMqu/WDlSNoLNq2bZfbPu6446Ndu3bV9AaA5iUfbnPTGKTThX5PAACNjGXzAYAqDRhwYmy3XfuIiGjfvkP0739iwomgZhtXiCgoaFHueIsWLawcQaM0cOApSUcAAAAAoAEo3gMAVUqn03HOOYNj++07xqBB51tekEajS5euMWDAwHLHrBwBAAAAAOQzy+YDANXae+99Yu+990k6Bmy2AQNOjKeeeiI++aQkioraWjkCAAAAAMhrZt4
DANV67bVX4rLLLorXXnsl6Siw2dauzURERCazJuEkAAAAAADVU7wHAKqUyWRizJhR8dFHS2PMmFGRyWSSjgS19tBD90RpaWlERJSWlsZDD92bcCIAAAAAgKpZNh8AqNLkyRNi+fJlERGxfPmymDJlQpx88mkJp4KaLV68KB577NFyxx57bEoceeRR0aVL14RSAUB+yv7nYjeojnECAAD1T/EeAKjU4sWLYsqUiZHNZiMiIpvNxuTJE+Oggw5V/CSvZbPZGDny5krbRo68Oa67bkSkUqmGDQUAeWzdn8cmHQEAAICwbD4AUIlsNhtjx47OFe5rOg75ZMGC+TF37r8rbZs799+xYMH8Bk4EAAAAAFAzM+8BgAoWLiyO6dPfrHC8rKwspk9/MxYuLI7u3XdMIBnUbMmSD2ts32mnnRsoDQDkv5bfGRSpVq2SjkGey5aWWqUBAADqmeI9AFBBt27do3fvvvH229OjrKwsd7ygoCD22KNPdOvWPcF0UL1+/faO1q1bx9q1ayu0tW6djn799k4gFQDkr1SrVor3AAAAecCy+QBABalUKgYNOq/CfcGrOg75JJvNRmlpaaVtpaVr3fYBAAAAAMhLivcAQKW6dOka/fsPzBXqU6lUDBgwMLp02SHhZFC9qVOfqrJAn81mY+rUpxo4EQAAAABAzRTvAYAqDRhwYmy3XfuIiGjfvkP0739iwomgZoceekSd2gEAAAAAkqB4DwBUKZ1OxznnDI7tt+8YgwadH+l0OulIUKN//eutOrUDAAAAACRB8R4AgCald+++dWoHAAAAAEiC4j0AUKVMJhNjxoyKjz5aGmPGjIpMJpN0JKjRW2+9Uad2AAAAAIAkKN4DAFWaPHlCLF++LCIili9fFlOmTEg4EdSsY8dOdWoHAAAAAEiC4j0AUKnFixfFlCkTI5vNRkRENpuNyZMnxuLFixJOBtUrKEjVqR0AAAAAIAmK9wBABdlsNsaOHZ0r3Nd0HPJJTcPT8AUAAAAA8pHiPQBQwcKFxTF9+ptRVlZW7nhZWVlMn/5mLFxYnFAyqFk2W1andgAAAACAJLRMOgAAkH+6desevXv3jbffnl6ugF9QUBB77NEnunXrnmA6qN7MmTNqbN955x4NEwZo0rLZbGQymaRj5KVMZk2l2/xXOp2OVMqtXAAAAPgvxXsAoIJUKhWDBp0XV111eaXHnWgmnx1++Ndj3LjR1bYDbA2ZTCYGDz4r6Rh5b8iQwUlHyEujRo2PwsLCpGMAAACQRyybDwBUqkuXrtG//8BcoT6VSsWAAQOjS5cdEk4G1Vu8+IM6tQMAAAAAJMHMewCgSgMGnBjPPz81li37ONq37xD9+5+YdCSoUdeu3aKw8HOxZs2nFdoKCz8XXbt2SyAV0NQVnHlsRMsWScfIK9lsNiLCij2bWrc+yu76S9IpAAAAyFOK9wBAldLpdJxzzuAYN250nH32eZFOp5OOBDUqLl5QaeE+ImLNmk+juHhB7LTTzg2cCmjyWraIVCtfsTelZF9RNukAAAAA5DVnFgCAau299z6x9977JB0Dam3Jkg9rbFe8BwAAAADyjXveAwDQpPTp069O7QAAAAAASVC8BwCgSZk+/c06tQMAAAAAJEHxHgCAJsXMewAAAACgMVK8BwCgSZk3b26d2gEAAAAAktAy6QAAADQt2Ww2MplMYq9/77131dh++eVXNVCa8tLpdKRSqUReGwAAAADIb4r3AABsNdlsNoYPvyZmz56ZdJQqvfPOv2Lw4LMSee1evXaNoUOHK+ADAAAAABVYNh8AqNZrr70Sl112Ubz22itJR6GRUJcGAAAAANh8Zt4DAFXKZDIxZsyoWLbs4xgzZlTssUfvSKfTSccij6VSqRg6dHiiy+ZHRLz77tvxm9+MqHD8iiuujt122yOBRBtYNh8AAAAAqIriPQBQpcmTJ8SyZR9HRMSyZR/HlCkT4uSTT0s4FfkulUpFYWFhohn22usrscMOXeODDxb
ljnXr1j322usrCaYCgDy1rjSySWfIM9nsht+Ii+42sa406QQAANDkKd4DAJVavHhRTJ48odyxyZMnxkEHHRpdunRNKBXU3pVXDo0rrrg4t//Tn16XYBoAyF+ld49NOgIAAADhnvcAQCWy2WyMHTs6ysrKyh1fv359jB07OjcTCfJZ27btctvHHXd8tGvXrpreAAAAAADJMvMeAKhg4cLimD79zUrbpk9/MxYuLI7u3Xds4FSw5QYOPCXpCACQV9LpdIwaNT7pGHkpk1kTQ4YMjoiIkSNHRTqd7O2A8lE6nU46AgAANEmK9wBABTvs0DVatGgR69evr9DWokWL2GEHy+YDADRmqVQqCgsVpWuSThf6PQEAAA3GsvkAQAVvvfVGpYX7iA1L57/11hsNGwgAAAAAAJo4xXsAoIK+ffeKoqKiStuKitpG3757NXAiAAAAAABo2hTvAYAKCgoK4owzBlXadsYZg6KgwEcIAAAAAADYmpx5BwAqyGazMXXq05W2TZ36VGSz2QZOBAAAAAAATZviPQBQQXHxgpg1a0albbNmzYji4gUNnAgAAAAAAJo2xXsAAAAAAAAASFjLpAMAAPmnW7fu0bp161i7dm2Fttat09GtW/cEUgFA/sqWrks6Ao2AcQIAAEB1FO8BgAqKixdUWriPiFi7NhPFxQtip512buBUAJC/snc/FtmkQwAAAACNmmXzAYAKliz5sE7tAAAAAADA5jHzHgCooF+/vWObbbaJ1atXV2jbZps20a/f3gmkAoD8lTrjmEi18hWb6mVL10X27seSjgEAAECecmYBAKigoKAgLr30BzFixPAKbZde+oMoKLB4DwBsKtWqpeI9teL2CgAAAFTFmQUAoFJ77NEnevbsFbNnz8od69lz19hjj94JpgKammw2G5lMJukYeSeTWVPpNuWl0+lIpVJJxwAAAADYKhTvAYAqXXbZVTFkyPmRzWYjlUrFZZddmXQkoInJZDIxePBZScfIa0OGDE46Qt4aNWp8FBYWJh0DAAAAYKuw5i0AUKV27drFgAEnRUFBQQwYcFK0a9cu6UgAAAAAANAkmXkPAFTrlFNOi1NOOS3pGEBzMGiniFaWQM/J/ufO2JaFL680GzF2ftIpAAAAALY6xXsAACA/tEpFqpXFwaheNsqSjgAAAABQL5wZAwAAAAAAAICEKd4DAAAAAAAAQMIU7wEAAAAAAAAgYYr3AAAAAAAAAJCwlkkHAAAqymazkclkko7xH9n//D+VaIqN0ul0pFL5kQUAAAAAALYWxXugyVL8rJriZ37LZrMxfPg1MXv2zKSj5KVevXaNoUOHG8MAAAAAADQpivdAk6T4WT3Fz/znjwYAAAAAAJoXxXugyVL8pLFKpVIxdOjwvFg5IpNZE0OGDI6IiJEjR0U6XZhwIitHAAAAwNaUL6tXZjJrKt1OknMQADQ0xXugSVL8rJ4vHvkvlUpFYWHyY2VT6XRh3mUCAAAAtly+rl658Vxa0qxeCUBDU7wHmizFTwAAAAConro0AOQPxXsAAAAAAGiG8mn1yg2y//l/flxRYPVKABqa4j0AAAAAADRT+bh6JQA0VwVJBwAAAAAAAACA5s7MewAAAKirdetzi7yyQTa74TdiqdlNrFufdAIAAADymOI9AAAA1FHZXX9JOkLeclEDAAAA1I5l8wEAAAAAAAAgYWbeAwAA+aG0zAxdalZalnSCnHQ6HaNGjU86Rl7KZNbEkCGDIyJi5MhRkU4XJpwo/6TT6aQjAAAAkGcU7wEAgPwwdkHSCWCzpFKpKCxUlK5JOl3o9wQAAAC1oHgPANDIZbPZyGQyScfIO5nMmkq3KS+dTkcqlUo6BgAAAAA0e4r3AACNXCaTicGDz0o6Rl7buHQzFY0aNT5/ZsQO2jGiVUHSKch3pWVWaQAAAACaJMV7AAAgP7QqiJTiPTXIJh0AAAAAoJ4o3gMANCGHnZKNFj7h5WT/U+WzKnx569dFPPuAXwoAAAA
A5BOndgEAmpAWLSNatko6BQAAAAAAm8ualAAAAAAAAACQMMV7AAAAAAAAAEiY4j0AAAAAAAAAJEzxHgAAAAAAAAAS1jLpAAAAAAAAjVk2m41MJpN0jMhk1lS6naR0Oh2pVCrpGAAAjYLiPQAAAADAFspmszF8+DUxe/bMpKOUM2TI4KQjREREr167xtChwxXwAQBqwbL5AAAAAAB1oC4NAMDWYOY9AAAAAMAWSqVSMXTo8LxYNn+D7H/+nx9XFFg2HwCg9hTvASDy5/6E+SYf75eYj5yMAgCA5i2VSkVhYWHSMQAAaOQU7wEgIjKZTAwefFbSMfJavtwvMR+NGjXeiToAAAAAAOpE8R4AAMgPpdnIRlnSKfJH9j9L3lrZo7zSbM19AAAAABohxXsA+Ixzj49o5V/IHLWjypWui/jTpKRTQBMzdn7SCQAAAAAgMUoTAPAZrVpGtGqpUk1N8nPm5/p1SSegMTBOAAAAACD/KN4DW002m41MJpN0jLyTyaypdJv/SqfTkTKtG7aKZx/wd4nGJZ1Ox6hR45OOkXcymTUxZMjgiIgYOXJUpNOFCSfKT+l0OukIAAAAAFuN4j2w1WQymRg8+KykY+S1jSfhKW/UqPFRWKgoAdAcpVIp/wbUIJ0u9DsCAAAAaAYU7/PQihXLk46QU1aWjZKSVUnHyEtFRW2joCA/Zjduu+12SUcAIE8cdko2WviERw3Wr7NKAwAAAADkG6d285CZuWyu8eMfSDpCBSO+3jpat0g6Rf7IZjfcG9vS8P+1dn3E1U+tTToGNDktWka0bJV0CgAAAAAANpfiPVAvWreISLdUqP4vv4uKskkHAAAAAAAAyBuK93lo5MhRSUfIsWx+1fJp2XwAAAAAAACgcVO8z0P5dv/y9u3bJx0BAAAAAAAAoEkrSDoAAAAAAAAAADR3Zt4DAAAAkKhsNhuZTCbpGBERkcmsqXQ7Sel0OlIpt+4DAICmTvEeAD6jdF1ERDbpGOS5DeMEAIC6ymazMXz4NTF79syko1QwZMjgpCNERESvXrvG0KHDFfABAKCJU7wHgM/406SkEwAAQPOiJg0AAKB4DwDQpKy3IkA52f8soqEgUJ5xAkA+SaVSMXTo8LxZNn+DjStx5ceHCMvmAwBA86B4DwCfce7xEa38C0kNStfl5yoNzz7gpC4A0PikUqkoLCxMOgYAAECilCYA4DNatYxo1VIBlJpka+4CAAAAAAC1pHgP1IvMOkUtqmeMwNaTTqdj1KjxScfIO5nMmhgyZHBERIwcOSrSabP5KpNOp5OOAAAAAACE4j1QT378dGnSEQCaDcvM1iydLvQ7AgAAAADyWkHSAQAAAAAAAACguTPzHqgXNxzZKtLuGU41MuuyVmgAAAAAAAD4D8V7oF6kW6YU7wEAAAAAAKCWFO8B4DNK10VEZJOOkTey//lVpFyPU86GcQIAAAAAAFuH4j0AfMafJiWdAAAAAAAAaG4Kkg4AAAAAAAAAAM2dmfcAEBHpdDpGjRqfdIy8k8msiSFDBkdExMiRoyKdLkw4UX5Kp9NJRwAAAAAAoJFTvAeAiEilUlFYqDBdnXS60O8IAAAAAADqiWXzAQAAAAAAACBhZt4D9WLt+oiIbNIx8kY2u+F3kUqlEk6SPzaMEQAAtpZsNhuZTCbpGBGx4dY7lW0nKZ1O+zwOAABAXlO8B+rF1U+tTToCAAA0G9lsNoYPvyZmz56ZdJQKhgwZnHSEiIjo1WvXGDp0uAI+AAAAecuy+QAAANAEqEkDAABA42bmPbDVpNPpGDVqfNIx8k4msyY322jkyFGRThcmnCj/pNPppCMAADRqqVQqhg4dnjfL5m+w8TZa+XFVgWXzAQAAyHeK98BWk0qlorBQYbo66XSh3xEAAPXC53EAAABo3CybDwAAAAAAAAAJU7wHAAAAAAAAgIQp3gMAAAAAAABAwhTvAQAAAAAAACBhivcAAAAAAAAAkLCWSQcAAADIF9lsNjKZTNI
xIpNZU+l20tLpdKRSqaRjAAAAADRJivcAAGxVip9VU/jMb9lsNoYPvyZmz56ZdJRyhgwZnHSEnF69do2hQ4cbxwAAAAD1QPEeaLIUj6qmeJT/jN+qGb/5TfGzegqf+c8fDQAAAABJUbwHmiTFo+opHuU347d6xm/+80dDY5VKpWLo0OF5cfHUBtn//D9//lK5gAoAAACg/ijeA02W88o0ZsYvjZXiZ/UUPvNfKpWKwsLCpGMAAAAA0AylstlstuZu1Nb69WXx8cefJB0DiPxZdnwDxSM2j/FbNeMXAAAAAIDGokOHNtGiRUGt+pp5DzRZZs7RmBm/AAAAAADQvNSuxA8AAAAAAAAA1JtmM/O+rKwsJkyYEBMnToyZM2fG6tWro1OnTrH33nvHaaedFvvuu2/SEQEAAAAAAABopprFPe9XrVoV3//+92PatGmVtqdSqTjnnHPi6quvrvNruec9AAAAAAAAABHueV9ONpuNyy67LFe4P+igg+L000+Pjh07xrvvvhujRo2K4uLiuPPOO6NDhw5xwQUXJJwYAAAAAAAAgOamyc+8nzRpUvzoRz+KiIiTTjopbrjhhnLty5cvjzPOOCPee++9SKfT8de//jV22GGHLX49M+8BAAAAAAAAiNi8mfe169WI3XnnnRERUVRUFFdddVWF9u222y6uvfbaiIjIZDIxbty4Bs0HAAAAAAAAAE26eD9//vx45513IiLi8MMPj+22267Sfvvss0984QtfiIiIxx9/vKHiAQAAAAAAAEBENPHi/auvvprbPuCAA6rtu99++0VERHFxccybN69ecwEAAAAAAADAppp08f69997Lbffo0aPavjvttFNue/bs2fUVCQAAAAAAAAAqaNLF+w8++CC33a1bt2r7du3atdLHAQAAAAAAAEB9a9LF+xUrVuS227RpU23fbbbZJre9atWqessEAAAAAAAAAJ/VMukA9Wnt2rW57cLCwmr7btq+6eM2V0FBKjp0qP5CAQAAAAAAAACavoKCVK37NunifYsWLXLbqVT1v5RsNpvbLijY8gUJUqlUtGhR+z8AAAAAAAAAAGjSy+ZvuhT+mjVrqu2byWRy261bt663TAAAAAAAAADwWU26eL/pfe4//fTTavuuXr06t73tttvWWyYAAAAAAAAA+KwmXbzv3r17bnvRokXV9t20vUuXLvWWCQAAAAAAAAA+q0kX73v27JnbnjdvXrV958+fn9v+0pe+VG+ZAAAAAAAAAOCzmnTxvl+/fpFKpSIi4pVXXqm277Rp0yIiomvXrrHjjjvWezYAAAAAAAAA2KhJF++7du0a/fr1i4iIJ554IkpKSirt98orr8ScOXMiIuLoo49uqHgAAAAAAAAAEBFNvHgfEXHWWWdFRMTy5ctj2LBhUVZWVq59xYoVMWzYsIiIaNWqVZx55pkNnhEAAAAAAACA5i2VzWazSYeob+edd168+OKLERGxzz77xNlnnx1dunSJmTNnxu9///soLi6OiIgrrrgivve97yUZFQAAAAAAAIBmqFkU70tKSuLCCy+Mf/7zn1X2Oeecc+Lqq6+OVCrVgMkAAAAAAAAAoJkU7yMiysrKYuLEiTFp0qSYMWNGrFq1Ktq3bx977bVXnHHGGXHAAQckHREAAAAAAACAZqrZFO8BAAAAAAAAIF8VJB0AAAAAAAAAAJo7xXsAAAAAAAAASFjLpANAdRYvXhzjx4+P5557LhYsWBAREV26dImDDjooTjnllNh1110TTgibZ+XKlXHcccfFhx9+GAMGDIhf/epXSUeCCp577rm44IILatX3oIMOitGjR9dzIth8s2fPjnvvvTdeeuml+OCDD6KsrCy6d+8eBx98cJxzzjnRtWvXpCNCzm233RYjR47c7MedeOKJMWLEiHpIBFvm008/jfvuuy+efPLJmD17dqxevTratWsXu+++e5xwwglx7LHHRosWLZKOCZVatmxZjB8/PqZOnRrz5s2LtWvXRrd
u3eKrX/1qnHnmmfHFL34x6YhQzjXXXBP3339/XHjhhXH55ZdX27esrCwmTJgQEydOjJkzZ8bq1aujU6dOsffee8dpp50W++67bwOlhv/anDG80dq1a+Pkk0+OWbNmxZ133hlf/epX6zklVLQ5Y3fp0qVxzz33xIsvvhhz5syJ1atXR1FRUfTs2TOOPPLIOPXUU2ObbbZpoOSwweaMYTW6ZCjek7eeeuqpuOqqq6KkpKTc8Tlz5sScOXPinnvuiQsvvDAuueSShBLC5hs+fHh8+OGHSceAas2YMSPpCFAnv/3tb+P222+PdevWlTv+/vvvx/vvvx8PPvhg/OY3v4lDDz00oYSwdbRq1SrpCJAzd+7cuPDCC2POnDnljn/00UfxwgsvxAsvvBD3339/jBw5MrbddtuEUkLlXnzxxbjiiitixYoV5Y7PnTs35s6dG/fff39ceeWVMWjQoIQSQnlPPvlk3H///bXqu2rVqvj+978f06ZNK3d84cKFsXDhwnj00UfjnHPOiauvvro+okKlNmcMb+rGG2+MWbNm1UMiqJ3NGbtPPfVUXH311bFq1apyx5ctWxbTpk2LadOmxbhx4+L222+P3XbbrT7iQgWbO4bV6JKheE9eev311+Oyyy6L0tLSaNGiRZx66qlxyCGHRFFRUbzzzjsxatSoWLp0aYwcOTLatGkT5557btKRoUZPPfVUTJo0KekYUKN33nknIiI6duwYf/zjH6vtW1RU1BCRoNZGjhwZt912W0REtG/fPs4999zYa6+9Yt26dfH444/H/fffHyUlJXHppZfGww8/bBYdeeG0006Lr3/96zX2W7BgQVx++eVRWloanTp1iosvvrgB0kHNVq9eHeeff37Mnz8/IiL23XffOO2006Jr164xd+7c+NOf/hTvvfdeTJs2LYYMGRLjxo2LVCqVcGrY4NVXX40LL7wwSktLIyJys4i6du0a8+bNi3HjxsVbb70V119/faxcudLJSRL33HPP1XqWcjabjcsuuyxXuD/ooIPi9NNPj44dO8a7774bo0aNiuLi4rjzzjujQ4cOtV6BDepic8bwpm655ZYYN25cPSSC2tmcsTtt2rRcfaNVq1Zx6qmnxmGHHRbbbbddLFq0KCZMmBBTp06N4uLiOPfcc+Phhx+2QiD1bnPGsBpdshTvyUvXXXdd7ovzrbfeWu5k5n777RcDBgyIE044IZYsWRK33XZbnHzyyWZvkNc+/vjjGDZsWNIxoFbefffdiIjYfffdXflLo/Luu+/GHXfcERER3bt3j7Fjx8ZOO+2Uaz/wwANj9913j5/97GexZs2auOWWW+LWW29NKi7kdOrUKTp16lRtn7Vr18ZPfvKTKC0tjYKCgvj1r38dO+ywQwMlhOrddddducL98ccfH//7v/+bK85/5StfiQEDBsT3vve9eOmll2LatGnxxBNPxDe/+c0kI0NERKxbty6uvvrq3PmHiy++OC699NJce9++feOYY46JH/zgB/H444/HHXfcEUceeWTsvvvuSUWmmRszZkz86le/yo3ZmkyePDlexdk+ywAAJZtJREFUfPHFiIg46aST4oYbbsi19evXL4455pg444wz4r333ouRI0fG8ccf7/MF9Wpzx3DEhtvyDBs2LB555JF6TAbV25yxm81m49prr80V7kePHh37779/rr1Pnz5x9NFHx+233x633nprfPzxx/GrX/0qfv3rX9fnj0Azt7nvv2p0ySpIOgB81r/+9a/crM+jjz660llI22+/fZx33nkRsWGWx7PPPtuQEWGzXXvttbF06dLo0KFD0lGgWiUlJTFv3ryICCclaXRuvfXWWLduXaRSqbj55pvLFe43+va3vx29evWKiIhnnnkm1qxZ09AxYYuMHDky9xn53HPPLXfyB5L23HPP5bavvvrqCrPqW7duHVdeeWVu/+mnn26wbFCdZ599NvfZ96tf/Wq5wv1GLVu2jOuvvz622267WL9+fdx4440
NHRNytya54YYbcjPgauPOO++MiA0rpl111VUV2rfbbru49tprIyIik8mY1Uy92dIx/Pe//z2+9a1v5Qr3tX0cbC1bMnbfeOONeO+99yJiwyprVX13+/73v587P/HXv/41Vq9evfWCw39syRhWo0ue4j15Z+3atfH1r389dt555/jGN75RZb9ddtklt71o0aKGiAZb5C9/+Us8/vjjUVBQEEOHDk06DlRrxowZkc1mIyLMuqdRWbZsWbzwwgsRseGLRZ8+farse95558Wpp54a5557ri/HNAozZsyI0aNHR0TEzjvvbMlm8s7SpUsjIqJdu3ax/fbbV9rnC1/4Qm57yZIlDZILavL3v/89t3322WdX2a9Nmza51SJefvnl+Oijj+o9G2x09913R//+/WPq1KkREfGlL30pV3Cvzvz583Mn3g8//PDYbrvtKu23zz775N6jH3/88a0TGjaxpWP4Bz/4QZxzzjm5IuhRRx0VgwYNqtessKktHbv//Oc/c9tHHnlklf1SqVR87Wtfi4gNNZF///vfdUwM5W3pGFajS55l88k7e++9d+y999419isuLs5td+7cuT4jwRZbunRpXHfddRERcc4550Tfvn0TTgTV23hyJyJijz32SDAJbJ6XXnopt5xX//79q+07cODAGDhwYAOkgq1j+PDhsW7duoiIGDp0aBQWFiacCMrr3LlzzJ07N1auXBlLliyp9DYQm56MtCQz+WLT8wo1fVfr2bNnRESUlZXFG2+8Ue3JeNiapk+fHqWlpdG6dev47ne/GxdffHG88cYbNT7u1VdfzW0fcMAB1fbdb7/9Ys6cOVFcXBzz5s2LnXfeua6xIWdLx/Drr78eERtmd1555ZUxcODAuO222+o5LfzXlo7dPn36xIUXXhiLFy8udwFrZTZOoInYsAIKbE1bOobV6JKneE+j9PHHH8ef/vSniIjYZptt4vDDD084EVRu2LBhsWzZsvjCF74Ql112mVlG5L2N97tv27ZtrF+/Pm644YZ48cUXY/78+dGyZcv4/Oc/H0ceeWScffbZ0a5du4TTwn/NmDEjt73prPuysrJYsmRJfPLJJ9GlS5do06ZNEvFgiz311FPxyiuvRETE1772tTj00EMTTgQVHXnkkTFt2rSIiPj1r38dI0aMKNe+fv36+NWvfpXbP/bYYxs0H1Rl03t+brPNNtX2bdnyv6fQ5s6dW1+RoIJ0Oh2nnHJKXHTRRdG9e/daP27jbOWIiB49elTbd9PbTc2ePVvxnq1qS8dwx44d4+STT45zzjnH9zgSsaVj94ADDqjxoqmN/vGPf+S2N+c1oDa2dAzXhhpd/VK8p9HIZDKxYMGCePrpp2PcuHGxZMmSSKVScc0110T79u2TjgcVTJw4MZ566qkoKCiIG264IdLpdNKRoEYbZ96XlpZG//79y53QzGQy8c4778Q777wTd911V9x2222x7777JhUVypk9e3ZERLRq1So6d+4cS5cujdtuuy0ee+yxWLFiRURsuD/ivvvuG5dccknss88+ScaFWtt0dlFl92KGfHD66afH008/HdOmTYsJEybEokWL4tRTT42uXbvG/PnzY+zYsfH2229HxIb7fh5yyCEJJ4YNNj2X8MEHH1Rb4Nx0KVAXZdOQhg0bFgUFm3/n0w8++CC33a1bt2r7du3atdLHwdawpWP43nvv3aLHwdaypWO3tp577rncJJpevXpZnYqtbmuPYTW6hqN4T6Mwffr0+Na3vlXu2A477BA///nPXdFDXlq8eHH8z//8T0RsWC5/r732SjgR1Gzt2rXx/vvvR0TEmjVrom3btnHOOefE/vvvH+3atYs5c+bEQw89FNOmTYtly5bFeeedF3/+859jzz33TDg5RCxfvjwiIoqKiuKNN96ICy+8MHdso/Xr18fLL78c//jHP+LKK6+Mc889t+GDwmZ46aWXcqtK7LffftGvX79kA0EV0ul0jBo1KkaPHh133nlnvPzyy/H
yyy+X69OpU6e48sor4/jjj08oJVTUr1+/mDx5ckRE/PWvf40LLrigyr7PPPNMbnv16tX1ng022tKT7hsvYI2IGmctb7ryxKpVq7bo9aAqWzqGFe5JWn2OwY8//jiGDRuW2z/vvPPq7bVovrbmGFaja1j+BaRRWLhwYYVjS5Ysifvuuy/+9a9/JZAIqjd06NBYuXJl9OjRI/7f//t/SceBWpk9e3Zupn2PHj1i4sSJ8cMf/jAOPvjg6Nu3bwwcODDGjx+fO6mZyWTiyiuvjLKysiRjQ0REfPLJJxGxYVxeeOGFsWLFijjrrLPi0UcfjenTp8dzzz0XV199dWyzzTaRzWbjl7/8ZfzlL39JODVU784778xtn3/++QkmgZq999578e6771ZZ1Fy6dGk89thjuQsFIR9885vfjMLCwoiI+P3vf19umfFNjRs3LmbNmpXbX7duXYPkg7pYu3ZtbnvjOK/Kpu2bPg6Are+TTz6Jiy66KLeqz3777ecCV/KeGl3DUrynUejRo0f8/ve/jwceeCBuv/32OPbYY2P9+vUxderUOPPMM+OFF15IOiLkPPDAA/H888/nlsuv6Usy5Isvf/nL8eSTT8add94Zo0ePjh133LHSfldccUVuNYn3338/nn322QZMCZX79NNPI2LDTLjly5fH8OHDY+jQofGlL30pWrduHTvssEN897vfjTvvvDNatWoVEREjRoyITCaTZGyo0vvvv5/7jLvrrru61z157dlnn40zzjgjnnzyydh2223j5z//ebzwwgsxffr0eOKJJ+Liiy+OVq1axTPPPBPf+c53nNwhb3Ts2DG+//3vR0RESUlJfOc734nx48fHhx9+GKWlpfHvf/87fvGLX8T1118fXbp0yT1u42cJyGctWrTIbadSqWr7ZrPZ3LbZzgD1Z9WqVXH++efHG2+8EREbZi7/5je/8d5L3lOja1jeEWgUdt111zjssMOiT58+8fWvfz1uuummuP766yNiw8n6H/7wh1FSUpJwSthwBdqIESMiImLQoEGx9957J5wIaq9Fixax8847x1e/+tUqC/cRG078fPvb387tv/TSSw0RD6q16YVSBx54YJxyyimV9uvXr19uma/Fixcbv+StKVOm5E6kn3TSSQmngap9+OGHcfnll8eaNWuiffv2cd9998Xpp58enTt3jtatW0ePHj3i0ksvjT/+8Y/RqlWrWL58eVxyySUuniJvXHDBBfGd73wnIjYsM/6LX/wiDj744Nhzzz3jmGOOifHjx0f37t3j17/+de4xmy4xDvlq03G6Zs2aavtu+p7cunXressE0Jx9+OGHcdZZZ8Vrr70WERsuIvzTn/4UnTp1SjgZ1EyNrmEp3tNonXzyyXH00UdHxIb73D7xxBMJJ6K5y2az8dOf/jRKSkqiR48ecdlllyUdCerNbrvtltsuLi5OMAlsUFRUlNs+6qijqu17xBFH5LY3Xu0O+ebJJ5+MiA0XTB1zzDEJp4GqTZw4MbdU/qWXXho777xzpf3233//OOOMMyJiwwWvTz/9dINlhOqkUqkYNmxYjBw5Mnr37l1uhnKnTp1i8ODB8cgjj0S7du1yxzt27JhEVNgsm97nfuMqVVXZ9JYn2267bb1lAmiuZsyYEaecckq8++67EbFhxv24cePii1/8YsLJYMup0dWflkkHgLo46qijcm8IG//hg6Tcd999uRmcZ599dsyZM6dCnw8//DC3vXLlyty47dixo6ssaVTcE5F8s+l76A477FBt327duuW2ly1bVm+ZYEvNnTs3Zs+eHRER++yzT7mlmiHfvPXWW7ntI488stq+3/jGN2LMmDERseHiqWOPPbY+o8Fm+cY3vhHf+MY3YuXKlbFkyZJo27ZtdOrUKVfMf//993N9q1ulCvJF9+7dc9uLFi2q9vPExvsuR4TPHQBb2XPPPReXXXZZ7kKpXXbZJf74xz+We5+GxkqNrn4o3pN3Vq1aFfPmzYsFCxbEUUc
dVe19ubbbbrvcdmlpaQOkg6ptOnvzuuuuq7H/c889F88991xERAwZMiQuueSS+ooGtfLOO+/EggUL4qOPPoqBAwfG5z73uSr7fvTRR7ltM4/IB7vuumv89a9/jYgNS95WZ9MLTjadRQf5YtMZyWbdk+82na3Ztm3bavtuv/32ue1Vq1bVWyaoi3bt2lX6+eD111/Pbe++++4NGQm2SM+ePXPb8+bNi379+lXZd/78+bntL33pS/UZC6BZmTBhQgwdOjTWrVsXERF777133HHHHeXqGpBv1OiSp3hP3rnuuuti0qRJEbFhCcZNl2b+rHnz5uW2a5plB0D1Ro0aFX/5y18iIqJHjx5x4IEHVtn31VdfzW336dOn3rNBTTY9Gfnqq6/GiSeeWGXfjTOaI8ycIz/985//zG3vv//+CSaBmrVv3z63PW/evPjyl79cZd/Fixfntjct5ENS5s+fHw899FB89NFHcfLJJ1dZ3Mxms7kLq3beeefYaaedGjAlbJl+/fpFKpWKbDYbr7zyShx//PFV9p02bVpERHTt2tXnY4Ct5OGHH46f/OQnkc1mI2LDhdn/+7//G61bt044GVRPjS557nlP3tl3331z2w8++GCV/crKysq1H3TQQfWaC2oyYsSImDlzZrX/bTqTbsCAAbnjZt2TDw444IDc9sSJE6vs9+mnn8a9994bERGtWrWq8f7i0BAOPPDA3NL5jz32WCxdurTKvhMmTIiIiBYtWsQRRxzRIPlgc2xczadt27bugUje22+//XLbjzzySLV9J0+enNve9HsfJKW0tDTuuOOOuP/++6v9/PuXv/wliouLIyJi4MCBDRMO6qhr1665C1KeeOKJKCkpqbTfK6+8krvt38b71gJQN//85z9j6NChucL9mWeeGTfddJPCPY2CGl3yFO/JO8cee2xu9sZ9990Xf//73yv0yWazcf3118fbb78dERFf+9rXonfv3g2aE6CpOfbYY3NLHU2aNCmeeuqpCn1KS0vjqquuyp28/M53vlPuXuOQlBYtWsR5550XERElJSXxwx/+MD755JMK/caOHZv7bPGNb3wjOnfu3KA5oSaLFy+OZcuWRURE7969q12eDvLBcccdFx06dIiIDe+xU6dOrbTf5MmT46GHHoqIDff5/NrXvtZgGaEqu+yyS/Tq1SsiNsyO23R1no1mzpwZ1157bURsWDHizDPPbNCMUBdnnXVWREQsX748hg0bFmVlZeXaV6xYEcOGDYuIDRdmG98AdVdSUhI/+tGPYv369RERcfLJJ8c111zjux2Nhhpd8iybT94pKiqKa6+9Ni677LIoLS2Nc889N0455ZQ49NBDo2PHjjFnzpy49957c/eb22GHHeL6669PODVA49e2bdsYNmxYXHHFFVFWVhaXXnppnHLKKXHUUUdFUVFRzJo1K8aNGxezZs2KiA3L5V9++eUJp4b/GjRoUDz77LPx8ssvx9///vc48cQTY9CgQbHbbrvFqlWrYtKkSTFlypSIiOjQoUP87Gc/SzgxVDR37tzc9s4775xcEKiloqKi+J//+Z+4+OKLY/369XHRRRfFcccdF8ccc0x07tw5Pvzww3jsscfi0UcfjWw2G4WFhTFixIho2dLpCPLDD37wg/je974XmUwmzjzzzBg8eHD07ds31q1bFy+++GLcfffd8emnn0aLFi3ihhtuiG233TbpyFBrxx13XDz88MPx4osvxpQpU+KDDz6Is88+O7p06RIzZ86M3//+97kLsy+55BK3hADYCu66665YtGhRRER06tQpTj311Hj33XdrfFzXrl3L3T8ckqJGlzzflslLRx99dNx4441xzTXXxOrVq+O+++6L++67r0K/PffcM2655Rb30gDYSo499thYu3Zt/PznP88tj79xifxNHXTQQXHTTTfF5z73uQRSQuUKCgri97//fVx11VXx+OOPx//93//FddddV6Ffjx494vbbb3e/ZfLSxpM8ERtO3kB
jcMQRR8Stt94aV199dZSUlMSUKVNyF0ttqlOnTnHTTTdF3759E0gJlTvssMPiJz/5Sfzyl7+M5cuXx4033lihT9u2bWPEiBFx6KGHJpAQ6uaWW26JCy+8MP75z3/GK6+8Eq+88kqFPuecc05ccMEFCaQDaHo2PY+2ZMmS+Pa3v12rx91www1x0kkn1Vcs2CxqdMlSvCdv9e/fP/bdd9+4++674/nnn4958+bF2rVro3379tGnT5847rjj4pvf/GYUFLj7A8DWNHDgwNh///3j7rvvjhdffDH3/tuxY8fo06dPnHDCCXHkkUcmHRMqVVhYGLfccku89NJL8dBDD8Vrr70WS5cujbZt28bnP//56N+/fwwcODDatGmTdFSo1Ka3e/Dll8bkG9/4RnzlK1+Je+65J55//vmYM2dOfPLJJ9G2bdvo2bNnHHHEEXHqqad6/yUvDRo0KPbZZ58YN25cTJs2LZYsWRKtWrWKHj16xGGHHRZnnnmmi/5otIqKimLcuHExceLEmDRpUsyYMSNWrVoV7du3j7322ivOOOOMOOCAA5KOCdAkfPzxx+UuyIbGTI0uOalsNptNOgQAAAAAAAAANGcuhwAAAAAAAACAhCneAwAAAAAAAEDCFO8BAAAAAAAAIGGK9wAAAAAAAACQMMV7AAAAAAAAAEiY4j0AAAAAAAAAJEzxHgAAAAAAAAASpngPAAAAAAAAAAlTvAcAAAAAAACAhLVMOgAAAABQs+Li4njvvfdi4cKFUVJSEqWlpbHNNttE586dY5dddolevXpFQYFr9AEAAKCxUrwHAACAPPXOO+/EAw88EFOnTo1FixZV27dt27Zx+OGHx6mnnhr77rtvAyXMH0cccUQUFxdHRMR+++0X48ePTzgRAAAAbB7FewAAAMgz77//fvziF7+Il156qdaPWbVqVUyaNCkmTZoU+++/f/z85z+PXXbZpR5TAgAAAFuT4j0AAADkkbvuuit++ctfxtq1a8sdT6VSseeee8bOO+8cHTp0iM997nOxdOnS+L//+7944403Yv369bm+//jHP+Jb3/pW3HLLLXHwwQc39I8AAAAAbAHFewAAAMgTN998c9xxxx3ljnXq1Ckuuuii+OY3vxnbb799pY/7+OOPY8KECXHHHXfEqlWrIiLik08+iSFDhsRdd90VvXv3rvfsAAAAQN0UJB0AAAAAiFzxfVMnnXRSPPHEE3HGGWdUWbiPiOjQoUOcd9558cQTT0Tfvn1zx9esWRM/+tGPorS0tN5yAwAAAFuH4j0AAAAkbP78+TFs2LByx7773e/GDTfcEG3atKn182y//fbxxz/+sdy97ufMmRMPPvjgVssKAAAA1A/FewAAAEjYjTfeGJlMJre///77x5VXXrlFz9WuXbv4+c9/Xu7YXXfdVZd4AAAAQANQvAcAAIAEvf/++/HEE0/k9lu2bBm/+MUvoqBgy7+y77///tGvX7/c/nvvvRezZs2qS0wAAACgnrVMOgAAAAA0Z/fee2+5/eOPPz523nnnOj/vCSecEG+88UZ07949Dj300Ein0xX6nHXWWTFt2rSIiLjwwgvj8ssvj+XLl8fIkSPjr3/9a6xcuTK6dOkSffr0ieOOOy4OO+ywCs+RzWbjb3/7W7z88svx6quvxuLFi2PFihXx6aefRjqdjo4dO0bPnj3ja1/7WpxwwglRVFS0WT/HK6+8Eo888ki8+uqrsWjRoshms9GlS5fYZ5994pRTTil3kcLmWr9+fTz99NPx7LPPxhtvvBEfffRRfPLJJ9GhQ4fo3r17HHTQQdG/f//4/Oc/v8WvAQAAALWVymaz2aRDAAAAQHN18MEHx4cffpjbHz9+fOy33351ft4VK1bE4sWLo1evXlX2+Wzx/vzzz4/TTjst3nvvvQp9DzjggBg7dmy5Y08++WTcdNNN8f7779cq07bbbhtXXHFFnHbaaTX2Xbx4cQwdOjSef/75avudeuqp8dOf/jSOPfbYKC4
ujoiI/fbbL8aPH1/t41544YW44YYbaszesmXLOPXUU+NHP/pRbLPNNjXmBgAAgC1l5j0AAAAkZMaMGeUK9+3atYt99tlnqzz3tttuG9tuu+1mPeYXv/hFpYX7iIijjjqq3P4dd9wRN99882Y9/4oVK2LYsGFRWloaZ511VpX95syZE2eddVYsWbKkxue8//77Y8mSJbFu3bpa57jzzjvjf//3f6OsrKzGvuvWrYs///nP8eabb8bvfve76Ny5c61fBwAAADaH4j0AAAAk5K233iq3369fvzrd674u3njjjXj55ZcrbUulUvH1r389t//888+XK9y3bt06vvOd78Q3v/nN2GWXXaKoqCjWrFkT8+fPj7/97W8xZsyYchcp/OY3v4kTTjgh2rVrV+G1Pv3007jwwgvLFe47d+4cF110URx++OGx/fbbx9KlS+O5556LO+64IxYvXhxTp06t9c/54IMPxogRI8odO/LII+Pb3/529O7dO4qKimLp0qXxt7/9Lf74xz/G3LlzIyLi7bffjosuuijuueeeaN26da1fDwAAAGpL8R4AAAAS8tkl23fZZZeEkkSucL/99tvHD3/4wzjssMMilUrF9OnTY9q0adGlS5dc3+uvvz633aJFi/jDH/4QBx54YLnna9OmTXz5y1+OL3/5y3HyySfHaaedFnPmzImIiNWrV8czzzwTAwcOrJDjd7/7Xa5gHhHRu3fvGD16dLlVBLp16xann3569O/fPwYPHhyvv/56rX7GuXPnxvDhw3P7LVu2jF/+8pfRv3//cv26desWp5xySpxwwglx9dVXx6OPPhoREf/617/i5ptvjiuvvLJWrwcAAACbI5nL+QEAAID44IMPyu137do1oSQbFBYWxtixY+Okk06KDh06RPv27eOQQw6JH/7wh7k+77zzTq4IHxHxrW99q0Lh/rO22267+N73vlfu2MyZMyv0+/TTT+Ouu+7K7bdr1y5uv/32Kpf/b9u2bfzud7+Ljh071urnGzVqVKxZsya3/5Of/KRC4X5TrVu3jhtvvDH22muv3LE///nPsXz58lq9HgAAAGwOxXsAAABIyCeffFJuv02bNgkl2eCkk06Knj17VtuntLQ0Tj311DjwwANjp512ilNPPbVWz7377ruX21+5cmWFPlOnTo2SkpLc/hlnnFFuxn9ltttuu7joootqfP2VK1fGpEmTcvtf/OIX4/TTT6/xcS1atIghQ4bk9j/99NOYMGFCjY8DAACAzWXZfAAAAEjI+vXry+1ns9laP/ass86KadOm1bp/9+7d45lnnqm2zyGHHFLj8/Tt2zf69u1b69fd6LMXJqxbt65Cn5deeqnc/jHHHFOr5x4wYEBcf/31FX6fm3r11Vdj7dq1uf1jjz02CgpqN6fhgAMOiG222SZWr14dERH/+Mc/4rvf/W6tHgsAAAC1pXgPAAAACflsQXtjcTgp/fr126rPt2TJkpg5c2a8+eab8fzzz5drKysrq9D/nXfeyW0XFhbGl770pVq9zrbbbhuf//zn49///neVfV577bVy+7vttlutnjsiomXLlvHFL34xpk+fHhERb7zxRq0fCwAAALWleA8AAAAJ6dChQ7n9pUuXJpRkw/3d27dvv9mPy2az8e6778bbb78dc+fOjfnz58eCBQtiwYIFsWLFimof91nFxcW57a5du0aLFi1qnWOXXXaptnj/wQcflNv//ve/X+vn/qxly5bFunXromVLp1UAAADYenzLBAAAgITssssu5fbnzZtX68eOHz++xj677rprrZ9v2223rXXfiA2rBIwZMybuv//+WLRoUY39W7ZsWelS+Zv65JNPcttFRUWbladt27bVtq9cuXKznq8mK1asiO23336rPicAAADNm+I9AAAAJGSPPfYot5/kcuytW7eudd/3338/vve978X8+fOr7FNYWBg9e/aMPn36xH777Rdf+MIX4vjjj98aUStVU/6aLhzYXGvWrNmqzwc
AAACK9wAAAJCQfv36RVFRUZSUlERExOLFi2P27NnRs2fPhJNVbfny5XHuueeWW4Z+m222iYMPPjj69esXPXv2jC984QvRrVu3KCgoyPWpzaoCbdq0ieXLl0dExKpVqzYr16az9ivTrl27cvt/+9vfomPHjpv1GgAAAFCfCmruAgAAANSHVq1axZFHHlnu2EMPPZRQmtq54447yhXu99tvv3jyySfj1ltvjXPPPTcOPvjg2HHHHcsV7iMid4FCdXbaaafcdnFxcaxdu7bWuT57T/vP+uwS95tziwIAAABoCIr3AAAAkKAzzzyz3P59990XH374YUJpajZp0qTcdmFhYdxyyy21msE+d+7ccvvZbLZCnz333DO3XVpaGu+8806tMq1bt67Gvr179y63//e//71Wz73Rv//97/j444836zEAAACwORTvAQAAIEF9+vSJQw89NLe/evXquO666xJMVLXly5eXK2D37NkzOnToUKvH/vWvfy23X1ZWVqHPIYccUm7/4YcfrtVzT506NVavXl1tn/322y9SqVRuf8KECbFu3bpaPf/ixYvj+OOPjwMPPDD69u1b4YILAAAA2BoU7wEAACBh11xzTRQWFub2n3zyyfjVr361xc+XzWbjD3/4w9aIVuF5N7Vs2bJaPe7pp5+Oxx57rNyx0tLSCv0OOeSQ6NKlS27/4YcfjhkzZlT73GvXro1bb721xgxdu3Ytd5HE/PnzY/To0TU+LiLixhtvzOVds2ZN7L777rV6HAAAAGwOxXsAAABI2E477RQ33HBDuZnho0aNih/84AexfPnyzXquWbNmxaBBg+LXv/71Vk4Zsd1225VbIn/BggUxefLkah/zyCOPxA9/+MMKx9esWVPhWMuWLeOSSy7J7ZeWlsZFF11UYcn9TduvvPLKmDVrVq3yX3jhhVFQ8N9TITfffHP8+c9/rvYxf/jDH8r9jIWFhXHeeefV6vUAAABgcyjeAwAAQB449thj45prrilXXJ4yZUocffTRcdttt0VxcXGVj12/fn387W9/i0suuSSOP/74+Mc//lGuvaCgIAYMGFDnjKlUqsLz/PjHP44bb7wxZsyYEZlMJlavXh3//ve/Y8KECXHmmWfGlVdeWemS9iUlJZW+xre+9a1yy+cvXLgwTjzxxBg5cmTMmTMn1q5dGx999FE8+uijceKJJ+Zm9Lds2bLG/HvttVdcfPHFuf2ysrK49tprY9CgQfHEE0/E0qVLY+3atbFgwYJ48skn4/TTT69wEcSVV15ZbnUAAAAA2FpS2c+ueQcAAAAk5plnnomf/OQnlS5J/8UvfjF222236Ny5c6TT6SgpKYm5c+fGm2++GStXrqz0+fbcc8+45pprol+/fhXazjrrrJg2bVpERHTv3j2eeeaZGvOtWLEivvWtb8W8efM26+faZZddIpvNxpw5cyIiomPHjvG3v/2t0r4lJSVx/vnnx+uvv16r595rr72ie/fuMWXKlIjYcH/78ePHV9o3m83GiBEjYsyYMZuVPyLiggsuiB/84Aeb/TgAAACoDTPvAQAAII8cccQR8dhjj8W3v/3taN26dbm2999/P6ZMmRJ/+tOf4o477ojx48fHCy+8UGnhfo899oibb745HnzwwUoL91tq2223jTFjxkSfPn1q1b9Vq1YxaNCgeOihh+KrX/1q7vjSpUvjvffeq/QxRUVFMW7cuBg0aFCNM+oPOeSQ+O1vfxutWrWqVZ5UKhU//vGP4ze/+U107969Vo/p1KlT/OY3v1G4BwAAoF6ZeQ8AAAB5aunSpfHggw/G008/HdOnT4+avsL36NEjDj300Ojfv3+tiutbMvN+o/Xr18fTTz8djz32WEyfPj0++uijWLt2bbRp0yY6dOgQPXv2jH79+sVxxx0XO+ywQ0REvPzyyzFo0KByrz906NBqX+f//u//4oEHHoiXXnop5s6dG2vXro3OnTvH7rvvHgMHDowjjzwyUqlUXH311TFhwoS
IqH7m/aZKS0vjySefjBdffDHefPPNWLp0aZSUlERhYWF07Ngx9txzzzj00EPjqKOOisLCwlr/bgAAAGBLKN4DAABAI7By5cqYNWtWzJ07N1atWhWrV6+OVq1aRfv27WPHHXeMXXfdNTp06JB0TAAAAGALKd4DAAAAAAAAQMLc8x4AAAAAAAAAEqZ4DwAAAAAAAAAJU7wHAAAAAAAAgIQp3gMAAAAAAABAwhTvAQAAAAAAACBhivcAAAAAAAAAkDDFewAAAAAAAABImOI9AAAAAAAAACRM8R4AAAAAAAAAEqZ4DwAAAAAAAAAJU7wHAAAAAAAAgIQp3gMAAAAAAABAwhTvAQAAAAAAACBhivcAAAAAAAAAkDDFewAAAAAAAABImOI9AAAAAAAAACTs/wMxt5C+XsAjYwAAAABJRU5ErkJggg==", "text/plain": [ - "
" + "
" ] }, "metadata": {}, @@ -572,10 +1696,21 @@ } ], "source": [ - "#visual of the clean_df in a heatmap to look at correlations\n", - "plt.figure(figsize=(20,20))\n", - "sns.heatmap(df.corr().abs(), annot=True)\n", - "plt.show()" + "#grade\n", + "plt.figure(figsize=(25,15))\n", + "sns.set(font_scale=2)\n", + "ax = sns.boxplot(x=\"grade\", y=\"price\", data=df)\n", + "ax.set_title('House Grade vs. Price', fontsize=50)\n", + "ax.set_ylabel('Price', fontsize=30)\n", + "ax.set_xlabel('Grade', fontsize=30)\n", + "ax.set_ylim(bottom=0, top=6000000);" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "When we look at grade, we can see that as the categorical building grade designation improves, the house price does indeed rise as well. " ] } ], From 56175458cde6fe9fd8e82672efce97783115b1fa Mon Sep 17 00:00:00 2001 From: clydeochieng <107258512+clydeochieng@users.noreply.github.com> Date: Tue, 30 Apr 2024 18:53:40 +0300 Subject: [PATCH 12/42] Update student.ipynb --- student.ipynb | 1925 ++++++++++++++++--------------------------------- 1 file changed, 622 insertions(+), 1303 deletions(-) diff --git a/student.ipynb b/student.ipynb index 1beeaa73..03ba5c12 100644 --- a/student.ipynb +++ b/student.ipynb @@ -7,7 +7,7 @@ "## Final Project Submission\n", "\n", "Please fill out:\n", - "* Student name: Solphine Joseph, Grace Rotich, Mathew Kiprotich, Hilary Simiyu, Clyde Ochieng.\n", + "* Student name: Solphine Joseph, Grace Rotich, Mathew Kiprotich, Hilary Simiyu, Derrick Kiprotich, Clyde Ochieng. \n", "* Student pace: full time\n", "* Scheduled project review date/time: \n", "* Instructor name: Nikita \n", @@ -54,18 +54,19 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "### Data Understanding:\n", + "### Hypothesis\n", + "* Null Hypothesis - There is no relationship between our independent variables and our dependent variable \n", "\n", - "The real estate agency in Kingsway is analyzing a dataset to determine the factors affecting house prices. 
The dataset likely includes features such as property size, location, age, and market trends. Key steps include assessing data quality, exploring relationships between features and prices, and preprocessing data for multilinear regression analysis. Multilinear regression will be used to model how these features collectively influence house prices, with evaluation metrics used to assess predictive accuracy." + "* Alternative Hypothesis - There is a relationship between our independent variables and our dependent variable" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The dataset utilized in this analysis is the King County Housing dataset, encompassing details on over 21,000 homes within King County. Each entry in the dataset includes information on various features such as bedroom/bathroom/floor counts, living space and lot square footage, zip code, building grade, condition, and more.\n", + "### Data Understanding:\n", "\n", - "The King County Housing Dataset comprises multiple features contributing to the final sale price of homes in King County. Descriptions of these features are provided below." + "In this project, we utilized the King County House Sales dataset, which serves as the foundational dataset for our analysis. It was sourced Kaggle.The dataset encompasses comprehensive information regarding house sales within King County, Washington, USA. It comprises a diverse array of features, including the number of bedrooms, bathrooms, square footage, as well as geographical and pricing details of the properties sold. This dataset is frequently employed in data science and machine learning endeavors, particularly for predictive modeling tasks such as regression analysis aimed at forecasting house prices based on the provided features." 
] }, { @@ -74,1139 +75,110 @@ "source": [ "##### King County Housing Data Columns \n", "\n", - "* `id` - Unique identifier for a house\n", - "* `date` - Date house was sold\n", - "* `price` - Sale price (prediction target)\n", - "* `bedrooms` - Number of bedrooms\n", - "* `bathrooms` - Number of bathrooms\n", - "* `sqft_living` - Square footage of living space in the home\n", - "* `sqft_lot` - Square footage of the lot\n", - "* `floors` - Number of floors (levels) in house\n", - "* `waterfront` - Whether the house is on a waterfront\n", - "* `view` - Quality of view from house\n", - "* `condition` - How good the overall condition of the house is. \n", - "* `grade` - Overall grade of the house. \n", - "* `sqft_above` - Square footage of house apart from basement \n", - "* `sqft_basement` - Square footage of the basement – (Ignored)\n", - "* `yr_built` - Year when house was built\n", - "* `yr_renovated` - Year when house was renovated – (Ignored)\n", - "* `zipcode` - ZIP Code used by the United States Postal Service \n", - "* `lat` - Latitude coordinate\n", - "* `long` - Longitude coordinate\n", - "* `sqft_living15` - The square footage of interior housing living space for the nearest 15 neighbors\n", - "* `sqft_lot15` - The square footage of the land lots of the nearest 15 neighbors" + "The column names contained in column_names.md are:\n", + "* `id`: A unique identifier for each house sale.\n", + "* `date`: The date when the house was sold.\n", + "* `price`: The sale price of the house, serving as the target variable for predictive modeling.\n", + "* `bedrooms`, `bathrooms`, `sqft_living`, `sqft_lot`: Numerical features representing the number of bedrooms and bathrooms, as well as the living area and lot area of the house, respectively.\n", + "* `floors`: The number of floors in the house.\n", + "* `waterfront`, `view`, `condition`, `grade`: Categorical features describing aspects such as waterfront availability, property view, condition, and overall grade assigned 
to the housing unit.\n", + "* `yr_built`, `yr_renovated`: Year of construction and renovation of the house.\n", + "* `zipcode`, `lat`, `long`: Geographical features including ZIP code, latitude, and longitude coordinates.\n", + "* `sqft_above`, `sqft_basement`, `sqft_living15`, `sqft_lot15`: Additional numerical features providing details about the house's above-ground and basement square footage, as well as living area and lot area of the nearest 15 neighboring houses." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "### Data Preparation\n", + "## Data Loading\n", "\n", - "Importing data." + "#### Import Necessary Libraries" ] }, { "cell_type": "code", - "execution_count": 98, + "execution_count": 15, "metadata": {}, "outputs": [], "source": [ - "#importing libraries \n", - "import pandas as pd\n", + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", "import numpy as np\n", - "from matplotlib import pyplot as plt\n", + "import pandas as pd\n", + "import scipy.stats as stats\n", "import seaborn as sns\n", - "from sklearn.preprocessing import OneHotEncoder, StandardScaler\n", - "from sklearn.linear_model import LinearRegression\n", - "import statsmodels.api as sm\n", - "from statsmodels.formula.api import ols\n", - "from scipy import stats\n", - "import warnings\n", - "warnings.filterwarnings('ignore')" - ] - }, - { - "cell_type": "code", - "execution_count": 99, - "metadata": {}, - "outputs": [], - "source": [ - "df = pd.read_csv('data/kc_house_data.csv')" - ] - }, - { - "cell_type": "code", - "execution_count": 100, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(21597, 21)" - ] - }, - "execution_count": 100, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.shape" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The dataset contains 21,597 houses with 21 features." 
- ] - }, - { - "cell_type": "code", - "execution_count": 101, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "RangeIndex: 21597 entries, 0 to 21596\n", - "Data columns (total 21 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 id 21597 non-null int64 \n", - " 1 date 21597 non-null object \n", - " 2 price 21597 non-null float64\n", - " 3 bedrooms 21597 non-null int64 \n", - " 4 bathrooms 21597 non-null float64\n", - " 5 sqft_living 21597 non-null int64 \n", - " 6 sqft_lot 21597 non-null int64 \n", - " 7 floors 21597 non-null float64\n", - " 8 waterfront 19221 non-null object \n", - " 9 view 21534 non-null object \n", - " 10 condition 21597 non-null object \n", - " 11 grade 21597 non-null object \n", - " 12 sqft_above 21597 non-null int64 \n", - " 13 sqft_basement 21597 non-null object \n", - " 14 yr_built 21597 non-null int64 \n", - " 15 yr_renovated 17755 non-null float64\n", - " 16 zipcode 21597 non-null int64 \n", - " 17 lat 21597 non-null float64\n", - " 18 long 21597 non-null float64\n", - " 19 sqft_living15 21597 non-null int64 \n", - " 20 sqft_lot15 21597 non-null int64 \n", - "dtypes: float64(6), int64(9), object(6)\n", - "memory usage: 3.5+ MB\n" - ] - } - ], - "source": [ - "df.info()" - ] - }, - { - "cell_type": "code", - "execution_count": 102, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
idpricebedroomsbathroomssqft_livingsqft_lotfloorssqft_aboveyr_builtyr_renovatedzipcodelatlongsqft_living15sqft_lot15
count21597.0000021597.0000021597.0000021597.0000021597.0000021597.0000021597.0000021597.0000021597.0000017755.0000021597.0000021597.0000021597.0000021597.0000021597.00000
mean4580474287.77099540296.573513.373202.115832080.3218515099.408761.494101788.596841970.9996883.6367898077.9518547.56009-122.213981986.6203212758.28351
std2876735715.74778367368.140100.926300.76898918.1061341412.636880.53968827.7597629.37523399.9464153.513070.138550.14072685.2304727274.44195
min1000102.0000078000.000001.000000.50000370.00000520.000001.00000370.000001900.000000.0000098001.0000047.15590-122.51900399.00000651.00000
25%2123049175.00000322000.000003.000001.750001430.000005040.000001.000001190.000001951.000000.0000098033.0000047.47110-122.328001490.000005100.00000
50%3904930410.00000450000.000003.000002.250001910.000007618.000001.500001560.000001975.000000.0000098065.0000047.57180-122.231001840.000007620.00000
75%7308900490.00000645000.000004.000002.500002550.0000010685.000002.000002210.000001997.000000.0000098118.0000047.67800-122.125002360.0000010083.00000
max9900000190.000007700000.0000033.000008.0000013540.000001651359.000003.500009410.000002015.000002015.0000098199.0000047.77760-121.315006210.00000871200.00000
\n", - "
" - ], - "text/plain": [ - " id price bedrooms bathrooms sqft_living \\\n", - "count 21597.00000 21597.00000 21597.00000 21597.00000 21597.00000 \n", - "mean 4580474287.77099 540296.57351 3.37320 2.11583 2080.32185 \n", - "std 2876735715.74778 367368.14010 0.92630 0.76898 918.10613 \n", - "min 1000102.00000 78000.00000 1.00000 0.50000 370.00000 \n", - "25% 2123049175.00000 322000.00000 3.00000 1.75000 1430.00000 \n", - "50% 3904930410.00000 450000.00000 3.00000 2.25000 1910.00000 \n", - "75% 7308900490.00000 645000.00000 4.00000 2.50000 2550.00000 \n", - "max 9900000190.00000 7700000.00000 33.00000 8.00000 13540.00000 \n", - "\n", - " sqft_lot floors sqft_above yr_built yr_renovated \\\n", - "count 21597.00000 21597.00000 21597.00000 21597.00000 17755.00000 \n", - "mean 15099.40876 1.49410 1788.59684 1970.99968 83.63678 \n", - "std 41412.63688 0.53968 827.75976 29.37523 399.94641 \n", - "min 520.00000 1.00000 370.00000 1900.00000 0.00000 \n", - "25% 5040.00000 1.00000 1190.00000 1951.00000 0.00000 \n", - "50% 7618.00000 1.50000 1560.00000 1975.00000 0.00000 \n", - "75% 10685.00000 2.00000 2210.00000 1997.00000 0.00000 \n", - "max 1651359.00000 3.50000 9410.00000 2015.00000 2015.00000 \n", - "\n", - " zipcode lat long sqft_living15 sqft_lot15 \n", - "count 21597.00000 21597.00000 21597.00000 21597.00000 21597.00000 \n", - "mean 98077.95185 47.56009 -122.21398 1986.62032 12758.28351 \n", - "std 53.51307 0.13855 0.14072 685.23047 27274.44195 \n", - "min 98001.00000 47.15590 -122.51900 399.00000 651.00000 \n", - "25% 98033.00000 47.47110 -122.32800 1490.00000 5100.00000 \n", - "50% 98065.00000 47.57180 -122.23100 1840.00000 7620.00000 \n", - "75% 98118.00000 47.67800 -122.12500 2360.00000 10083.00000 \n", - "max 98199.00000 47.77760 -121.31500 6210.00000 871200.00000 " - ] - }, - "execution_count": 102, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 103, - "metadata": 
{}, - "outputs": [ - { - "data": { - "text/plain": [ - "count 21597.00000\n", - "mean 540296.57351\n", - "std 367368.14010\n", - "min 78000.00000\n", - "25% 322000.00000\n", - "50% 450000.00000\n", - "75% 645000.00000\n", - "max 7700000.00000\n", - "Name: price, dtype: float64" - ] - }, - "execution_count": 103, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# descriptive statistics for our target price.\n", - "df['price'].describe()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The average price of homes in the data set is 540,297 dollars. \n", - "The prices ranges from 78,000 to 8,000,000 dollars and\n", - "the median house price is 450,000 dollars" - ] - }, - { - "cell_type": "code", - "execution_count": 104, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "count 21597.00000\n", - "mean 2080.32185\n", - "std 918.10613\n", - "min 370.00000\n", - "25% 1430.00000\n", - "50% 1910.00000\n", - "75% 2550.00000\n", - "max 13540.00000\n", - "Name: sqft_living, dtype: float64" - ] - }, - "execution_count": 104, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "# descriptive statistics for square footage\n", - "df['sqft_living'].describe()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The mean square-feet of living space is 2,080 sq-ft and the range of living space ranges from 370 sq-ft to 13,540 sq-ft. The median sq footage is 1,910." 
- ] - }, - { - "cell_type": "code", - "execution_count": 105, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "bedrooms\n", - "3 9824\n", - "4 6882\n", - "2 2760\n", - "5 1601\n", - "6 272\n", - "1 196\n", - "7 38\n", - "8 13\n", - "9 6\n", - "10 3\n", - "11 1\n", - "33 1\n", - "Name: count, dtype: int64" - ] - }, - "execution_count": 105, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df['bedrooms'].value_counts()" + "import statsmodels.api as sm\n" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The bedroom counts range from 1 bedroom to 33" - ] - }, - { - "cell_type": "code", - "execution_count": 106, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "bathrooms\n", - "2.50000 5377\n", - "1.00000 3851\n", - "1.75000 3048\n", - "2.25000 2047\n", - "2.00000 1930\n", - "1.50000 1445\n", - "2.75000 1185\n", - "3.00000 753\n", - "3.50000 731\n", - "3.25000 589\n", - "3.75000 155\n", - "4.00000 136\n", - "4.50000 100\n", - "4.25000 79\n", - "0.75000 71\n", - "4.75000 23\n", - "5.00000 21\n", - "5.25000 13\n", - "5.50000 10\n", - "1.25000 9\n", - "6.00000 6\n", - "0.50000 4\n", - "5.75000 4\n", - "6.75000 2\n", - "8.00000 2\n", - "6.25000 2\n", - "6.50000 2\n", - "7.50000 1\n", - "7.75000 1\n", - "Name: count, dtype: int64" - ] - }, - "execution_count": 106, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df['bathrooms'].value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": 107, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "floors\n", - "1.00000 10673\n", - "2.00000 8235\n", - "1.50000 1910\n", - "3.00000 611\n", - "2.50000 161\n", - "3.50000 7\n", - "Name: count, dtype: int64" - ] - }, - "execution_count": 107, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df['floors'].value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": 108, - "metadata": {}, - "outputs": [ - { 
- "data": { - "text/plain": [ - "sqft_lot\n", - "5000 358\n", - "6000 290\n", - "4000 251\n", - "7200 220\n", - "4800 119\n", - " ... \n", - "22605 1\n", - "25248 1\n", - "9934 1\n", - "9142 1\n", - "1076 1\n", - "Name: count, Length: 9776, dtype: int64" - ] - }, - "execution_count": 108, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df['sqft_lot'].value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": 109, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "count 21597.00000\n", - "mean 15099.40876\n", - "std 41412.63688\n", - "min 520.00000\n", - "25% 5040.00000\n", - "50% 7618.00000\n", - "75% 10685.00000\n", - "max 1651359.00000\n", - "Name: sqft_lot, dtype: float64" - ] - }, - "execution_count": 109, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df['sqft_lot'].describe()" - ] - }, - { - "cell_type": "code", - "execution_count": 110, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "yr_built\n", - "2014 559\n", - "2006 453\n", - "2005 450\n", - "2004 433\n", - "2003 420\n", - " ... \n", - "1933 30\n", - "1901 29\n", - "1902 27\n", - "1935 24\n", - "1934 21\n", - "Name: count, Length: 116, dtype: int64" - ] - }, - "execution_count": 110, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df['yr_built'].value_counts()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The year built ranges from 1934 to 2014." 
- ] - }, - { - "cell_type": "code", - "execution_count": 111, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "condition\n", - "Average 14020\n", - "Good 5677\n", - "Very Good 1701\n", - "Fair 170\n", - "Poor 29\n", - "Name: count, dtype: int64" - ] - }, - "execution_count": 111, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df['condition'].value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": 112, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "waterfront\n", - "NO 19075\n", - "YES 146\n", - "Name: count, dtype: int64" - ] - }, - "execution_count": 112, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df['waterfront'].value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": 113, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "# examining the relationship between sqft_living and price\n", - "sns.jointplot(x='sqft_living', y='price', data=df, kind='reg')\n", - "\n", - "plt.tight_layout()" - ] - }, - { - "cell_type": "code", - "execution_count": 114, - "metadata": {}, - "outputs": [ - { - "data": { - "image/png": "", - "text/plain": [ - "
" - ] - }, - "metadata": {}, - "output_type": "display_data" - } - ], - "source": [ - "sns.jointplot(x='bathrooms', y='price', data=df, kind='reg')\n", - "plt.tight_layout()\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Data Preperation\n", - "\n", - "Data Preparation Fundamentals - Applying appropriate preprocessing and feature engineering steps to tabular data in preparation for statistical modeling\n", - "\n", - "Data Cleaning Steps\n", - "Handling Missing Values: Identify and address and missing values using techniques such as dropping or replacing data.\n", - "\n", - "Handling Non-Numeric Data: A Linear regression model needs all of the features to be numeric, not categorical. Identify the data type 'object' and address them using techniques such as ordinal or one-hot encoding.\n", - "\n", - "This notebook contains a breakdown of the step-by-step processes that we used to compile, scrub, and transform our data. It includes variations of narrowing our scope and explorations into the impacts that our different transformations have on the data." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Preprocessing with Scikit-learn\n", - "Let explore and clean our data set to prep for our Linear Regression Model.\n", - "Preprocessing Steps.\n", - "\n", - "1. Handle Missing Values\n", - "2. Convert Categorical Features into Numbers\n", - "3. 
Find and Remove Outliers" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Handling Missing Values\n", - "Below, let's check to see if there are any NaNs in our data" - ] - }, - { - "cell_type": "code", - "execution_count": 115, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "id 0\n", - "date 0\n", - "price 0\n", - "bedrooms 0\n", - "bathrooms 0\n", - "sqft_living 0\n", - "sqft_lot 0\n", - "floors 0\n", - "waterfront 2376\n", - "view 63\n", - "condition 0\n", - "grade 0\n", - "sqft_above 0\n", - "sqft_basement 0\n", - "yr_built 0\n", - "yr_renovated 3842\n", - "zipcode 0\n", - "lat 0\n", - "long 0\n", - "sqft_living15 0\n", - "sqft_lot15 0\n", - "dtype: int64" - ] - }, - "execution_count": 115, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#locate missing values\n", - "df.isna().sum()" - ] - }, - { - "cell_type": "code", - "execution_count": 116, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "id 0.0\n", - "date 0.0\n", - "price 0.0\n", - "bedrooms 0.0\n", - "bathrooms 0.0\n", - "sqft_living 0.0\n", - "sqft_lot 0.0\n", - "floors 0.0\n", - "waterfront 11.00152798999861\n", - "view 0.29170718155299347\n", - "condition 0.0\n", - "grade 0.0\n", - "sqft_above 0.0\n", - "sqft_basement 0.0\n", - "yr_built 0.0\n", - "yr_renovated 17.78950780200954\n", - "zipcode 0.0\n", - "lat 0.0\n", - "long 0.0\n", - "sqft_living15 0.0\n", - "sqft_lot15 0.0\n" - ] - } - ], - "source": [ - "#dealing with missing values\n", - "for column in df.columns:\n", - " percentage_of_nan = (sum(df[column].isnull())/len(df[column])) * 100 \n", - " print(column, percentage_of_nan)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The feature 'waterfront' is the only feature with missing values and about 11% of the values have NaNs. 
Lets investigate this feature to handle it's missing values" - ] - }, - { - "cell_type": "code", - "execution_count": 117, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "waterfront\n", - "NO 19075\n", - "YES 146\n", - "Name: count, dtype: int64" - ] - }, - "execution_count": 117, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df['waterfront'].value_counts()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can see that the 'waterfront' feature only has two values, yes or no.\n", - "Thus NaN values can be considered no because they do not exist in their homes." - ] - }, - { - "cell_type": "code", - "execution_count": 118, - "metadata": {}, - "outputs": [], - "source": [ - "df['waterfront'].fillna('NO', inplace=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 119, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "waterfront\n", - "NO 21451\n", - "YES 146\n", - "Name: count, dtype: int64" - ] - }, - "execution_count": 119, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df['waterfront'].value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": 120, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "id 0\n", - "date 0\n", - "price 0\n", - "bedrooms 0\n", - "bathrooms 0\n", - "sqft_living 0\n", - "sqft_lot 0\n", - "floors 0\n", - "waterfront 0\n", - "view 63\n", - "condition 0\n", - "grade 0\n", - "sqft_above 0\n", - "sqft_basement 0\n", - "yr_built 0\n", - "yr_renovated 3842\n", - "zipcode 0\n", - "lat 0\n", - "long 0\n", - "sqft_living15 0\n", - "sqft_lot15 0\n", - "dtype: int64" - ] - }, - "execution_count": 120, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "#recheck for missing values\n", - "df.isna().sum()" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Convert Categorical Features into Numbers" - ] - }, - { - 
"cell_type": "markdown", - "metadata": {}, - "source": [ - "Our model would crash because some of the columns are non-numeric. Features with a numeric data type will work with our model, but these features need to be converted:\n", - "* waterfront (object)\n", - "* condition (object)\n", - "* grade (object)\n", - "\n", - "Let's inspect the value counts of the specified features:" - ] - }, - { - "cell_type": "code", - "execution_count": 121, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "waterfront\n", - "NO 21451\n", - "YES 146\n", - "Name: count, dtype: int64\n", - "\n", - "condition\n", - "Average 14020\n", - "Good 5677\n", - "Very Good 1701\n", - "Fair 170\n", - "Poor 29\n", - "Name: count, dtype: int64\n", - "\n", - "grade\n", - "7 Average 8974\n", - "8 Good 6065\n", - "9 Better 2615\n", - "6 Low Average 2038\n", - "10 Very Good 1134\n", - "11 Excellent 399\n", - "5 Fair 242\n", - "12 Luxury 89\n", - "4 Low 27\n", - "13 Mansion 13\n", - "3 Poor 1\n", - "Name: count, dtype: int64\n" - ] - } - ], - "source": [ - "print(df['waterfront'].value_counts())\n", - "print()\n", - "print(df['condition'].value_counts())\n", - "print()\n", - "print(df['grade'].value_counts())" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Split function to seperate the numeric value of 'grade'" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The Grade feature is an object data type however the numeric grade is listed in the front. We will use a simple string split function to isolate the numeric part of the feature.\n", - "\n", - "Waterfront has only 2 categories and can be converted into binary in place, whereas Condition has more than 2 categories and will need to be expanded into multiple columns." 
- ] - }, - { - "cell_type": "code", - "execution_count": 122, - "metadata": {}, - "outputs": [], - "source": [ - "df = df.assign(grade=df.grade.str.split(' ')).explode('grade')" - ] - }, - { - "cell_type": "code", - "execution_count": 123, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "False 46366\n", - "Name: count, dtype: int64" - ] - }, - "execution_count": 123, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.duplicated().value_counts()" - ] - }, - { - "cell_type": "code", - "execution_count": 124, - "metadata": {}, - "outputs": [], - "source": [ - "df = df.drop_duplicates()" - ] - }, - { - "cell_type": "code", - "execution_count": 125, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "(46366, 21)" - ] - }, - "execution_count": 125, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "df.shape" + "### Loading Data" ] }, { "cell_type": "code", - "execution_count": 126, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ - "df = df.drop_duplicates(subset='id')" + "# Creating a function that loads data and return it in a dataframe\n", + "def load_data(file_path):\n", + " house_data = pd.read_csv(file_path)\n", + "\n", + " #shape\n", + " shape = house_data.shape\n", + " print(f\"The dataset contains {shape[0]} houses with {shape[1]} features\")\n", + " print()\n", + " \n", + " #Data Types\n", + " data_types = house_data.dtypes\n", + " print(\"Columns and their data types:\")\n", + " for column, dtype in data_types.items():\n", + " print(f\"{column}: {dtype}\")\n", + " print()\n", + "\n", + " return house_data\n" ] }, { "cell_type": "code", - "execution_count": 127, + "execution_count": 17, "metadata": {}, "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The dataset contains 21597 houses with 21 features\n", + "\n", + "Columns and their data types:\n", + "id: int64\n", + "date: object\n", + "price: float64\n", + "bedrooms: 
int64\n", + "bathrooms: float64\n", + "sqft_living: int64\n", + "sqft_lot: int64\n", + "floors: float64\n", + "waterfront: object\n", + "view: object\n", + "condition: object\n", + "grade: object\n", + "sqft_above: int64\n", + "sqft_basement: object\n", + "yr_built: int64\n", + "yr_renovated: float64\n", + "zipcode: int64\n", + "lat: float64\n", + "long: float64\n", + "sqft_living15: int64\n", + "sqft_lot15: int64\n", + "\n" + ] + }, { "data": { "text/html": [ @@ -1256,23 +228,23 @@ " 0\n", " 7129300520\n", " 10/13/2014\n", - " 221900.00000\n", + " 221900.0\n", " 3\n", - " 1.00000\n", + " 1.00\n", " 1180\n", " 5650\n", - " 1.00000\n", - " NO\n", + " 1.0\n", + " NaN\n", " NONE\n", " ...\n", - " 7\n", + " 7 Average\n", " 1180\n", " 0.0\n", " 1955\n", - " 0.00000\n", + " 0.0\n", " 98178\n", - " 47.51120\n", - " -122.25700\n", + " 47.5112\n", + " -122.257\n", " 1340\n", " 5650\n", " \n", @@ -1280,47 +252,71 @@ " 1\n", " 6414100192\n", " 12/9/2014\n", - " 538000.00000\n", + " 538000.0\n", " 3\n", - " 2.25000\n", + " 2.25\n", " 2570\n", " 7242\n", - " 2.00000\n", + " 2.0\n", " NO\n", " NONE\n", " ...\n", - " 7\n", + " 7 Average\n", " 2170\n", " 400.0\n", " 1951\n", - " 1991.00000\n", + " 1991.0\n", " 98125\n", - " 47.72100\n", - " -122.31900\n", + " 47.7210\n", + " -122.319\n", " 1690\n", " 7639\n", " \n", " \n", + " 2\n", + " 5631500400\n", + " 2/25/2015\n", + " 180000.0\n", + " 2\n", + " 1.00\n", + " 770\n", + " 10000\n", + " 1.0\n", + " NO\n", + " NONE\n", + " ...\n", + " 6 Low Average\n", + " 770\n", + " 0.0\n", + " 1933\n", + " NaN\n", + " 98028\n", + " 47.7379\n", + " -122.233\n", + " 2720\n", + " 8062\n", + " \n", + " \n", " 3\n", " 2487200875\n", " 12/9/2014\n", - " 604000.00000\n", + " 604000.0\n", " 4\n", - " 3.00000\n", + " 3.00\n", " 1960\n", " 5000\n", - " 1.00000\n", + " 1.0\n", " NO\n", " NONE\n", " ...\n", - " 7\n", + " 7 Average\n", " 1050\n", " 910.0\n", " 1965\n", - " 0.00000\n", + " 0.0\n", " 98136\n", - " 47.52080\n", - " -122.39300\n", + " 
47.5208\n", + " -122.393\n", " 1360\n", " 5000\n", " \n", @@ -1328,51 +324,27 @@ " 4\n", " 1954400510\n", " 2/18/2015\n", - " 510000.00000\n", + " 510000.0\n", " 3\n", - " 2.00000\n", + " 2.00\n", " 1680\n", " 8080\n", - " 1.00000\n", + " 1.0\n", " NO\n", " NONE\n", " ...\n", - " 8\n", + " 8 Good\n", " 1680\n", " 0.0\n", " 1987\n", - " 0.00000\n", + " 0.0\n", " 98074\n", - " 47.61680\n", - " -122.04500\n", + " 47.6168\n", + " -122.045\n", " 1800\n", " 7503\n", " \n", " \n", - " 5\n", - " 7237550310\n", - " 5/12/2014\n", - " 1230000.00000\n", - " 4\n", - " 4.50000\n", - " 5420\n", - " 101930\n", - " 1.00000\n", - " NO\n", - " NONE\n", - " ...\n", - " 11\n", - " 3890\n", - " 1530.0\n", - " 2001\n", - " 0.00000\n", - " 98053\n", - " 47.65610\n", - " -122.00500\n", - " 4760\n", - " 101930\n", - " \n", - " \n", " ...\n", " ...\n", " ...\n", @@ -1400,23 +372,23 @@ " 21592\n", " 263000018\n", " 5/21/2014\n", - " 360000.00000\n", + " 360000.0\n", " 3\n", - " 2.50000\n", + " 2.50\n", " 1530\n", " 1131\n", - " 3.00000\n", + " 3.0\n", " NO\n", " NONE\n", " ...\n", - " 8\n", + " 8 Good\n", " 1530\n", " 0.0\n", " 2009\n", - " 0.00000\n", + " 0.0\n", " 98103\n", - " 47.69930\n", - " -122.34600\n", + " 47.6993\n", + " -122.346\n", " 1530\n", " 1509\n", " \n", @@ -1424,23 +396,23 @@ " 21593\n", " 6600060120\n", " 2/23/2015\n", - " 400000.00000\n", + " 400000.0\n", " 4\n", - " 2.50000\n", + " 2.50\n", " 2310\n", " 5813\n", - " 2.00000\n", + " 2.0\n", " NO\n", " NONE\n", " ...\n", - " 8\n", + " 8 Good\n", " 2310\n", " 0.0\n", " 2014\n", - " 0.00000\n", + " 0.0\n", " 98146\n", - " 47.51070\n", - " -122.36200\n", + " 47.5107\n", + " -122.362\n", " 1830\n", " 7200\n", " \n", @@ -1448,23 +420,23 @@ " 21594\n", " 1523300141\n", " 6/23/2014\n", - " 402101.00000\n", + " 402101.0\n", " 2\n", - " 0.75000\n", + " 0.75\n", " 1020\n", " 1350\n", - " 2.00000\n", + " 2.0\n", " NO\n", " NONE\n", " ...\n", - " 7\n", + " 7 Average\n", " 1020\n", " 0.0\n", " 2009\n", - " 0.00000\n", + " 0.0\n", " 
98144\n", - " 47.59440\n", - " -122.29900\n", + " 47.5944\n", + " -122.299\n", " 1020\n", " 2007\n", " \n", @@ -1472,23 +444,23 @@ " 21595\n", " 291310100\n", " 1/16/2015\n", - " 400000.00000\n", + " 400000.0\n", " 3\n", - " 2.50000\n", + " 2.50\n", " 1600\n", " 2388\n", - " 2.00000\n", - " NO\n", + " 2.0\n", + " NaN\n", " NONE\n", " ...\n", - " 8\n", + " 8 Good\n", " 1600\n", " 0.0\n", " 2004\n", - " 0.00000\n", + " 0.0\n", " 98027\n", - " 47.53450\n", - " -122.06900\n", + " 47.5345\n", + " -122.069\n", " 1410\n", " 1287\n", " \n", @@ -1496,99 +468,216 @@ " 21596\n", " 1523300157\n", " 10/15/2014\n", - " 325000.00000\n", + " 325000.0\n", " 2\n", - " 0.75000\n", + " 0.75\n", " 1020\n", " 1076\n", - " 2.00000\n", + " 2.0\n", " NO\n", " NONE\n", " ...\n", - " 7\n", + " 7 Average\n", " 1020\n", " 0.0\n", " 2008\n", - " 0.00000\n", + " 0.0\n", " 98144\n", - " 47.59410\n", - " -122.29900\n", + " 47.5941\n", + " -122.299\n", " 1020\n", " 1357\n", " \n", " \n", "\n", - "

17565 rows × 21 columns

\n", + "

21597 rows × 21 columns

\n", "" ], "text/plain": [ - " id date price bedrooms bathrooms sqft_living \\\n", - "0 7129300520 10/13/2014 221900.00000 3 1.00000 1180 \n", - "1 6414100192 12/9/2014 538000.00000 3 2.25000 2570 \n", - "3 2487200875 12/9/2014 604000.00000 4 3.00000 1960 \n", - "4 1954400510 2/18/2015 510000.00000 3 2.00000 1680 \n", - "5 7237550310 5/12/2014 1230000.00000 4 4.50000 5420 \n", - "... ... ... ... ... ... ... \n", - "21592 263000018 5/21/2014 360000.00000 3 2.50000 1530 \n", - "21593 6600060120 2/23/2015 400000.00000 4 2.50000 2310 \n", - "21594 1523300141 6/23/2014 402101.00000 2 0.75000 1020 \n", - "21595 291310100 1/16/2015 400000.00000 3 2.50000 1600 \n", - "21596 1523300157 10/15/2014 325000.00000 2 0.75000 1020 \n", + " id date price bedrooms bathrooms sqft_living \\\n", + "0 7129300520 10/13/2014 221900.0 3 1.00 1180 \n", + "1 6414100192 12/9/2014 538000.0 3 2.25 2570 \n", + "2 5631500400 2/25/2015 180000.0 2 1.00 770 \n", + "3 2487200875 12/9/2014 604000.0 4 3.00 1960 \n", + "4 1954400510 2/18/2015 510000.0 3 2.00 1680 \n", + "... ... ... ... ... ... ... \n", + "21592 263000018 5/21/2014 360000.0 3 2.50 1530 \n", + "21593 6600060120 2/23/2015 400000.0 4 2.50 2310 \n", + "21594 1523300141 6/23/2014 402101.0 2 0.75 1020 \n", + "21595 291310100 1/16/2015 400000.0 3 2.50 1600 \n", + "21596 1523300157 10/15/2014 325000.0 2 0.75 1020 \n", "\n", - " sqft_lot floors waterfront view ... grade sqft_above sqft_basement \\\n", - "0 5650 1.00000 NO NONE ... 7 1180 0.0 \n", - "1 7242 2.00000 NO NONE ... 7 2170 400.0 \n", - "3 5000 1.00000 NO NONE ... 7 1050 910.0 \n", - "4 8080 1.00000 NO NONE ... 8 1680 0.0 \n", - "5 101930 1.00000 NO NONE ... 11 3890 1530.0 \n", - "... ... ... ... ... ... ... ... ... \n", - "21592 1131 3.00000 NO NONE ... 8 1530 0.0 \n", - "21593 5813 2.00000 NO NONE ... 8 2310 0.0 \n", - "21594 1350 2.00000 NO NONE ... 7 1020 0.0 \n", - "21595 2388 2.00000 NO NONE ... 8 1600 0.0 \n", - "21596 1076 2.00000 NO NONE ... 
7 1020 0.0 \n", + " sqft_lot floors waterfront view ... grade sqft_above \\\n", + "0 5650 1.0 NaN NONE ... 7 Average 1180 \n", + "1 7242 2.0 NO NONE ... 7 Average 2170 \n", + "2 10000 1.0 NO NONE ... 6 Low Average 770 \n", + "3 5000 1.0 NO NONE ... 7 Average 1050 \n", + "4 8080 1.0 NO NONE ... 8 Good 1680 \n", + "... ... ... ... ... ... ... ... \n", + "21592 1131 3.0 NO NONE ... 8 Good 1530 \n", + "21593 5813 2.0 NO NONE ... 8 Good 2310 \n", + "21594 1350 2.0 NO NONE ... 7 Average 1020 \n", + "21595 2388 2.0 NaN NONE ... 8 Good 1600 \n", + "21596 1076 2.0 NO NONE ... 7 Average 1020 \n", "\n", - " yr_built yr_renovated zipcode lat long sqft_living15 \\\n", - "0 1955 0.00000 98178 47.51120 -122.25700 1340 \n", - "1 1951 1991.00000 98125 47.72100 -122.31900 1690 \n", - "3 1965 0.00000 98136 47.52080 -122.39300 1360 \n", - "4 1987 0.00000 98074 47.61680 -122.04500 1800 \n", - "5 2001 0.00000 98053 47.65610 -122.00500 4760 \n", - "... ... ... ... ... ... ... \n", - "21592 2009 0.00000 98103 47.69930 -122.34600 1530 \n", - "21593 2014 0.00000 98146 47.51070 -122.36200 1830 \n", - "21594 2009 0.00000 98144 47.59440 -122.29900 1020 \n", - "21595 2004 0.00000 98027 47.53450 -122.06900 1410 \n", - "21596 2008 0.00000 98144 47.59410 -122.29900 1020 \n", + " sqft_basement yr_built yr_renovated zipcode lat long \\\n", + "0 0.0 1955 0.0 98178 47.5112 -122.257 \n", + "1 400.0 1951 1991.0 98125 47.7210 -122.319 \n", + "2 0.0 1933 NaN 98028 47.7379 -122.233 \n", + "3 910.0 1965 0.0 98136 47.5208 -122.393 \n", + "4 0.0 1987 0.0 98074 47.6168 -122.045 \n", + "... ... ... ... ... ... ... \n", + "21592 0.0 2009 0.0 98103 47.6993 -122.346 \n", + "21593 0.0 2014 0.0 98146 47.5107 -122.362 \n", + "21594 0.0 2009 0.0 98144 47.5944 -122.299 \n", + "21595 0.0 2004 0.0 98027 47.5345 -122.069 \n", + "21596 0.0 2008 0.0 98144 47.5941 -122.299 \n", "\n", - " sqft_lot15 \n", - "0 5650 \n", - "1 7639 \n", - "3 5000 \n", - "4 7503 \n", - "5 101930 \n", - "... ... 
\n", - "21592 1509 \n", - "21593 7200 \n", - "21594 2007 \n", - "21595 1287 \n", - "21596 1357 \n", + " sqft_living15 sqft_lot15 \n", + "0 1340 5650 \n", + "1 1690 7639 \n", + "2 2720 8062 \n", + "3 1360 5000 \n", + "4 1800 7503 \n", + "... ... ... \n", + "21592 1530 1509 \n", + "21593 1830 7200 \n", + "21594 1020 2007 \n", + "21595 1410 1287 \n", + "21596 1020 1357 \n", "\n", - "[17565 rows x 21 columns]" + "[21597 rows x 21 columns]" ] }, - "execution_count": 127, + "execution_count": 17, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df.dropna()" + "load_data('data/kc_house_data.csv') # Assuming 'data' folder is in the same directory\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The dataset contains 21 columns, each representing a distinct feature, and 21,597 rows, with each row corresponding to a specific house sale entry.\n", + "\n", + "The dataset contains a mix of data types, including integers (int64), floating-point numbers (float64), and objects (strings). For instance, numerical features such as bedrooms, bathrooms, and sqft_living are represented as integers or floating-point numbers to facilitate mathematical computations, while categorical features like waterfront and view are stored as objects to accommodate text-based categories." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The dataset contains 21597 houses with 21 features\n", + "\n", + "Columns and their data types:\n", + "id: int64\n", + "date: object\n", + "price: float64\n", + "bedrooms: int64\n", + "bathrooms: float64\n", + "sqft_living: int64\n", + "sqft_lot: int64\n", + "floors: float64\n", + "waterfront: object\n", + "view: object\n", + "condition: object\n", + "grade: object\n", + "sqft_above: int64\n", + "sqft_basement: object\n", + "yr_built: int64\n", + "yr_renovated: float64\n", + "zipcode: int64\n", + "lat: float64\n", + "long: float64\n", + "sqft_living15: int64\n", + "sqft_lot15: int64\n", + "\n" + ] + } + ], + "source": [ + "kings_data = load_data('data/kc_house_data.csv')" ] }, { "cell_type": "code", - "execution_count": 128, + "execution_count": 19, + "metadata": {}, + "outputs": [], + "source": [ + "#create a function that takes in a column and returns the column statistics as a dictionary\n", + "def descriptive_analytics(column):\n", + " stats_dict = column.describe().to_dict()\n", + " \n", + " print(\"Descriptive Statistics for Column '{}':\".format(column.name))\n", + " print(\"The count of the column is:\", stats_dict['count'])\n", + " print(\"The mean of the column is:\", stats_dict['mean'])\n", + " print(\"The standard deviation of the column is:\", stats_dict['std'])\n", + " print(\"The minimum value of the column is:\", stats_dict['min'])\n", + " print(\"The 25th percentile of the column is:\", stats_dict['25%'])\n", + " print(\"The median of the column is:\", stats_dict['50%'])\n", + " print(\"The 75th percentile of the column is:\", stats_dict['75%'])\n", + " print(\"The maximum value of the column is:\", stats_dict['max'])" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Descriptive 
Statistics for Column 'price':\n", + "The count of the column is: 21597.0\n", + "The mean of the column is: 540296.5735055795\n", + "The standard deviation of the column is: 367368.1401013936\n", + "The minimum value of the column is: 78000.0\n", + "The 25th percentile of the column is: 322000.0\n", + "The median of the column is: 450000.0\n", + "The 75th percentile of the column is: 645000.0\n", + "The maximum value of the column is: 7700000.0\n" + ] + } + ], + "source": [ + "descriptive_analytics(kings_data['price'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see that the maximum price of a house is 7700000 dollars and the minimum price is 78000 dollars\n", + "\n", + "There are 21597 prices regarding to the houses in the dataset\n", + "\n", + "Average price of a house is 540296.57 dollars" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Preperation\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -1596,121 +685,351 @@ "output_type": "stream", "text": [ "\n", - "Index: 21420 entries, 0 to 21596\n", + "RangeIndex: 21597 entries, 0 to 21596\n", "Data columns (total 21 columns):\n", " # Column Non-Null Count Dtype \n", "--- ------ -------------- ----- \n", - " 0 id 21420 non-null int64 \n", - " 1 date 21420 non-null object \n", - " 2 price 21420 non-null float64\n", - " 3 bedrooms 21420 non-null int64 \n", - " 4 bathrooms 21420 non-null float64\n", - " 5 sqft_living 21420 non-null int64 \n", - " 6 sqft_lot 21420 non-null int64 \n", - " 7 floors 21420 non-null float64\n", - " 8 waterfront 21420 non-null object \n", - " 9 view 21357 non-null object \n", - " 10 condition 21420 non-null object \n", - " 11 grade 21420 non-null object \n", - " 12 sqft_above 21420 non-null int64 \n", - " 13 sqft_basement 21420 non-null object \n", - " 14 yr_built 21420 non-null int64 \n", - " 15 yr_renovated 17616 non-null float64\n", - " 16 zipcode 21420 
non-null int64 \n", - " 17 lat 21420 non-null float64\n", - " 18 long 21420 non-null float64\n", - " 19 sqft_living15 21420 non-null int64 \n", - " 20 sqft_lot15 21420 non-null int64 \n", + " 0 id 21597 non-null int64 \n", + " 1 date 21597 non-null object \n", + " 2 price 21597 non-null float64\n", + " 3 bedrooms 21597 non-null int64 \n", + " 4 bathrooms 21597 non-null float64\n", + " 5 sqft_living 21597 non-null int64 \n", + " 6 sqft_lot 21597 non-null int64 \n", + " 7 floors 21597 non-null float64\n", + " 8 waterfront 19221 non-null object \n", + " 9 view 21534 non-null object \n", + " 10 condition 21597 non-null object \n", + " 11 grade 21597 non-null object \n", + " 12 sqft_above 21597 non-null int64 \n", + " 13 sqft_basement 21597 non-null object \n", + " 14 yr_built 21597 non-null int64 \n", + " 15 yr_renovated 17755 non-null float64\n", + " 16 zipcode 21597 non-null int64 \n", + " 17 lat 21597 non-null float64\n", + " 18 long 21597 non-null float64\n", + " 19 sqft_living15 21597 non-null int64 \n", + " 20 sqft_lot15 21597 non-null int64 \n", "dtypes: float64(6), int64(9), object(6)\n", - "memory usage: 3.6+ MB\n" + "memory usage: 3.5+ MB\n" ] } ], "source": [ - "df.info()" + "kings_data.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": {}, + "outputs": [], + "source": [ + "def identify_issues(dataset):\n", + " # Identify missing values as a percentage of the whole dataset\n", + " missing_values = (dataset.isnull().sum())/len(dataset) * 100\n", + "\n", + " # Identify duplicates\n", + " duplicates = dataset.duplicated().sum()\n", + " \n", + " #return a dictionary \n", + " return {'duplicates': duplicates,\n", + " 'missing values': missing_values.round(2)} \n" ] }, { "cell_type": "code", - "execution_count": 129, + "execution_count": 23, "metadata": {}, "outputs": [ { "data": { "text/plain": [ - "grade\n", - "7 8889\n", - "8 6041\n", - "9 2606\n", - "6 1995\n", - "10 1130\n", - "11 396\n", - "5 234\n", - "12 88\n", - "4 27\n", - 
"13 13\n", - "3 1\n", - "Name: count, dtype: int64" + "{'duplicates': 0,\n", + " 'missing values': id 0.00\n", + " date 0.00\n", + " price 0.00\n", + " bedrooms 0.00\n", + " bathrooms 0.00\n", + " sqft_living 0.00\n", + " sqft_lot 0.00\n", + " floors 0.00\n", + " waterfront 11.00\n", + " view 0.29\n", + " condition 0.00\n", + " grade 0.00\n", + " sqft_above 0.00\n", + " sqft_basement 0.00\n", + " yr_built 0.00\n", + " yr_renovated 17.79\n", + " zipcode 0.00\n", + " lat 0.00\n", + " long 0.00\n", + " sqft_living15 0.00\n", + " sqft_lot15 0.00\n", + " dtype: float64}" ] }, - "execution_count": 129, + "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ - "df['grade'].value_counts()" + "identify_issues(kings_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The examination indicates that there are no duplicate entries within the dataset, ensuring the integrity of the records. However, attention is warranted to address missing values present in certain columns. Specifically, the 'waterfront' feature exhibits 11% of null values, representing a negligible portion of the dataset. 
Similarly, the 'yr_renovated' feature shows a relatively higher percentage of missing values, accounting for approximately 17.79% of the dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Before making changes make a copy instead of overwriting data" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": {}, + "outputs": [], + "source": [ + "house_data_clean = kings_data.copy()" + ] + }, + { + "cell_type": "code", + "execution_count": 25, + "metadata": {}, + "outputs": [], + "source": [ + "# Changing the date to date time\n", + "house_data_clean['date'] = pd.to_datetime(house_data_clean['date'])\n", + "\n", + "# Extracting only the year from the column Date\n", + "house_data_clean.date = house_data_clean['date'].dt.year\n", + "\n", + "# Changing the dates for the year built \n", + "house_data_clean['yr_built'] = pd.to_datetime(house_data_clean['yr_built'],format='%Y').dt.year\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The above code converts the 'date' column data to only contain the year the house was sold, for the purpose of analysis we will use only the year since the changes month by month will be minor." 
] }, { "cell_type": "markdown", "metadata": {}, "source": [ - "The most common buiding grade is a 7" + "#### Dealing with the missing values" + ] + }, + { + "cell_type": "code", + "execution_count": 26, + "metadata": {}, + "outputs": [], + "source": [ + "def missing_values(dataset):\n", + " # drop the rows from views\n", + " dataset.dropna(subset=['view'],inplace=True)\n", + "\n", + " # Filling the NaN values for waterfront with NO\n", + " dataset.waterfront.fillna('NO',inplace=True)\n", + " \n", + " # Dropping the yr_renovated column \n", + " dataset.drop('yr_renovated',axis=1,inplace=True)" ] }, { "cell_type": "code", - "execution_count": 130, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ - "# Change the data type from object to int.\n", - "df['grade'] = df['grade'].astype(int)" + "missing_values(house_data_clean)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "'yr_renovated' has the highest percentage of NaN values 17%. This will be dropped since it will not be used within our model inline with the business problem.\n", + "\n", + "'Waterfront' feature has 11% null values, this was filled with NO on the assumption that these cells were not filled since they lacked waterfronts\n", + "\n", + "For the 'View' column, the null values were dropped by row since the overall percentage impact would be minute" ] }, { "cell_type": "code", - "execution_count": 131, + "execution_count": 28, "metadata": {}, "outputs": [ { "data": { - "image/png": "", "text/plain": [ - "
" + "{'duplicates': 2,\n", + " 'missing values': id 0.0\n", + " date 0.0\n", + " price 0.0\n", + " bedrooms 0.0\n", + " bathrooms 0.0\n", + " sqft_living 0.0\n", + " sqft_lot 0.0\n", + " floors 0.0\n", + " waterfront 0.0\n", + " view 0.0\n", + " condition 0.0\n", + " grade 0.0\n", + " sqft_above 0.0\n", + " sqft_basement 0.0\n", + " yr_built 0.0\n", + " zipcode 0.0\n", + " lat 0.0\n", + " long 0.0\n", + " sqft_living15 0.0\n", + " sqft_lot15 0.0\n", + " dtype: float64}" ] }, + "execution_count": 28, "metadata": {}, - "output_type": "display_data" + "output_type": "execute_result" } ], "source": [ - "#grade\n", - "plt.figure(figsize=(25,15))\n", - "sns.set(font_scale=2)\n", - "ax = sns.boxplot(x=\"grade\", y=\"price\", data=df)\n", - "ax.set_title('House Grade vs. Price', fontsize=50)\n", - "ax.set_ylabel('Price', fontsize=30)\n", - "ax.set_xlabel('Grade', fontsize=30)\n", - "ax.set_ylim(bottom=0, top=6000000);" + "identify_issues(house_data_clean)" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": 29, "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddatepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontviewconditiongradesqft_abovesqft_basementyr_builtzipcodelatlongsqft_living15sqft_lot15
394718250690312014550000.041.75241084472.0NOGOODGood8 Good2060350.019369807447.6499-122.088252014789
2003886489001102014555000.032.50194032112.0NONONEAverage8 Good19400.020099802747.5644-122.09318803078
\n", + "
" + ], + "text/plain": [ + " id date price bedrooms bathrooms sqft_living sqft_lot \\\n", + "3947 1825069031 2014 550000.0 4 1.75 2410 8447 \n", + "20038 8648900110 2014 555000.0 3 2.50 1940 3211 \n", + "\n", + " floors waterfront view condition grade sqft_above sqft_basement \\\n", + "3947 2.0 NO GOOD Good 8 Good 2060 350.0 \n", + "20038 2.0 NO NONE Average 8 Good 1940 0.0 \n", + "\n", + " yr_built zipcode lat long sqft_living15 sqft_lot15 \n", + "3947 1936 98074 47.6499 -122.088 2520 14789 \n", + "20038 2009 98027 47.5644 -122.093 1880 3078 " + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], "source": [ - "When we look at grade, we can see that as the categorical building grade designation improves, the house price does indeed rise as well. " + "house_data_clean[house_data_clean.duplicated()]" ] } ], From ce7d81d196580524a1fbf0b0df978bd49472bd9a Mon Sep 17 00:00:00 2001 From: clydeochieng <107258512+clydeochieng@users.noreply.github.com> Date: Tue, 30 Apr 2024 18:56:03 +0300 Subject: [PATCH 13/42] Update student.ipynb --- student.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/student.ipynb b/student.ipynb index 03ba5c12..ed8e6379 100644 --- a/student.ipynb +++ b/student.ipynb @@ -7,7 +7,7 @@ "## Final Project Submission\n", "\n", "Please fill out:\n", - "* Student name: Solphine Joseph, Grace Rotich, Mathew Kiprotich, Hilary Simiyu, Derrick Kiprotich, Clyde Ochieng. \n", + "* Student name: Solphine Joseph, Grace Rotich, Mathew Kiprotich, Hilary Simiyu, Clyde Ochieng. 
\n", "* Student pace: full time\n", "* Scheduled project review date/time: \n", "* Instructor name: Nikita \n", From 26261b768e2cc3e4d4687414bc5c2a97decea023 Mon Sep 17 00:00:00 2001 From: clydeochieng <107258512+clydeochieng@users.noreply.github.com> Date: Tue, 30 Apr 2024 18:59:34 +0300 Subject: [PATCH 14/42] Update student.ipynb --- student.ipynb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/student.ipynb b/student.ipynb index b3e32a00..59641be8 100644 --- a/student.ipynb +++ b/student.ipynb @@ -7,7 +7,7 @@ "## Final Project Submission\n", "\n", "Please fill out:\n", - "* Student name: Solphine Joseph, Grace Rotich, Mathew Kiprotich, Hilary Simiyu, Clyde Ochieng.\n", + "* Student name: Solphine Joseph, Grace Rotich, Mathew Kiprotich, Hilary Simiyu, Derrick Kiptoo Clyde Ochieng.\n", "* Student pace: full time\n", "* Scheduled project review date/time: \n", "* Instructor name: Nikita \n", @@ -105,7 +105,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.4" + "version": "3.10.13" } }, "nbformat": 4, From 6c8c8b7272dd2b18c70ccc37b0c3db82aed9e419 Mon Sep 17 00:00:00 2001 From: clydeochieng <107258512+clydeochieng@users.noreply.github.com> Date: Tue, 30 Apr 2024 19:02:17 +0300 Subject: [PATCH 15/42] Update student.ipynb --- student.ipynb | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/student.ipynb b/student.ipynb index ed8e6379..946abd6c 100644 --- a/student.ipynb +++ b/student.ipynb @@ -98,7 +98,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 1, "metadata": {}, "outputs": [], "source": [ @@ -120,7 +120,7 @@ }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -145,7 +145,7 @@ }, { "cell_type": "code", - "execution_count": 17, + "execution_count": 3, "metadata": {}, "outputs": [ { @@ -549,7 +549,7 @@ "[21597 rows x 21 columns]" ] }, - 
"execution_count": 17, + "execution_count": 3, "metadata": {}, "output_type": "execute_result" } @@ -571,7 +571,7 @@ }, { "cell_type": "code", - "execution_count": 18, + "execution_count": 4, "metadata": {}, "outputs": [ { @@ -612,7 +612,7 @@ }, { "cell_type": "code", - "execution_count": 19, + "execution_count": 5, "metadata": {}, "outputs": [], "source": [ @@ -633,7 +633,7 @@ }, { "cell_type": "code", - "execution_count": 20, + "execution_count": 6, "metadata": {}, "outputs": [ { @@ -677,7 +677,7 @@ }, { "cell_type": "code", - "execution_count": 21, + "execution_count": 7, "metadata": {}, "outputs": [ { @@ -721,7 +721,7 @@ }, { "cell_type": "code", - "execution_count": 22, + "execution_count": 8, "metadata": {}, "outputs": [], "source": [ @@ -739,7 +739,7 @@ }, { "cell_type": "code", - "execution_count": 23, + "execution_count": 9, "metadata": {}, "outputs": [ { @@ -770,7 +770,7 @@ " dtype: float64}" ] }, - "execution_count": 23, + "execution_count": 9, "metadata": {}, "output_type": "execute_result" } @@ -795,7 +795,7 @@ }, { "cell_type": "code", - "execution_count": 24, + "execution_count": 10, "metadata": {}, "outputs": [], "source": [ @@ -804,7 +804,7 @@ }, { "cell_type": "code", - "execution_count": 25, + "execution_count": 11, "metadata": {}, "outputs": [], "source": [ @@ -834,7 +834,7 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": 12, "metadata": {}, "outputs": [], "source": [ @@ -851,7 +851,7 @@ }, { "cell_type": "code", - "execution_count": 27, + "execution_count": 13, "metadata": {}, "outputs": [], "source": [ @@ -871,7 +871,7 @@ }, { "cell_type": "code", - "execution_count": 28, + "execution_count": 14, "metadata": {}, "outputs": [ { @@ -901,7 +901,7 @@ " dtype: float64}" ] }, - "execution_count": 28, + "execution_count": 14, "metadata": {}, "output_type": "execute_result" } @@ -912,7 +912,7 @@ }, { "cell_type": "code", - "execution_count": 29, + "execution_count": 15, "metadata": {}, "outputs": [ { @@ -1023,7 +1023,7 @@ 
"20038 2009 98027 47.5644 -122.093 1880 3078 " ] }, - "execution_count": 29, + "execution_count": 15, "metadata": {}, "output_type": "execute_result" } From faab87e08b0594aa0a086bfac402e06c194be1a0 Mon Sep 17 00:00:00 2001 From: clydeochieng <107258512+clydeochieng@users.noreply.github.com> Date: Tue, 30 Apr 2024 23:00:38 +0300 Subject: [PATCH 16/42] Update student.ipynb --- student.ipynb | 1 - 1 file changed, 1 deletion(-) diff --git a/student.ipynb b/student.ipynb index 09a140ae..d18aa6f3 100644 --- a/student.ipynb +++ b/student.ipynb @@ -6,7 +6,6 @@ "source": [ "## Final Project Submission\n", "\n", - "Please fill out:\n", Clyde "* Student name: Solphine Joseph, Grace Rotich, Mathew Kiprotich, Hilary Simiyu, Clyde Ochieng. \n", From 42bea008534fe4c9531c932f1686733927c8b124 Mon Sep 17 00:00:00 2001 From: clydeochieng <107258512+clydeochieng@users.noreply.github.com> Date: Tue, 30 Apr 2024 23:08:00 +0300 Subject: [PATCH 17/42] Update student.ipynb --- student.ipynb | 72 +++++++++++++++++++++++++-------------------------- 1 file changed, 36 insertions(+), 36 deletions(-) diff --git a/student.ipynb b/student.ipynb index d18aa6f3..7d3fde3c 100644 --- a/student.ipynb +++ b/student.ipynb @@ -1,40 +1,40 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Final Project Submission\n", - "\n", -Clyde - "* Student name: Solphine Joseph, Grace Rotich, Mathew Kiprotich, Hilary Simiyu, Clyde Ochieng. 
\n", - - "* Student name: Solphine Joseph, Grace Rotich, Mathew Kiprotich, Hilary Simiyu, Clyde Ochieng, Derrick Kiptoo.\n", - main - "* Student pace: full time\n", - "* Scheduled project review date/time: \n", - "* Instructor name: Nikita \n", - "* Blog post URL:\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Kings County Housing Analysis with Multiple Linear Regression" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Overview\n", - "\n", - "\n", - "A real estate agency in Kingsway seeks to determine what are the contributing factors that affect the price of houses to make improvements where necessary. They want to employ an analytical approach rather than sentimental before arriving at a decision. Multilinear regression has been used for this project to understand how various features affect their pricing to better their services." - ] - }, - { + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Final Project Submission\n", + "\n", + "* Student name: Solphine Joseph, Grace Rotich, Mathew Kiprotich, Hilary Simiyu, Clyde Ochieng. \n", + + "* Student name: Solphine Joseph, Grace Rotich, Mathew Kiprotich, Hilary Simiyu, Clyde Ochieng, Derrick Kiptoo.\n", + "* Student pace: full time\n", + "* Scheduled project review date/time: \n", + "* Instructor name: Nikita \n", + "* Blog post URL:\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Kings County Housing Analysis with Multiple Linear Regression" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Overview\n", + "\n", + "\n", + "A real estate agency in Kingsway seeks to determine what are the contributing factors that affect the price of houses to make improvements where necessary. They want to employ an analytical approach rather than sentimental before arriving at a decision. 
Multilinear regression has been used for this project to understand how various features affect their pricing to better their services." + ] + } + ] + } + "cell_type": "markdown", "metadata": {}, "source": [ From 872b835d4f541d0fc23995a1d9947487c482915e Mon Sep 17 00:00:00 2001 From: clydeochieng <107258512+clydeochieng@users.noreply.github.com> Date: Tue, 30 Apr 2024 23:11:24 +0300 Subject: [PATCH 18/42] Update student.ipynb --- student.ipynb | 1 - 1 file changed, 1 deletion(-) diff --git a/student.ipynb b/student.ipynb index 7d3fde3c..45f116ae 100644 --- a/student.ipynb +++ b/student.ipynb @@ -35,7 +35,6 @@ ] } - "cell_type": "markdown", "metadata": {}, "source": [ "## Business Problem\n", From d068fa1bd538aa473be4cbcba93ff4cfa58cbdbe Mon Sep 17 00:00:00 2001 From: clydeochieng <107258512+clydeochieng@users.noreply.github.com> Date: Tue, 30 Apr 2024 23:14:58 +0300 Subject: [PATCH 19/42] Update student.ipynb --- student.ipynb | 66 +++++++++++++++++++++++++-------------------------- 1 file changed, 32 insertions(+), 34 deletions(-) diff --git a/student.ipynb b/student.ipynb index 45f116ae..4593bb43 100644 --- a/student.ipynb +++ b/student.ipynb @@ -1,40 +1,38 @@ { "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Final Project Submission\n", - "\n", - "* Student name: Solphine Joseph, Grace Rotich, Mathew Kiprotich, Hilary Simiyu, Clyde Ochieng. 
\n", - - "* Student name: Solphine Joseph, Grace Rotich, Mathew Kiprotich, Hilary Simiyu, Clyde Ochieng, Derrick Kiptoo.\n", - "* Student pace: full time\n", - "* Scheduled project review date/time: \n", - "* Instructor name: Nikita \n", - "* Blog post URL:\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Kings County Housing Analysis with Multiple Linear Regression" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Overview\n", - "\n", - "\n", - "A real estate agency in Kingsway seeks to determine what are the contributing factors that affect the price of houses to make improvements where necessary. They want to employ an analytical approach rather than sentimental before arriving at a decision. Multilinear regression has been used for this project to understand how various features affect their pricing to better their services." - ] - } + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Final Project Submission\n", + "\n", + "* Student name: Solphine Joseph, Grace Rotich, Mathew Kiprotich, Hilary Simiyu, Clyde Ochieng. \n", + "* Student name: Solphine Joseph, Grace Rotich, Mathew Kiprotich, Hilary Simiyu, Clyde Ochieng, Derrick Kiptoo.\n", + "* Student pace: full time\n", + "* Scheduled project review date/time: \n", + "* Instructor name: Nikita \n", + "* Blog post URL:\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Kings County Housing Analysis with Multiple Linear Regression" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Overview\n", + "\n", + "A real estate agency in Kingsway seeks to determine what are the contributing factors that affect the price of houses to make improvements where necessary. They want to employ an analytical approach rather than sentimental before arriving at a decision. 
Multilinear regression has been used for this project to understand how various features affect their pricing to better their services." + ] + } ] - } - + } + "metadata": {}, "source": [ "## Business Problem\n", From 18077fac0082e399f6ab72625423e677fd7b8764 Mon Sep 17 00:00:00 2001 From: clydeochieng <107258512+clydeochieng@users.noreply.github.com> Date: Tue, 30 Apr 2024 23:43:40 +0300 Subject: [PATCH 20/42] Update student.ipynb --- student.ipynb | 164 +++++++++++++++++++++----------------------------- 1 file changed, 69 insertions(+), 95 deletions(-) diff --git a/student.ipynb b/student.ipynb index 4593bb43..c174e446 100644 --- a/student.ipynb +++ b/student.ipynb @@ -29,101 +29,75 @@ "\n", "A real estate agency in Kingsway seeks to determine what are the contributing factors that affect the price of houses to make improvements where necessary. They want to employ an analytical approach rather than sentimental before arriving at a decision. Multilinear regression has been used for this project to understand how various features affect their pricing to better their services." ] - } - ] - } - - "metadata": {}, - "source": [ - "## Business Problem\n", - "\n", - "In the face of market fluctuations and heightened competition within the real estate sector, our agency is grappling with pricing volatility, which poses significant challenges for our agents in devising effective business strategies. We seek strategic guidance to optimize our purchasing and selling endeavors, prioritizing informed decision-making to identify key areas of focus that promise maximum returns on investment." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Objectives\n", - "* To determine the key factors influencing house prices.\n", - "* To develop multilinear regression models to predict house prices based on relevant features.\n", - "* To use insights from the regression analysis to optimize pricing strategies for both purchasing and selling properties.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Hypothesis\n", - "* Null Hypothesis - There is no relationship between our independent variables and our dependent variable \n", - "\n", - "* Alternative Hypothesis - There is a relationship between our independent variables and our dependent variable" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Data Understanding:\n", - "\n", - "In this project, we utilized the King County House Sales dataset, which serves as the foundational dataset for our analysis. It was sourced Kaggle.The dataset encompasses comprehensive information regarding house sales within King County, Washington, USA. It comprises a diverse array of features, including the number of bedrooms, bathrooms, square footage, as well as geographical and pricing details of the properties sold. This dataset is frequently employed in data science and machine learning endeavors, particularly for predictive modeling tasks such as regression analysis aimed at forecasting house prices based on the provided features." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### King County Housing Data Columns \n", - "\n", - "The column names contained in column_names.md are:\n", - "* `id`: A unique identifier for each house sale.\n", - "* `date`: The date when the house was sold.\n", - "* `price`: The sale price of the house, serving as the target variable for predictive modeling.\n", - "* `bedrooms`, `bathrooms`, `sqft_living`, `sqft_lot`: Numerical features representing the number of bedrooms and bathrooms, as well as the living area and lot area of the house, respectively.\n", - "* `floors`: The number of floors in the house.\n", - "* `waterfront`, `view`, `condition`, `grade`: Categorical features describing aspects such as waterfront availability, property view, condition, and overall grade assigned to the housing unit.\n", - "* `yr_built`, `yr_renovated`: Year of construction and renovation of the house.\n", - "* `zipcode`, `lat`, `long`: Geographical features including ZIP code, latitude, and longitude coordinates.\n", - "* `sqft_above`, `sqft_basement`, `sqft_living15`, `sqft_lot15`: Additional numerical features providing details about the house's above-ground and basement square footage, as well as living area and lot area of the nearest 15 neighboring houses." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Data Loading\n", - "\n", - "#### Import Necessary Libraries" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "%matplotlib inline\n", - "import numpy as np\n", - "import pandas as pd\n", - "import scipy.stats as stats\n", - "import seaborn as sns\n", - "import statsmodels.api as sm\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Loading Data" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Business Problem\n", + "\n", + "In the face of market fluctuations and heightened competition within the real estate sector, our agency is grappling with pricing volatility, which poses significant challenges for our agents in devising effective business strategies. We seek strategic guidance to optimize our purchasing and selling endeavors, prioritizing informed decision-making to identify key areas of focus that promise maximum returns on investment." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Objectives\n", + "* To determine the key factors influencing house prices.\n", + "* To develop multilinear regression models to predict house prices based on relevant features.\n", + "* To use insights from the regression analysis to optimize pricing strategies for both purchasing and selling properties.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Hypothesis\n", + "* Null Hypothesis - There is no relationship between our independent variables and our dependent variable \n", + "\n", + "* Alternative Hypothesis - There is a relationship between our independent variables and our dependent variable" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Data Understanding:\n", + "\n", + "In this project, we utilized the King County House Sales dataset, which serves as the foundational dataset for our analysis. It was sourced Kaggle.The dataset encompasses comprehensive information regarding house sales within King County, Washington, USA. It comprises a diverse array of features, including the number of bedrooms, bathrooms, square footage, as well as geographical and pricing details of the properties sold. This dataset is frequently employed in data science and machine learning endeavors, particularly for predictive modeling tasks such as regression analysis aimed at forecasting house prices based on the provided features." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### King County Housing Data Columns \n", + "\n", + "The column names contained in column_names.md are:\n", + "* `id`: A unique identifier for each house sale.\n", + "* `date`: The date when the house was sold.\n", + "* `price`: The sale price of the house, serving as the target variable for predictive modeling.\n", + "* `bedrooms`, `bathrooms`, `sqft_living`, `sqft_lot`: Numerical features representing the number of bedrooms and bathrooms, as well as the living area and lot area of the house, respectively.\n", + "* `floors`: The number of floors in the house.\n", + "* `waterfront`, `view`, `condition`, `grade`: Categorical features describing aspects such as waterfront availability, property view, condition, and overall grade assigned to the housing unit.\n", + "* `yr_built`, `yr_renovated`: Year of construction and renovation of the house.\n", + "* `zipcode`, `lat`, `long`: Geographical features including ZIP code, latitude, and longitude coordinates.\n", + "* `sqft_above`, `sqft_basement`, `sqft_living15`, `sqft_lot15`: Additional numerical features providing details about the house's above-ground and basement square footage, as well as living area and lot area of the nearest 15 neighboring houses." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Loading\n", + "\n" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ "# Creating a function that loads data and return it in a dataframe\n", "def load_data(file_path):\n", " house_data = pd.read_csv(file_path)\n", From 457ad7cc459ba0165caa25a11e4788feddf3eda9 Mon Sep 17 00:00:00 2001 From: clydeochieng <107258512+clydeochieng@users.noreply.github.com> Date: Tue, 30 Apr 2024 23:44:41 +0300 Subject: [PATCH 21/42] Update student.ipynb --- student.ipynb | 2025 +++++++++++++++++++++++++------------------------ 1 file changed, 1013 insertions(+), 1012 deletions(-) diff --git a/student.ipynb b/student.ipynb index c174e446..cea94b8e 100644 --- a/student.ipynb +++ b/student.ipynb @@ -1,1031 +1,1032 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Final Project Submission\n", - "\n", - "* Student name: Solphine Joseph, Grace Rotich, Mathew Kiprotich, Hilary Simiyu, Clyde Ochieng. \n", - "* Student name: Solphine Joseph, Grace Rotich, Mathew Kiprotich, Hilary Simiyu, Clyde Ochieng, Derrick Kiptoo.\n", - "* Student pace: full time\n", - "* Scheduled project review date/time: \n", - "* Instructor name: Nikita \n", - "* Blog post URL:\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Kings County Housing Analysis with Multiple Linear Regression" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Overview\n", - "\n", - "A real estate agency in Kingsway seeks to determine what are the contributing factors that affect the price of houses to make improvements where necessary. They want to employ an analytical approach rather than sentimental before arriving at a decision. Multilinear regression has been used for this project to understand how various features affect their pricing to better their services." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Business Problem\n", - "\n", - "In the face of market fluctuations and heightened competition within the real estate sector, our agency is grappling with pricing volatility, which poses significant challenges for our agents in devising effective business strategies. We seek strategic guidance to optimize our purchasing and selling endeavors, prioritizing informed decision-making to identify key areas of focus that promise maximum returns on investment." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Objectives\n", - "* To determine the key factors influencing house prices.\n", - "* To develop multilinear regression models to predict house prices based on relevant features.\n", - "* To use insights from the regression analysis to optimize pricing strategies for both purchasing and selling properties.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Hypothesis\n", - "* Null Hypothesis - There is no relationship between our independent variables and our dependent variable \n", - "\n", - "* Alternative Hypothesis - There is a relationship between our independent variables and our dependent variable" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Data Understanding:\n", - "\n", - "In this project, we utilized the King County House Sales dataset, which serves as the foundational dataset for our analysis. It was sourced Kaggle.The dataset encompasses comprehensive information regarding house sales within King County, Washington, USA. It comprises a diverse array of features, including the number of bedrooms, bathrooms, square footage, as well as geographical and pricing details of the properties sold. 
This dataset is frequently employed in data science and machine learning endeavors, particularly for predictive modeling tasks such as regression analysis aimed at forecasting house prices based on the provided features." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### King County Housing Data Columns \n", - "\n", - "The column names contained in column_names.md are:\n", - "* `id`: A unique identifier for each house sale.\n", - "* `date`: The date when the house was sold.\n", - "* `price`: The sale price of the house, serving as the target variable for predictive modeling.\n", - "* `bedrooms`, `bathrooms`, `sqft_living`, `sqft_lot`: Numerical features representing the number of bedrooms and bathrooms, as well as the living area and lot area of the house, respectively.\n", - "* `floors`: The number of floors in the house.\n", - "* `waterfront`, `view`, `condition`, `grade`: Categorical features describing aspects such as waterfront availability, property view, condition, and overall grade assigned to the housing unit.\n", - "* `yr_built`, `yr_renovated`: Year of construction and renovation of the house.\n", - "* `zipcode`, `lat`, `long`: Geographical features including ZIP code, latitude, and longitude coordinates.\n", - "* `sqft_above`, `sqft_basement`, `sqft_living15`, `sqft_lot15`: Additional numerical features providing details about the house's above-ground and basement square footage, as well as living area and lot area of the nearest 15 neighboring houses." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Data Loading\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "# Creating a function that loads data and return it in a dataframe\n", - "def load_data(file_path):\n", - " house_data = pd.read_csv(file_path)\n", - "\n", - " #shape\n", - " shape = house_data.shape\n", - " print(f\"The dataset contains {shape[0]} houses with {shape[1]} features\")\n", - " print()\n", - " \n", - " #Data Types\n", - " data_types = house_data.dtypes\n", - " print(\"Columns and their data types:\")\n", - " for column, dtype in data_types.items():\n", - " print(f\"{column}: {dtype}\")\n", - " print()\n", - "\n", - " return house_data\n" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Final Project Submission\n", + "\n", + "* Student name: Solphine Joseph, Grace Rotich, Mathew Kiprotich, Hilary Simiyu, Clyde Ochieng. \n", + "* Student pace: full time\n", + "* Scheduled project review date/time: \n", + "* Instructor name: Nikita \n", + "* Blog post URL:\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Kings County Housing Analysis with Multiple Linear Regression" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Overview\n", + "\n", + "A real estate agency in Kingsway seeks to determine what are the contributing factors that affect the price of houses to make improvements where necessary. They want to employ an analytical approach rather than sentimental before arriving at a decision. Multilinear regression has been used for this project to understand how various features affect their pricing to better their services." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Business Problem\n", + "\n", + "In the face of market fluctuations and heightened competition within the real estate sector, our agency is grappling with pricing volatility, which poses significant challenges for our agents in devising effective business strategies. We seek strategic guidance to optimize our purchasing and selling endeavors, prioritizing informed decision-making to identify key areas of focus that promise maximum returns on investment." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Objectives\n", + "* To determine the key factors influencing house prices.\n", + "* To develop multilinear regression models to predict house prices based on relevant features.\n", + "* To use insights from the regression analysis to optimize pricing strategies for both purchasing and selling properties.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Hypothesis\n", + "* Null Hypothesis - There is no relationship between our independent variables and our dependent variable \n", + "\n", + "* Alternative Hypothesis - There is a relationship between our independent variables and our dependent variable" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Data Understanding:\n", + "\n", + "In this project, we utilized the King County House Sales dataset, which serves as the foundational dataset for our analysis. It was sourced Kaggle.The dataset encompasses comprehensive information regarding house sales within King County, Washington, USA. It comprises a diverse array of features, including the number of bedrooms, bathrooms, square footage, as well as geographical and pricing details of the properties sold. 
This dataset is frequently employed in data science and machine learning endeavors, particularly for predictive modeling tasks such as regression analysis aimed at forecasting house prices based on the provided features." + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "The dataset contains 21597 houses with 21 features\n", - "\n", - "Columns and their data types:\n", - "id: int64\n", - "date: object\n", - "price: float64\n", - "bedrooms: int64\n", - "bathrooms: float64\n", - "sqft_living: int64\n", - "sqft_lot: int64\n", - "floors: float64\n", - "waterfront: object\n", - "view: object\n", - "condition: object\n", - "grade: object\n", - "sqft_above: int64\n", - "sqft_basement: object\n", - "yr_built: int64\n", - "yr_renovated: float64\n", - "zipcode: int64\n", - "lat: float64\n", - "long: float64\n", - "sqft_living15: int64\n", - "sqft_lot15: int64\n", - "\n" - ] + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### King County Housing Data Columns \n", + "\n", + "The column names contained in column_names.md are:\n", + "* `id`: A unique identifier for each house sale.\n", + "* `date`: The date when the house was sold.\n", + "* `price`: The sale price of the house, serving as the target variable for predictive modeling.\n", + "* `bedrooms`, `bathrooms`, `sqft_living`, `sqft_lot`: Numerical features representing the number of bedrooms and bathrooms, as well as the living area and lot area of the house, respectively.\n", + "* `floors`: The number of floors in the house.\n", + "* `waterfront`, `view`, `condition`, `grade`: Categorical features describing aspects such as waterfront availability, property view, condition, and overall grade assigned to the housing unit.\n", + "* `yr_built`, `yr_renovated`: Year of construction and renovation of the house.\n", + "* `zipcode`, `lat`, `long`: Geographical features including ZIP code, latitude, and longitude coordinates.\n", + "* `sqft_above`, `sqft_basement`, `sqft_living15`, 
`sqft_lot15`: Additional numerical features providing details about the house's above-ground and basement square footage, as well as living area and lot area of the nearest 15 neighboring houses." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Loading\n", + "\n" + ] }, { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
iddatepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontview...gradesqft_abovesqft_basementyr_builtyr_renovatedzipcodelatlongsqft_living15sqft_lot15
0712930052010/13/2014221900.031.00118056501.0NaNNONE...7 Average11800.019550.09817847.5112-122.25713405650
1641410019212/9/2014538000.032.25257072422.0NONONE...7 Average2170400.019511991.09812547.7210-122.31916907639
256315004002/25/2015180000.021.00770100001.0NONONE...6 Low Average7700.01933NaN9802847.7379-122.23327208062
3248720087512/9/2014604000.043.00196050001.0NONONE...7 Average1050910.019650.09813647.5208-122.39313605000
419544005102/18/2015510000.032.00168080801.0NONONE...8 Good16800.019870.09807447.6168-122.04518007503
..................................................................
215922630000185/21/2014360000.032.50153011313.0NONONE...8 Good15300.020090.09810347.6993-122.34615301509
2159366000601202/23/2015400000.042.50231058132.0NONONE...8 Good23100.020140.09814647.5107-122.36218307200
2159415233001416/23/2014402101.020.75102013502.0NONONE...7 Average10200.020090.09814447.5944-122.29910202007
215952913101001/16/2015400000.032.50160023882.0NaNNONE...8 Good16000.020040.09802747.5345-122.06914101287
21596152330015710/15/2014325000.020.75102010762.0NONONE...7 Average10200.020080.09814447.5941-122.29910201357
\n", - "

21597 rows × 21 columns

\n", - "
" + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "# Creating a function that loads data and return it in a dataframe\n", + "def load_data(file_path):\n", + " house_data = pd.read_csv(file_path)\n", + "\n", + " #shape\n", + " shape = house_data.shape\n", + " print(f\"The dataset contains {shape[0]} houses with {shape[1]} features\")\n", + " print()\n", + " \n", + " #Data Types\n", + " data_types = house_data.dtypes\n", + " print(\"Columns and their data types:\")\n", + " for column, dtype in data_types.items():\n", + " print(f\"{column}: {dtype}\")\n", + " print()\n", + "\n", + " return house_data\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The dataset contains 21597 houses with 21 features\n", + "\n", + "Columns and their data types:\n", + "id: int64\n", + "date: object\n", + "price: float64\n", + "bedrooms: int64\n", + "bathrooms: float64\n", + "sqft_living: int64\n", + "sqft_lot: int64\n", + "floors: float64\n", + "waterfront: object\n", + "view: object\n", + "condition: object\n", + "grade: object\n", + "sqft_above: int64\n", + "sqft_basement: object\n", + "yr_built: int64\n", + "yr_renovated: float64\n", + "zipcode: int64\n", + "lat: float64\n", + "long: float64\n", + "sqft_living15: int64\n", + "sqft_lot15: int64\n", + "\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddatepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontview...gradesqft_abovesqft_basementyr_builtyr_renovatedzipcodelatlongsqft_living15sqft_lot15
0712930052010/13/2014221900.031.00118056501.0NaNNONE...7 Average11800.019550.09817847.5112-122.25713405650
1641410019212/9/2014538000.032.25257072422.0NONONE...7 Average2170400.019511991.09812547.7210-122.31916907639
256315004002/25/2015180000.021.00770100001.0NONONE...6 Low Average7700.01933NaN9802847.7379-122.23327208062
3248720087512/9/2014604000.043.00196050001.0NONONE...7 Average1050910.019650.09813647.5208-122.39313605000
419544005102/18/2015510000.032.00168080801.0NONONE...8 Good16800.019870.09807447.6168-122.04518007503
..................................................................
215922630000185/21/2014360000.032.50153011313.0NONONE...8 Good15300.020090.09810347.6993-122.34615301509
2159366000601202/23/2015400000.042.50231058132.0NONONE...8 Good23100.020140.09814647.5107-122.36218307200
2159415233001416/23/2014402101.020.75102013502.0NONONE...7 Average10200.020090.09814447.5944-122.29910202007
215952913101001/16/2015400000.032.50160023882.0NaNNONE...8 Good16000.020040.09802747.5345-122.06914101287
21596152330015710/15/2014325000.020.75102010762.0NONONE...7 Average10200.020080.09814447.5941-122.29910201357
\n", + "

21597 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + " id date price bedrooms bathrooms sqft_living \\\n", + "0 7129300520 10/13/2014 221900.0 3 1.00 1180 \n", + "1 6414100192 12/9/2014 538000.0 3 2.25 2570 \n", + "2 5631500400 2/25/2015 180000.0 2 1.00 770 \n", + "3 2487200875 12/9/2014 604000.0 4 3.00 1960 \n", + "4 1954400510 2/18/2015 510000.0 3 2.00 1680 \n", + "... ... ... ... ... ... ... \n", + "21592 263000018 5/21/2014 360000.0 3 2.50 1530 \n", + "21593 6600060120 2/23/2015 400000.0 4 2.50 2310 \n", + "21594 1523300141 6/23/2014 402101.0 2 0.75 1020 \n", + "21595 291310100 1/16/2015 400000.0 3 2.50 1600 \n", + "21596 1523300157 10/15/2014 325000.0 2 0.75 1020 \n", + "\n", + " sqft_lot floors waterfront view ... grade sqft_above \\\n", + "0 5650 1.0 NaN NONE ... 7 Average 1180 \n", + "1 7242 2.0 NO NONE ... 7 Average 2170 \n", + "2 10000 1.0 NO NONE ... 6 Low Average 770 \n", + "3 5000 1.0 NO NONE ... 7 Average 1050 \n", + "4 8080 1.0 NO NONE ... 8 Good 1680 \n", + "... ... ... ... ... ... ... ... \n", + "21592 1131 3.0 NO NONE ... 8 Good 1530 \n", + "21593 5813 2.0 NO NONE ... 8 Good 2310 \n", + "21594 1350 2.0 NO NONE ... 7 Average 1020 \n", + "21595 2388 2.0 NaN NONE ... 8 Good 1600 \n", + "21596 1076 2.0 NO NONE ... 7 Average 1020 \n", + "\n", + " sqft_basement yr_built yr_renovated zipcode lat long \\\n", + "0 0.0 1955 0.0 98178 47.5112 -122.257 \n", + "1 400.0 1951 1991.0 98125 47.7210 -122.319 \n", + "2 0.0 1933 NaN 98028 47.7379 -122.233 \n", + "3 910.0 1965 0.0 98136 47.5208 -122.393 \n", + "4 0.0 1987 0.0 98074 47.6168 -122.045 \n", + "... ... ... ... ... ... ... \n", + "21592 0.0 2009 0.0 98103 47.6993 -122.346 \n", + "21593 0.0 2014 0.0 98146 47.5107 -122.362 \n", + "21594 0.0 2009 0.0 98144 47.5944 -122.299 \n", + "21595 0.0 2004 0.0 98027 47.5345 -122.069 \n", + "21596 0.0 2008 0.0 98144 47.5941 -122.299 \n", + "\n", + " sqft_living15 sqft_lot15 \n", + "0 1340 5650 \n", + "1 1690 7639 \n", + "2 2720 8062 \n", + "3 1360 5000 \n", + "4 1800 7503 \n", + "... ... ... 
\n", + "21592 1530 1509 \n", + "21593 1830 7200 \n", + "21594 1020 2007 \n", + "21595 1410 1287 \n", + "21596 1020 1357 \n", + "\n", + "[21597 rows x 21 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } ], - "text/plain": [ - " id date price bedrooms bathrooms sqft_living \\\n", - "0 7129300520 10/13/2014 221900.0 3 1.00 1180 \n", - "1 6414100192 12/9/2014 538000.0 3 2.25 2570 \n", - "2 5631500400 2/25/2015 180000.0 2 1.00 770 \n", - "3 2487200875 12/9/2014 604000.0 4 3.00 1960 \n", - "4 1954400510 2/18/2015 510000.0 3 2.00 1680 \n", - "... ... ... ... ... ... ... \n", - "21592 263000018 5/21/2014 360000.0 3 2.50 1530 \n", - "21593 6600060120 2/23/2015 400000.0 4 2.50 2310 \n", - "21594 1523300141 6/23/2014 402101.0 2 0.75 1020 \n", - "21595 291310100 1/16/2015 400000.0 3 2.50 1600 \n", - "21596 1523300157 10/15/2014 325000.0 2 0.75 1020 \n", - "\n", - " sqft_lot floors waterfront view ... grade sqft_above \\\n", - "0 5650 1.0 NaN NONE ... 7 Average 1180 \n", - "1 7242 2.0 NO NONE ... 7 Average 2170 \n", - "2 10000 1.0 NO NONE ... 6 Low Average 770 \n", - "3 5000 1.0 NO NONE ... 7 Average 1050 \n", - "4 8080 1.0 NO NONE ... 8 Good 1680 \n", - "... ... ... ... ... ... ... ... \n", - "21592 1131 3.0 NO NONE ... 8 Good 1530 \n", - "21593 5813 2.0 NO NONE ... 8 Good 2310 \n", - "21594 1350 2.0 NO NONE ... 7 Average 1020 \n", - "21595 2388 2.0 NaN NONE ... 8 Good 1600 \n", - "21596 1076 2.0 NO NONE ... 7 Average 1020 \n", - "\n", - " sqft_basement yr_built yr_renovated zipcode lat long \\\n", - "0 0.0 1955 0.0 98178 47.5112 -122.257 \n", - "1 400.0 1951 1991.0 98125 47.7210 -122.319 \n", - "2 0.0 1933 NaN 98028 47.7379 -122.233 \n", - "3 910.0 1965 0.0 98136 47.5208 -122.393 \n", - "4 0.0 1987 0.0 98074 47.6168 -122.045 \n", - "... ... ... ... ... ... ... 
\n", - "21592 0.0 2009 0.0 98103 47.6993 -122.346 \n", - "21593 0.0 2014 0.0 98146 47.5107 -122.362 \n", - "21594 0.0 2009 0.0 98144 47.5944 -122.299 \n", - "21595 0.0 2004 0.0 98027 47.5345 -122.069 \n", - "21596 0.0 2008 0.0 98144 47.5941 -122.299 \n", - "\n", - " sqft_living15 sqft_lot15 \n", - "0 1340 5650 \n", - "1 1690 7639 \n", - "2 2720 8062 \n", - "3 1360 5000 \n", - "4 1800 7503 \n", - "... ... ... \n", - "21592 1530 1509 \n", - "21593 1830 7200 \n", - "21594 1020 2007 \n", - "21595 1410 1287 \n", - "21596 1020 1357 \n", - "\n", - "[21597 rows x 21 columns]" + "source": [ + "load_data('data/kc_house_data.csv') # Assuming 'data' folder is in the same directory\n", + "\n", + "\n" ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "load_data('data/kc_house_data.csv') # Assuming 'data' folder is in the same directory\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The dataset contains 21 columns, each representing a distinct feature, and 21,597 rows, with each row corresponding to a specific house sale entry.\n", - "\n", - "The dataset contains a mix of data types, including integers (int64), floating-point numbers (float64), and objects (strings). For instance, numerical features such as bedrooms, bathrooms, and sqft_living are represented as integers or floating-point numbers to facilitate mathematical computations, while categorical features like waterfront and view are stored as objects to accommodate text-based categories." 
- ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "The dataset contains 21597 houses with 21 features\n", - "\n", - "Columns and their data types:\n", - "id: int64\n", - "date: object\n", - "price: float64\n", - "bedrooms: int64\n", - "bathrooms: float64\n", - "sqft_living: int64\n", - "sqft_lot: int64\n", - "floors: float64\n", - "waterfront: object\n", - "view: object\n", - "condition: object\n", - "grade: object\n", - "sqft_above: int64\n", - "sqft_basement: object\n", - "yr_built: int64\n", - "yr_renovated: float64\n", - "zipcode: int64\n", - "lat: float64\n", - "long: float64\n", - "sqft_living15: int64\n", - "sqft_lot15: int64\n", - "\n" - ] - } - ], - "source": [ - "kings_data = load_data('data/kc_house_data.csv')" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "#create a function that takes in a column and returns the column statistics as a dictionary\n", - "def descriptive_analytics(column):\n", - " stats_dict = column.describe().to_dict()\n", - " \n", - " print(\"Descriptive Statistics for Column '{}':\".format(column.name))\n", - " print(\"The count of the column is:\", stats_dict['count'])\n", - " print(\"The mean of the column is:\", stats_dict['mean'])\n", - " print(\"The standard deviation of the column is:\", stats_dict['std'])\n", - " print(\"The minimum value of the column is:\", stats_dict['min'])\n", - " print(\"The 25th percentile of the column is:\", stats_dict['25%'])\n", - " print(\"The median of the column is:\", stats_dict['50%'])\n", - " print(\"The 75th percentile of the column is:\", stats_dict['75%'])\n", - " print(\"The maximum value of the column is:\", stats_dict['max'])" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The dataset contains 21 columns, each 
representing a distinct feature, and 21,597 rows, with each row corresponding to a specific house sale entry.\n", + "\n", + "The dataset contains a mix of data types, including integers (int64), floating-point numbers (float64), and objects (strings). For instance, numerical features such as bedrooms, bathrooms, and sqft_living are represented as integers or floating-point numbers to facilitate mathematical computations, while categorical features like waterfront and view are stored as objects to accommodate text-based categories." + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "Descriptive Statistics for Column 'price':\n", - "The count of the column is: 21597.0\n", - "The mean of the column is: 540296.5735055795\n", - "The standard deviation of the column is: 367368.1401013936\n", - "The minimum value of the column is: 78000.0\n", - "The 25th percentile of the column is: 322000.0\n", - "The median of the column is: 450000.0\n", - "The 75th percentile of the column is: 645000.0\n", - "The maximum value of the column is: 7700000.0\n" - ] - } - ], - "source": [ - "descriptive_analytics(kings_data['price'])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can see that the maximum price of a house is 7700000 dollars and the minimum price is 78000 dollars\n", - "\n", - "There are 21597 prices regarding to the houses in the dataset\n", - "\n", - "Average price of a house is 540296.57 dollars" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Data Preperation\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The dataset contains 21597 houses with 21 features\n", + "\n", + "Columns and their data types:\n", + "id: int64\n", + "date: object\n", + "price: float64\n", + "bedrooms: int64\n", + 
"bathrooms: float64\n", + "sqft_living: int64\n", + "sqft_lot: int64\n", + "floors: float64\n", + "waterfront: object\n", + "view: object\n", + "condition: object\n", + "grade: object\n", + "sqft_above: int64\n", + "sqft_basement: object\n", + "yr_built: int64\n", + "yr_renovated: float64\n", + "zipcode: int64\n", + "lat: float64\n", + "long: float64\n", + "sqft_living15: int64\n", + "sqft_lot15: int64\n", + "\n" + ] + } + ], + "source": [ + "kings_data = load_data('data/kc_house_data.csv')" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "RangeIndex: 21597 entries, 0 to 21596\n", - "Data columns (total 21 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 id 21597 non-null int64 \n", - " 1 date 21597 non-null object \n", - " 2 price 21597 non-null float64\n", - " 3 bedrooms 21597 non-null int64 \n", - " 4 bathrooms 21597 non-null float64\n", - " 5 sqft_living 21597 non-null int64 \n", - " 6 sqft_lot 21597 non-null int64 \n", - " 7 floors 21597 non-null float64\n", - " 8 waterfront 19221 non-null object \n", - " 9 view 21534 non-null object \n", - " 10 condition 21597 non-null object \n", - " 11 grade 21597 non-null object \n", - " 12 sqft_above 21597 non-null int64 \n", - " 13 sqft_basement 21597 non-null object \n", - " 14 yr_built 21597 non-null int64 \n", - " 15 yr_renovated 17755 non-null float64\n", - " 16 zipcode 21597 non-null int64 \n", - " 17 lat 21597 non-null float64\n", - " 18 long 21597 non-null float64\n", - " 19 sqft_living15 21597 non-null int64 \n", - " 20 sqft_lot15 21597 non-null int64 \n", - "dtypes: float64(6), int64(9), object(6)\n", - "memory usage: 3.5+ MB\n" - ] - } - ], - "source": [ - "kings_data.info()" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "def identify_issues(dataset):\n", - " # Identify missing values as a percentage of the whole dataset\n", - " missing_values = 
(dataset.isnull().sum())/len(dataset) * 100\n", - "\n", - " # Identify duplicates\n", - " duplicates = dataset.duplicated().sum()\n", - " \n", - " #return a dictionary \n", - " return {'duplicates': duplicates,\n", - " 'missing values': missing_values.round(2)} \n" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "#create a function that takes in a column and returns the column statistics as a dictionary\n", + "def descriptive_analytics(column):\n", + " stats_dict = column.describe().to_dict()\n", + " \n", + " print(\"Descriptive Statistics for Column '{}':\".format(column.name))\n", + " print(\"The count of the column is:\", stats_dict['count'])\n", + " print(\"The mean of the column is:\", stats_dict['mean'])\n", + " print(\"The standard deviation of the column is:\", stats_dict['std'])\n", + " print(\"The minimum value of the column is:\", stats_dict['min'])\n", + " print(\"The 25th percentile of the column is:\", stats_dict['25%'])\n", + " print(\"The median of the column is:\", stats_dict['50%'])\n", + " print(\"The 75th percentile of the column is:\", stats_dict['75%'])\n", + " print(\"The maximum value of the column is:\", stats_dict['max'])" + ] + }, { - "data": { - "text/plain": [ - "{'duplicates': 0,\n", - " 'missing values': id 0.00\n", - " date 0.00\n", - " price 0.00\n", - " bedrooms 0.00\n", - " bathrooms 0.00\n", - " sqft_living 0.00\n", - " sqft_lot 0.00\n", - " floors 0.00\n", - " waterfront 11.00\n", - " view 0.29\n", - " condition 0.00\n", - " grade 0.00\n", - " sqft_above 0.00\n", - " sqft_basement 0.00\n", - " yr_built 0.00\n", - " yr_renovated 17.79\n", - " zipcode 0.00\n", - " lat 0.00\n", - " long 0.00\n", - " sqft_living15 0.00\n", - " sqft_lot15 0.00\n", - " dtype: float64}" + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": 
"stream", + "text": [ + "Descriptive Statistics for Column 'price':\n", + "The count of the column is: 21597.0\n", + "The mean of the column is: 540296.5735055795\n", + "The standard deviation of the column is: 367368.1401013936\n", + "The minimum value of the column is: 78000.0\n", + "The 25th percentile of the column is: 322000.0\n", + "The median of the column is: 450000.0\n", + "The 75th percentile of the column is: 645000.0\n", + "The maximum value of the column is: 7700000.0\n" + ] + } + ], + "source": [ + "descriptive_analytics(kings_data['price'])" ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "identify_issues(kings_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The examination indicates that there are no duplicate entries within the dataset, ensuring the integrity of the records. However, attention is warranted to address missing values present in certain columns. Specifically, the 'waterfront' feature exhibits 11% of null values, representing a negligible portion of the dataset. 
Similarly, the 'yr_renovated' feature shows a relatively higher percentage of missing values, accounting for approximately 17.79% of the dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Before making changes make a copy instead of overwriting data" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "house_data_clean = kings_data.copy()" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "# Changing the date to date time\n", - "house_data_clean['date'] = pd.to_datetime(house_data_clean['date'])\n", - "\n", - "# Extracting only the year from the column Date\n", - "house_data_clean.date = house_data_clean['date'].dt.year\n", - "\n", - "# Changing the dates for the year built \n", - "house_data_clean['yr_built'] = pd.to_datetime(house_data_clean['yr_built'],format='%Y').dt.year\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The above code converts the 'date' column data to only contain the year the house was sold, for the purpose of analysis we will use only the year since the changes month by month will be minor." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Dealing with the missing values" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "def missing_values(dataset):\n", - " # drop the rows from views\n", - " dataset.dropna(subset=['view'],inplace=True)\n", - "\n", - " # Filling the NaN values for waterfront with NO\n", - " dataset.waterfront.fillna('NO',inplace=True)\n", - " \n", - " # Dropping the yr_renovated column \n", - " dataset.drop('yr_renovated',axis=1,inplace=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "missing_values(house_data_clean)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "'yr_renovated' has the highest percentage of NaN values 17%. This will be dropped since it will not be used within our model inline with the business problem.\n", - "\n", - "'Waterfront' feature has 11% null values, this was filled with NO on the assumption that these cells were not filled since they lacked waterfronts\n", - "\n", - "For the 'View' column, the null values were dropped by row since the overall percentage impact would be minute" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ + }, { - "data": { - "text/plain": [ - "{'duplicates': 2,\n", - " 'missing values': id 0.0\n", - " date 0.0\n", - " price 0.0\n", - " bedrooms 0.0\n", - " bathrooms 0.0\n", - " sqft_living 0.0\n", - " sqft_lot 0.0\n", - " floors 0.0\n", - " waterfront 0.0\n", - " view 0.0\n", - " condition 0.0\n", - " grade 0.0\n", - " sqft_above 0.0\n", - " sqft_basement 0.0\n", - " yr_built 0.0\n", - " zipcode 0.0\n", - " lat 0.0\n", - " long 0.0\n", - " sqft_living15 0.0\n", - " sqft_lot15 0.0\n", - " dtype: float64}" + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see that the maximum price of a house is 7700000 dollars and the minimum price is 
78000 dollars\n", + "\n", + "There are 21597 prices regarding to the houses in the dataset\n", + "\n", + "Average price of a house is 540296.57 dollars" ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "identify_issues(house_data_clean)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Preperation\n", + "\n" + ] + }, { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
iddatepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontviewconditiongradesqft_abovesqft_basementyr_builtzipcodelatlongsqft_living15sqft_lot15
394718250690312014550000.041.75241084472.0NOGOODGood8 Good2060350.019369807447.6499-122.088252014789
2003886489001102014555000.032.50194032112.0NONONEAverage8 Good19400.020099802747.5644-122.09318803078
\n", - "
" + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 21597 entries, 0 to 21596\n", + "Data columns (total 21 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 id 21597 non-null int64 \n", + " 1 date 21597 non-null object \n", + " 2 price 21597 non-null float64\n", + " 3 bedrooms 21597 non-null int64 \n", + " 4 bathrooms 21597 non-null float64\n", + " 5 sqft_living 21597 non-null int64 \n", + " 6 sqft_lot 21597 non-null int64 \n", + " 7 floors 21597 non-null float64\n", + " 8 waterfront 19221 non-null object \n", + " 9 view 21534 non-null object \n", + " 10 condition 21597 non-null object \n", + " 11 grade 21597 non-null object \n", + " 12 sqft_above 21597 non-null int64 \n", + " 13 sqft_basement 21597 non-null object \n", + " 14 yr_built 21597 non-null int64 \n", + " 15 yr_renovated 17755 non-null float64\n", + " 16 zipcode 21597 non-null int64 \n", + " 17 lat 21597 non-null float64\n", + " 18 long 21597 non-null float64\n", + " 19 sqft_living15 21597 non-null int64 \n", + " 20 sqft_lot15 21597 non-null int64 \n", + "dtypes: float64(6), int64(9), object(6)\n", + "memory usage: 3.5+ MB\n" + ] + } ], - "text/plain": [ - " id date price bedrooms bathrooms sqft_living sqft_lot \\\n", - "3947 1825069031 2014 550000.0 4 1.75 2410 8447 \n", - "20038 8648900110 2014 555000.0 3 2.50 1940 3211 \n", - "\n", - " floors waterfront view condition grade sqft_above sqft_basement \\\n", - "3947 2.0 NO GOOD Good 8 Good 2060 350.0 \n", - "20038 2.0 NO NONE Average 8 Good 1940 0.0 \n", - "\n", - " yr_built zipcode lat long sqft_living15 sqft_lot15 \n", - "3947 1936 98074 47.6499 -122.088 2520 14789 \n", - "20038 2009 98027 47.5644 -122.093 1880 3078 " + "source": [ + "kings_data.info()" ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" + }, + { + "cell_type": "code", + "execution_count": 8, 
+ "metadata": {}, + "outputs": [], + "source": [ + "def identify_issues(dataset):\n", + " # Identify missing values as a percentage of the whole dataset\n", + " missing_values = (dataset.isnull().sum())/len(dataset) * 100\n", + "\n", + " # Identify duplicates\n", + " duplicates = dataset.duplicated().sum()\n", + " \n", + " #return a dictionary \n", + " return {'duplicates': duplicates,\n", + " 'missing values': missing_values.round(2)} \n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'duplicates': 0,\n", + " 'missing values': id 0.00\n", + " date 0.00\n", + " price 0.00\n", + " bedrooms 0.00\n", + " bathrooms 0.00\n", + " sqft_living 0.00\n", + " sqft_lot 0.00\n", + " floors 0.00\n", + " waterfront 11.00\n", + " view 0.29\n", + " condition 0.00\n", + " grade 0.00\n", + " sqft_above 0.00\n", + " sqft_basement 0.00\n", + " yr_built 0.00\n", + " yr_renovated 17.79\n", + " zipcode 0.00\n", + " lat 0.00\n", + " long 0.00\n", + " sqft_living15 0.00\n", + " sqft_lot15 0.00\n", + " dtype: float64}" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "identify_issues(kings_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The examination indicates that there are no duplicate entries within the dataset, ensuring the integrity of the records. However, attention is warranted to address missing values present in certain columns. Specifically, the 'waterfront' feature exhibits 11% of null values, representing a negligible portion of the dataset. 
Similarly, the 'yr_renovated' feature shows a relatively higher percentage of missing values, accounting for approximately 17.79% of the dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Before making changes make a copy instead of overwriting data" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "house_data_clean = kings_data.copy()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# Changing the date to date time\n", + "house_data_clean['date'] = pd.to_datetime(house_data_clean['date'])\n", + "\n", + "# Extracting only the year from the column Date\n", + "house_data_clean.date = house_data_clean['date'].dt.year\n", + "\n", + "# Changing the dates for the year built \n", + "house_data_clean['yr_built'] = pd.to_datetime(house_data_clean['yr_built'],format='%Y').dt.year\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The above code converts the 'date' column data to only contain the year the house was sold, for the purpose of analysis we will use only the year since the changes month by month will be minor." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Dealing with the missing values" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "def missing_values(dataset):\n", + " # drop the rows from views\n", + " dataset.dropna(subset=['view'],inplace=True)\n", + "\n", + " # Filling the NaN values for waterfront with NO\n", + " dataset.waterfront.fillna('NO',inplace=True)\n", + " \n", + " # Dropping the yr_renovated column \n", + " dataset.drop('yr_renovated',axis=1,inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "missing_values(house_data_clean)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "'yr_renovated' has the highest percentage of NaN values 17%. This will be dropped since it will not be used within our model inline with the business problem.\n", + "\n", + "'Waterfront' feature has 11% null values, this was filled with NO on the assumption that these cells were not filled since they lacked waterfronts\n", + "\n", + "For the 'View' column, the null values were dropped by row since the overall percentage impact would be minute" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'duplicates': 2,\n", + " 'missing values': id 0.0\n", + " date 0.0\n", + " price 0.0\n", + " bedrooms 0.0\n", + " bathrooms 0.0\n", + " sqft_living 0.0\n", + " sqft_lot 0.0\n", + " floors 0.0\n", + " waterfront 0.0\n", + " view 0.0\n", + " condition 0.0\n", + " grade 0.0\n", + " sqft_above 0.0\n", + " sqft_basement 0.0\n", + " yr_built 0.0\n", + " zipcode 0.0\n", + " lat 0.0\n", + " long 0.0\n", + " sqft_living15 0.0\n", + " sqft_lot15 0.0\n", + " dtype: float64}" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "identify_issues(house_data_clean)" + ] + }, + 
{ + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddatepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontviewconditiongradesqft_abovesqft_basementyr_builtzipcodelatlongsqft_living15sqft_lot15
394718250690312014550000.041.75241084472.0NOGOODGood8 Good2060350.019369807447.6499-122.088252014789
2003886489001102014555000.032.50194032112.0NONONEAverage8 Good19400.020099802747.5644-122.09318803078
\n", + "
" + ], + "text/plain": [ + " id date price bedrooms bathrooms sqft_living sqft_lot \\\n", + "3947 1825069031 2014 550000.0 4 1.75 2410 8447 \n", + "20038 8648900110 2014 555000.0 3 2.50 1940 3211 \n", + "\n", + " floors waterfront view condition grade sqft_above sqft_basement \\\n", + "3947 2.0 NO GOOD Good 8 Good 2060 350.0 \n", + "20038 2.0 NO NONE Average 8 Good 1940 0.0 \n", + "\n", + " yr_built zipcode lat long sqft_living15 sqft_lot15 \n", + "3947 1936 98074 47.6499 -122.088 2520 14789 \n", + "20038 2009 98027 47.5644 -122.093 1880 3078 " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "house_data_clean[house_data_clean.duplicated()]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" } - ], - "source": [ - "house_data_clean[house_data_clean.duplicated()]" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - } - }, - "nbformat": 4, - "nbformat_minor": 2 + "nbformat": 4, + "nbformat_minor": 2 } From e409f03f1cd19c8332e7b24b138dae76150ffc25 Mon Sep 17 00:00:00 2001 From: clydeochieng <107258512+clydeochieng@users.noreply.github.com> Date: Tue, 30 Apr 2024 23:53:19 +0300 Subject: [PATCH 22/42] Update student.ipynb --- student.ipynb | 206 ++++++++++++++++++++++++-------------------------- 1 file changed, 99 insertions(+), 107 deletions(-) diff --git 
a/student.ipynb b/student.ipynb index 09a140ae..e4128c3d 100644 --- a/student.ipynb +++ b/student.ipynb @@ -1,111 +1,103 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Final Project Submission\n", - "\n", - "Please fill out:\n", -Clyde - "* Student name: Solphine Joseph, Grace Rotich, Mathew Kiprotich, Hilary Simiyu, Clyde Ochieng. \n", - - "* Student name: Solphine Joseph, Grace Rotich, Mathew Kiprotich, Hilary Simiyu, Clyde Ochieng, Derrick Kiptoo.\n", - main - "* Student pace: full time\n", - "* Scheduled project review date/time: \n", - "* Instructor name: Nikita \n", - "* Blog post URL:\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Kings County Housing Analysis with Multiple Linear Regression" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Overview\n", - "\n", - "\n", - "A real estate agency in Kingsway seeks to determine what are the contributing factors that affect the price of houses to make improvements where necessary. They want to employ an analytical approach rather than sentimental before arriving at a decision. Multilinear regression has been used for this project to understand how various features affect their pricing to better their services." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Business Problem\n", - "\n", - "In the face of market fluctuations and heightened competition within the real estate sector, our agency is grappling with pricing volatility, which poses significant challenges for our agents in devising effective business strategies. We seek strategic guidance to optimize our purchasing and selling endeavors, prioritizing informed decision-making to identify key areas of focus that promise maximum returns on investment." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Objectives\n", - "* To determine the key factors influencing house prices.\n", - "* To develop multilinear regression models to predict house prices based on relevant features.\n", - "* To use insights from the regression analysis to optimize pricing strategies for both purchasing and selling properties.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Hypothesis\n", - "* Null Hypothesis - There is no relationship between our independent variables and our dependent variable \n", - "\n", - "* Alternative Hypothesis - There is a relationship between our independent variables and our dependent variable" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Data Understanding:\n", - "\n", - "In this project, we utilized the King County House Sales dataset, which serves as the foundational dataset for our analysis. It was sourced Kaggle.The dataset encompasses comprehensive information regarding house sales within King County, Washington, USA. It comprises a diverse array of features, including the number of bedrooms, bathrooms, square footage, as well as geographical and pricing details of the properties sold. This dataset is frequently employed in data science and machine learning endeavors, particularly for predictive modeling tasks such as regression analysis aimed at forecasting house prices based on the provided features." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### King County Housing Data Columns \n", - "\n", - "The column names contained in column_names.md are:\n", - "* `id`: A unique identifier for each house sale.\n", - "* `date`: The date when the house was sold.\n", - "* `price`: The sale price of the house, serving as the target variable for predictive modeling.\n", - "* `bedrooms`, `bathrooms`, `sqft_living`, `sqft_lot`: Numerical features representing the number of bedrooms and bathrooms, as well as the living area and lot area of the house, respectively.\n", - "* `floors`: The number of floors in the house.\n", - "* `waterfront`, `view`, `condition`, `grade`: Categorical features describing aspects such as waterfront availability, property view, condition, and overall grade assigned to the housing unit.\n", - "* `yr_built`, `yr_renovated`: Year of construction and renovation of the house.\n", - "* `zipcode`, `lat`, `long`: Geographical features including ZIP code, latitude, and longitude coordinates.\n", - "* `sqft_above`, `sqft_basement`, `sqft_living15`, `sqft_lot15`: Additional numerical features providing details about the house's above-ground and basement square footage, as well as living area and lot area of the nearest 15 neighboring houses." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Data Loading\n", - "\n", - "#### Import Necessary Libraries" - ] - }, - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Final Project Submission\n", + "\n", + "* Student name: Solphine Joseph, Grace Rotich, Mathew Kiprotich, Hilary Simiyu, Clyde Ochieng. 
\n", + "* Student name: Solphine Joseph, Grace Rotich, Mathew Kiprotich, Hilary Simiyu, Clyde Ochieng, Derrick Kiptoo.\n", + "* Student pace: full time\n", + "* Scheduled project review date/time: \n", + "* Instructor name: Nikita \n", + "* Blog post URL:\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Kings County Housing Analysis with Multiple Linear Regression" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Overview\n", + "\n", + "A real estate agency in Kingsway seeks to determine what are the contributing factors that affect the price of houses to make improvements where necessary. They want to employ an analytical approach rather than sentimental before arriving at a decision. Multilinear regression has been used for this project to understand how various features affect their pricing to better their services." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Business Problem\n", + "\n", + "In the face of market fluctuations and heightened competition within the real estate sector, our agency is grappling with pricing volatility, which poses significant challenges for our agents in devising effective business strategies. We seek strategic guidance to optimize our purchasing and selling endeavors, prioritizing informed decision-making to identify key areas of focus that promise maximum returns on investment." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Objectives\n", + "* To determine the key factors influencing house prices.\n", + "* To develop multilinear regression models to predict house prices based on relevant features.\n", + "* To use insights from the regression analysis to optimize pricing strategies for both purchasing and selling properties.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Hypothesis\n", + "* Null Hypothesis - There is no relationship between our independent variables and our dependent variable \n", + "\n", + "* Alternative Hypothesis - There is a relationship between our independent variables and our dependent variable" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Data Understanding:\n", + "\n", + "In this project, we utilized the King County House Sales dataset, which serves as the foundational dataset for our analysis. It was sourced Kaggle.The dataset encompasses comprehensive information regarding house sales within King County, Washington, USA. It comprises a diverse array of features, including the number of bedrooms, bathrooms, square footage, as well as geographical and pricing details of the properties sold. This dataset is frequently employed in data science and machine learning endeavors, particularly for predictive modeling tasks such as regression analysis aimed at forecasting house prices based on the provided features." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### King County Housing Data Columns \n", + "\n", + "The column names contained in column_names.md are:\n", + "* `id`: A unique identifier for each house sale.\n", + "* `date`: The date when the house was sold.\n", + "* `price`: The sale price of the house, serving as the target variable for predictive modeling.\n", + "* `bedrooms`, `bathrooms`, `sqft_living`, `sqft_lot`: Numerical features representing the number of bedrooms and bathrooms, as well as the living area and lot area of the house, respectively.\n", + "* `floors`: The number of floors in the house.\n", + "* `waterfront`, `view`, `condition`, `grade`: Categorical features describing aspects such as waterfront availability, property view, condition, and overall grade assigned to the housing unit.\n", + "* `yr_built`, `yr_renovated`: Year of construction and renovation of the house.\n", + "* `zipcode`, `lat`, `long`: Geographical features including ZIP code, latitude, and longitude coordinates.\n", + "* `sqft_above`, `sqft_basement`, `sqft_living15`, `sqft_lot15`: Additional numerical features providing details about the house's above-ground and basement square footage, as well as living area and lot area of the nearest 15 neighboring houses." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Loading\n", + "\n" + ] + }, + { + "cell_type": "code", + "metadata": {}, + "source": [ "import matplotlib.pyplot as plt\n", "%matplotlib inline\n", "import numpy as np\n", From 9824144694280c2400705fbd29f06296d7cb192d Mon Sep 17 00:00:00 2001 From: clydeochieng <107258512+clydeochieng@users.noreply.github.com> Date: Tue, 30 Apr 2024 23:55:32 +0300 Subject: [PATCH 23/42] Update student.ipynb --- student.ipynb | 2070 +++++++++++++++++++++++++------------------------ 1 file changed, 1036 insertions(+), 1034 deletions(-) diff --git a/student.ipynb b/student.ipynb index e4128c3d..3d526e43 100644 --- a/student.ipynb +++ b/student.ipynb @@ -1,1053 +1,1055 @@ { - "cells": [ - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Final Project Submission\n", - "\n", - "* Student name: Solphine Joseph, Grace Rotich, Mathew Kiprotich, Hilary Simiyu, Clyde Ochieng. \n", - "* Student name: Solphine Joseph, Grace Rotich, Mathew Kiprotich, Hilary Simiyu, Clyde Ochieng, Derrick Kiptoo.\n", - "* Student pace: full time\n", - "* Scheduled project review date/time: \n", - "* Instructor name: Nikita \n", - "* Blog post URL:\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Kings County Housing Analysis with Multiple Linear Regression" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Overview\n", - "\n", - "A real estate agency in Kingsway seeks to determine what are the contributing factors that affect the price of houses to make improvements where necessary. They want to employ an analytical approach rather than sentimental before arriving at a decision. Multilinear regression has been used for this project to understand how various features affect their pricing to better their services." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Business Problem\n", - "\n", - "In the face of market fluctuations and heightened competition within the real estate sector, our agency is grappling with pricing volatility, which poses significant challenges for our agents in devising effective business strategies. We seek strategic guidance to optimize our purchasing and selling endeavors, prioritizing informed decision-making to identify key areas of focus that promise maximum returns on investment." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Objectives\n", - "* To determine the key factors influencing house prices.\n", - "* To develop multilinear regression models to predict house prices based on relevant features.\n", - "* To use insights from the regression analysis to optimize pricing strategies for both purchasing and selling properties.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Hypothesis\n", - "* Null Hypothesis - There is no relationship between our independent variables and our dependent variable \n", - "\n", - "* Alternative Hypothesis - There is a relationship between our independent variables and our dependent variable" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Data Understanding:\n", - "\n", - "In this project, we utilized the King County House Sales dataset, which serves as the foundational dataset for our analysis. It was sourced Kaggle.The dataset encompasses comprehensive information regarding house sales within King County, Washington, USA. It comprises a diverse array of features, including the number of bedrooms, bathrooms, square footage, as well as geographical and pricing details of the properties sold. 
This dataset is frequently employed in data science and machine learning endeavors, particularly for predictive modeling tasks such as regression analysis aimed at forecasting house prices based on the provided features." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### King County Housing Data Columns \n", - "\n", - "The column names contained in column_names.md are:\n", - "* `id`: A unique identifier for each house sale.\n", - "* `date`: The date when the house was sold.\n", - "* `price`: The sale price of the house, serving as the target variable for predictive modeling.\n", - "* `bedrooms`, `bathrooms`, `sqft_living`, `sqft_lot`: Numerical features representing the number of bedrooms and bathrooms, as well as the living area and lot area of the house, respectively.\n", - "* `floors`: The number of floors in the house.\n", - "* `waterfront`, `view`, `condition`, `grade`: Categorical features describing aspects such as waterfront availability, property view, condition, and overall grade assigned to the housing unit.\n", - "* `yr_built`, `yr_renovated`: Year of construction and renovation of the house.\n", - "* `zipcode`, `lat`, `long`: Geographical features including ZIP code, latitude, and longitude coordinates.\n", - "* `sqft_above`, `sqft_basement`, `sqft_living15`, `sqft_lot15`: Additional numerical features providing details about the house's above-ground and basement square footage, as well as living area and lot area of the nearest 15 neighboring houses." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Data Loading\n", - "\n" - ] - }, - { - "cell_type": "code", - "metadata": {}, - "source": [ - "import matplotlib.pyplot as plt\n", - "%matplotlib inline\n", - "import numpy as np\n", - "import pandas as pd\n", - "import scipy.stats as stats\n", - "import seaborn as sns\n", - "import statsmodels.api as sm\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Loading Data" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "# Creating a function that loads data and return it in a dataframe\n", - "def load_data(file_path):\n", - " house_data = pd.read_csv(file_path)\n", - "\n", - " #shape\n", - " shape = house_data.shape\n", - " print(f\"The dataset contains {shape[0]} houses with {shape[1]} features\")\n", - " print()\n", - " \n", - " #Data Types\n", - " data_types = house_data.dtypes\n", - " print(\"Columns and their data types:\")\n", - " for column, dtype in data_types.items():\n", - " print(f\"{column}: {dtype}\")\n", - " print()\n", - "\n", - " return house_data\n" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ + "cells": [ + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Final Project Submission\n", + "\n", + "* Student name: Solphine Joseph, Grace Rotich, Mathew Kiprotich, Hilary Simiyu, Clyde Ochieng. 
\n", + "* Student name: Solphine Joseph, Grace Rotich, Mathew Kiprotich, Hilary Simiyu, Clyde Ochieng, Derrick Kiptoo.\n", + "* Student pace: full time\n", + "* Scheduled project review date/time: \n", + "* Instructor name: Nikita \n", + "* Blog post URL:\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "# Kings County Housing Analysis with Multiple Linear Regression" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Overview\n", + "\n", + "A real estate agency in Kingsway seeks to determine what are the contributing factors that affect the price of houses to make improvements where necessary. They want to employ an analytical approach rather than sentimental before arriving at a decision. Multilinear regression has been used for this project to understand how various features affect their pricing to better their services." + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Business Problem\n", + "\n", + "In the face of market fluctuations and heightened competition within the real estate sector, our agency is grappling with pricing volatility, which poses significant challenges for our agents in devising effective business strategies. We seek strategic guidance to optimize our purchasing and selling endeavors, prioritizing informed decision-making to identify key areas of focus that promise maximum returns on investment." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Objectives\n", + "* To determine the key factors influencing house prices.\n", + "* To develop multilinear regression models to predict house prices based on relevant features.\n", + "* To use insights from the regression analysis to optimize pricing strategies for both purchasing and selling properties.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Hypothesis\n", + "* Null Hypothesis - There is no relationship between our independent variables and our dependent variable \n", + "\n", + "* Alternative Hypothesis - There is a relationship between our independent variables and our dependent variable" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Data Understanding:\n", + "\n", + "In this project, we utilized the King County House Sales dataset, which serves as the foundational dataset for our analysis. It was sourced Kaggle.The dataset encompasses comprehensive information regarding house sales within King County, Washington, USA. It comprises a diverse array of features, including the number of bedrooms, bathrooms, square footage, as well as geographical and pricing details of the properties sold. This dataset is frequently employed in data science and machine learning endeavors, particularly for predictive modeling tasks such as regression analysis aimed at forecasting house prices based on the provided features." 
+ ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "The dataset contains 21597 houses with 21 features\n", - "\n", - "Columns and their data types:\n", - "id: int64\n", - "date: object\n", - "price: float64\n", - "bedrooms: int64\n", - "bathrooms: float64\n", - "sqft_living: int64\n", - "sqft_lot: int64\n", - "floors: float64\n", - "waterfront: object\n", - "view: object\n", - "condition: object\n", - "grade: object\n", - "sqft_above: int64\n", - "sqft_basement: object\n", - "yr_built: int64\n", - "yr_renovated: float64\n", - "zipcode: int64\n", - "lat: float64\n", - "long: float64\n", - "sqft_living15: int64\n", - "sqft_lot15: int64\n", - "\n" - ] + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### King County Housing Data Columns \n", + "\n", + "The column names contained in column_names.md are:\n", + "* `id`: A unique identifier for each house sale.\n", + "* `date`: The date when the house was sold.\n", + "* `price`: The sale price of the house, serving as the target variable for predictive modeling.\n", + "* `bedrooms`, `bathrooms`, `sqft_living`, `sqft_lot`: Numerical features representing the number of bedrooms and bathrooms, as well as the living area and lot area of the house, respectively.\n", + "* `floors`: The number of floors in the house.\n", + "* `waterfront`, `view`, `condition`, `grade`: Categorical features describing aspects such as waterfront availability, property view, condition, and overall grade assigned to the housing unit.\n", + "* `yr_built`, `yr_renovated`: Year of construction and renovation of the house.\n", + "* `zipcode`, `lat`, `long`: Geographical features including ZIP code, latitude, and longitude coordinates.\n", + "* `sqft_above`, `sqft_basement`, `sqft_living15`, `sqft_lot15`: Additional numerical features providing details about the house's above-ground and basement square footage, as well as living area and lot area of the nearest 15 neighboring houses." 
+ ] }, { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " 
\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
iddatepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontview...gradesqft_abovesqft_basementyr_builtyr_renovatedzipcodelatlongsqft_living15sqft_lot15
0712930052010/13/2014221900.031.00118056501.0NaNNONE...7 Average11800.019550.09817847.5112-122.25713405650
1641410019212/9/2014538000.032.25257072422.0NONONE...7 Average2170400.019511991.09812547.7210-122.31916907639
256315004002/25/2015180000.021.00770100001.0NONONE...6 Low Average7700.01933NaN9802847.7379-122.23327208062
3248720087512/9/2014604000.043.00196050001.0NONONE...7 Average1050910.019650.09813647.5208-122.39313605000
419544005102/18/2015510000.032.00168080801.0NONONE...8 Good16800.019870.09807447.6168-122.04518007503
..................................................................
215922630000185/21/2014360000.032.50153011313.0NONONE...8 Good15300.020090.09810347.6993-122.34615301509
2159366000601202/23/2015400000.042.50231058132.0NONONE...8 Good23100.020140.09814647.5107-122.36218307200
2159415233001416/23/2014402101.020.75102013502.0NONONE...7 Average10200.020090.09814447.5944-122.29910202007
215952913101001/16/2015400000.032.50160023882.0NaNNONE...8 Good16000.020040.09802747.5345-122.06914101287
21596152330015710/15/2014325000.020.75102010762.0NONONE...7 Average10200.020080.09814447.5941-122.29910201357
\n", - "

21597 rows × 21 columns

\n", - "
" + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Loading\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "import numpy as np\n", + "import pandas as pd\n", + "import scipy.stats as stats\n", + "import seaborn as sns\n", + "import statsmodels.api as sm\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Loading Data" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "# Creating a function that loads data and return it in a dataframe\n", + "def load_data(file_path):\n", + " house_data = pd.read_csv(file_path)\n", + "\n", + " #shape\n", + " shape = house_data.shape\n", + " print(f\"The dataset contains {shape[0]} houses with {shape[1]} features\")\n", + " print()\n", + " \n", + " #Data Types\n", + " data_types = house_data.dtypes\n", + " print(\"Columns and their data types:\")\n", + " for column, dtype in data_types.items():\n", + " print(f\"{column}: {dtype}\")\n", + " print()\n", + "\n", + " return house_data\n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The dataset contains 21597 houses with 21 features\n", + "\n", + "Columns and their data types:\n", + "id: int64\n", + "date: object\n", + "price: float64\n", + "bedrooms: int64\n", + "bathrooms: float64\n", + "sqft_living: int64\n", + "sqft_lot: int64\n", + "floors: float64\n", + "waterfront: object\n", + "view: object\n", + "condition: object\n", + "grade: object\n", + "sqft_above: int64\n", + "sqft_basement: object\n", + "yr_built: int64\n", + "yr_renovated: float64\n", + "zipcode: int64\n", + "lat: float64\n", + "long: float64\n", + "sqft_living15: int64\n", + "sqft_lot15: int64\n", + "\n" + ] + }, + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddatepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontview...gradesqft_abovesqft_basementyr_builtyr_renovatedzipcodelatlongsqft_living15sqft_lot15
0712930052010/13/2014221900.031.00118056501.0NaNNONE...7 Average11800.019550.09817847.5112-122.25713405650
1641410019212/9/2014538000.032.25257072422.0NONONE...7 Average2170400.019511991.09812547.7210-122.31916907639
256315004002/25/2015180000.021.00770100001.0NONONE...6 Low Average7700.01933NaN9802847.7379-122.23327208062
3248720087512/9/2014604000.043.00196050001.0NONONE...7 Average1050910.019650.09813647.5208-122.39313605000
419544005102/18/2015510000.032.00168080801.0NONONE...8 Good16800.019870.09807447.6168-122.04518007503
..................................................................
215922630000185/21/2014360000.032.50153011313.0NONONE...8 Good15300.020090.09810347.6993-122.34615301509
2159366000601202/23/2015400000.042.50231058132.0NONONE...8 Good23100.020140.09814647.5107-122.36218307200
2159415233001416/23/2014402101.020.75102013502.0NONONE...7 Average10200.020090.09814447.5944-122.29910202007
215952913101001/16/2015400000.032.50160023882.0NaNNONE...8 Good16000.020040.09802747.5345-122.06914101287
21596152330015710/15/2014325000.020.75102010762.0NONONE...7 Average10200.020080.09814447.5941-122.29910201357
\n", + "

21597 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + " id date price bedrooms bathrooms sqft_living \\\n", + "0 7129300520 10/13/2014 221900.0 3 1.00 1180 \n", + "1 6414100192 12/9/2014 538000.0 3 2.25 2570 \n", + "2 5631500400 2/25/2015 180000.0 2 1.00 770 \n", + "3 2487200875 12/9/2014 604000.0 4 3.00 1960 \n", + "4 1954400510 2/18/2015 510000.0 3 2.00 1680 \n", + "... ... ... ... ... ... ... \n", + "21592 263000018 5/21/2014 360000.0 3 2.50 1530 \n", + "21593 6600060120 2/23/2015 400000.0 4 2.50 2310 \n", + "21594 1523300141 6/23/2014 402101.0 2 0.75 1020 \n", + "21595 291310100 1/16/2015 400000.0 3 2.50 1600 \n", + "21596 1523300157 10/15/2014 325000.0 2 0.75 1020 \n", + "\n", + " sqft_lot floors waterfront view ... grade sqft_above \\\n", + "0 5650 1.0 NaN NONE ... 7 Average 1180 \n", + "1 7242 2.0 NO NONE ... 7 Average 2170 \n", + "2 10000 1.0 NO NONE ... 6 Low Average 770 \n", + "3 5000 1.0 NO NONE ... 7 Average 1050 \n", + "4 8080 1.0 NO NONE ... 8 Good 1680 \n", + "... ... ... ... ... ... ... ... \n", + "21592 1131 3.0 NO NONE ... 8 Good 1530 \n", + "21593 5813 2.0 NO NONE ... 8 Good 2310 \n", + "21594 1350 2.0 NO NONE ... 7 Average 1020 \n", + "21595 2388 2.0 NaN NONE ... 8 Good 1600 \n", + "21596 1076 2.0 NO NONE ... 7 Average 1020 \n", + "\n", + " sqft_basement yr_built yr_renovated zipcode lat long \\\n", + "0 0.0 1955 0.0 98178 47.5112 -122.257 \n", + "1 400.0 1951 1991.0 98125 47.7210 -122.319 \n", + "2 0.0 1933 NaN 98028 47.7379 -122.233 \n", + "3 910.0 1965 0.0 98136 47.5208 -122.393 \n", + "4 0.0 1987 0.0 98074 47.6168 -122.045 \n", + "... ... ... ... ... ... ... \n", + "21592 0.0 2009 0.0 98103 47.6993 -122.346 \n", + "21593 0.0 2014 0.0 98146 47.5107 -122.362 \n", + "21594 0.0 2009 0.0 98144 47.5944 -122.299 \n", + "21595 0.0 2004 0.0 98027 47.5345 -122.069 \n", + "21596 0.0 2008 0.0 98144 47.5941 -122.299 \n", + "\n", + " sqft_living15 sqft_lot15 \n", + "0 1340 5650 \n", + "1 1690 7639 \n", + "2 2720 8062 \n", + "3 1360 5000 \n", + "4 1800 7503 \n", + "... ... ... 
\n", + "21592 1530 1509 \n", + "21593 1830 7200 \n", + "21594 1020 2007 \n", + "21595 1410 1287 \n", + "21596 1020 1357 \n", + "\n", + "[21597 rows x 21 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } ], - "text/plain": [ - " id date price bedrooms bathrooms sqft_living \\\n", - "0 7129300520 10/13/2014 221900.0 3 1.00 1180 \n", - "1 6414100192 12/9/2014 538000.0 3 2.25 2570 \n", - "2 5631500400 2/25/2015 180000.0 2 1.00 770 \n", - "3 2487200875 12/9/2014 604000.0 4 3.00 1960 \n", - "4 1954400510 2/18/2015 510000.0 3 2.00 1680 \n", - "... ... ... ... ... ... ... \n", - "21592 263000018 5/21/2014 360000.0 3 2.50 1530 \n", - "21593 6600060120 2/23/2015 400000.0 4 2.50 2310 \n", - "21594 1523300141 6/23/2014 402101.0 2 0.75 1020 \n", - "21595 291310100 1/16/2015 400000.0 3 2.50 1600 \n", - "21596 1523300157 10/15/2014 325000.0 2 0.75 1020 \n", - "\n", - " sqft_lot floors waterfront view ... grade sqft_above \\\n", - "0 5650 1.0 NaN NONE ... 7 Average 1180 \n", - "1 7242 2.0 NO NONE ... 7 Average 2170 \n", - "2 10000 1.0 NO NONE ... 6 Low Average 770 \n", - "3 5000 1.0 NO NONE ... 7 Average 1050 \n", - "4 8080 1.0 NO NONE ... 8 Good 1680 \n", - "... ... ... ... ... ... ... ... \n", - "21592 1131 3.0 NO NONE ... 8 Good 1530 \n", - "21593 5813 2.0 NO NONE ... 8 Good 2310 \n", - "21594 1350 2.0 NO NONE ... 7 Average 1020 \n", - "21595 2388 2.0 NaN NONE ... 8 Good 1600 \n", - "21596 1076 2.0 NO NONE ... 7 Average 1020 \n", - "\n", - " sqft_basement yr_built yr_renovated zipcode lat long \\\n", - "0 0.0 1955 0.0 98178 47.5112 -122.257 \n", - "1 400.0 1951 1991.0 98125 47.7210 -122.319 \n", - "2 0.0 1933 NaN 98028 47.7379 -122.233 \n", - "3 910.0 1965 0.0 98136 47.5208 -122.393 \n", - "4 0.0 1987 0.0 98074 47.6168 -122.045 \n", - "... ... ... ... ... ... ... 
\n", - "21592 0.0 2009 0.0 98103 47.6993 -122.346 \n", - "21593 0.0 2014 0.0 98146 47.5107 -122.362 \n", - "21594 0.0 2009 0.0 98144 47.5944 -122.299 \n", - "21595 0.0 2004 0.0 98027 47.5345 -122.069 \n", - "21596 0.0 2008 0.0 98144 47.5941 -122.299 \n", - "\n", - " sqft_living15 sqft_lot15 \n", - "0 1340 5650 \n", - "1 1690 7639 \n", - "2 2720 8062 \n", - "3 1360 5000 \n", - "4 1800 7503 \n", - "... ... ... \n", - "21592 1530 1509 \n", - "21593 1830 7200 \n", - "21594 1020 2007 \n", - "21595 1410 1287 \n", - "21596 1020 1357 \n", - "\n", - "[21597 rows x 21 columns]" + "source": [ + "load_data('data/kc_house_data.csv') # Assuming 'data' folder is in the same directory\n", + "\n", + "\n" ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "load_data('data/kc_house_data.csv') # Assuming 'data' folder is in the same directory\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The dataset contains 21 columns, each representing a distinct feature, and 21,597 rows, with each row corresponding to a specific house sale entry.\n", - "\n", - "The dataset contains a mix of data types, including integers (int64), floating-point numbers (float64), and objects (strings). For instance, numerical features such as bedrooms, bathrooms, and sqft_living are represented as integers or floating-point numbers to facilitate mathematical computations, while categorical features like waterfront and view are stored as objects to accommodate text-based categories." 
- ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "The dataset contains 21597 houses with 21 features\n", - "\n", - "Columns and their data types:\n", - "id: int64\n", - "date: object\n", - "price: float64\n", - "bedrooms: int64\n", - "bathrooms: float64\n", - "sqft_living: int64\n", - "sqft_lot: int64\n", - "floors: float64\n", - "waterfront: object\n", - "view: object\n", - "condition: object\n", - "grade: object\n", - "sqft_above: int64\n", - "sqft_basement: object\n", - "yr_built: int64\n", - "yr_renovated: float64\n", - "zipcode: int64\n", - "lat: float64\n", - "long: float64\n", - "sqft_living15: int64\n", - "sqft_lot15: int64\n", - "\n" - ] - } - ], - "source": [ - "kings_data = load_data('data/kc_house_data.csv')" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "#create a function that takes in a column and returns the column statistics as a dictionary\n", - "def descriptive_analytics(column):\n", - " stats_dict = column.describe().to_dict()\n", - " \n", - " print(\"Descriptive Statistics for Column '{}':\".format(column.name))\n", - " print(\"The count of the column is:\", stats_dict['count'])\n", - " print(\"The mean of the column is:\", stats_dict['mean'])\n", - " print(\"The standard deviation of the column is:\", stats_dict['std'])\n", - " print(\"The minimum value of the column is:\", stats_dict['min'])\n", - " print(\"The 25th percentile of the column is:\", stats_dict['25%'])\n", - " print(\"The median of the column is:\", stats_dict['50%'])\n", - " print(\"The 75th percentile of the column is:\", stats_dict['75%'])\n", - " print(\"The maximum value of the column is:\", stats_dict['max'])" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The dataset contains 21 columns, each 
representing a distinct feature, and 21,597 rows, with each row corresponding to a specific house sale entry.\n", + "\n", + "The dataset contains a mix of data types, including integers (int64), floating-point numbers (float64), and objects (strings). For instance, numerical features such as bedrooms, bathrooms, and sqft_living are represented as integers or floating-point numbers to facilitate mathematical computations, while categorical features like waterfront and view are stored as objects to accommodate text-based categories." + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "Descriptive Statistics for Column 'price':\n", - "The count of the column is: 21597.0\n", - "The mean of the column is: 540296.5735055795\n", - "The standard deviation of the column is: 367368.1401013936\n", - "The minimum value of the column is: 78000.0\n", - "The 25th percentile of the column is: 322000.0\n", - "The median of the column is: 450000.0\n", - "The 75th percentile of the column is: 645000.0\n", - "The maximum value of the column is: 7700000.0\n" - ] - } - ], - "source": [ - "descriptive_analytics(kings_data['price'])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can see that the maximum price of a house is 7700000 dollars and the minimum price is 78000 dollars\n", - "\n", - "There are 21597 prices regarding to the houses in the dataset\n", - "\n", - "Average price of a house is 540296.57 dollars" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Data Preperation\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The dataset contains 21597 houses with 21 features\n", + "\n", + "Columns and their data types:\n", + "id: int64\n", + "date: object\n", + "price: float64\n", + "bedrooms: int64\n", + 
"bathrooms: float64\n", + "sqft_living: int64\n", + "sqft_lot: int64\n", + "floors: float64\n", + "waterfront: object\n", + "view: object\n", + "condition: object\n", + "grade: object\n", + "sqft_above: int64\n", + "sqft_basement: object\n", + "yr_built: int64\n", + "yr_renovated: float64\n", + "zipcode: int64\n", + "lat: float64\n", + "long: float64\n", + "sqft_living15: int64\n", + "sqft_lot15: int64\n", + "\n" + ] + } + ], + "source": [ + "kings_data = load_data('data/kc_house_data.csv')" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "RangeIndex: 21597 entries, 0 to 21596\n", - "Data columns (total 21 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 id 21597 non-null int64 \n", - " 1 date 21597 non-null object \n", - " 2 price 21597 non-null float64\n", - " 3 bedrooms 21597 non-null int64 \n", - " 4 bathrooms 21597 non-null float64\n", - " 5 sqft_living 21597 non-null int64 \n", - " 6 sqft_lot 21597 non-null int64 \n", - " 7 floors 21597 non-null float64\n", - " 8 waterfront 19221 non-null object \n", - " 9 view 21534 non-null object \n", - " 10 condition 21597 non-null object \n", - " 11 grade 21597 non-null object \n", - " 12 sqft_above 21597 non-null int64 \n", - " 13 sqft_basement 21597 non-null object \n", - " 14 yr_built 21597 non-null int64 \n", - " 15 yr_renovated 17755 non-null float64\n", - " 16 zipcode 21597 non-null int64 \n", - " 17 lat 21597 non-null float64\n", - " 18 long 21597 non-null float64\n", - " 19 sqft_living15 21597 non-null int64 \n", - " 20 sqft_lot15 21597 non-null int64 \n", - "dtypes: float64(6), int64(9), object(6)\n", - "memory usage: 3.5+ MB\n" - ] - } - ], - "source": [ - "kings_data.info()" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "def identify_issues(dataset):\n", - " # Identify missing values as a percentage of the whole dataset\n", - " missing_values = 
(dataset.isnull().sum())/len(dataset) * 100\n", - "\n", - " # Identify duplicates\n", - " duplicates = dataset.duplicated().sum()\n", - " \n", - " #return a dictionary \n", - " return {'duplicates': duplicates,\n", - " 'missing values': missing_values.round(2)} \n" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "#create a function that takes in a column and returns the column statistics as a dictionary\n", + "def descriptive_analytics(column):\n", + " stats_dict = column.describe().to_dict()\n", + " \n", + " print(\"Descriptive Statistics for Column '{}':\".format(column.name))\n", + " print(\"The count of the column is:\", stats_dict['count'])\n", + " print(\"The mean of the column is:\", stats_dict['mean'])\n", + " print(\"The standard deviation of the column is:\", stats_dict['std'])\n", + " print(\"The minimum value of the column is:\", stats_dict['min'])\n", + " print(\"The 25th percentile of the column is:\", stats_dict['25%'])\n", + " print(\"The median of the column is:\", stats_dict['50%'])\n", + " print(\"The 75th percentile of the column is:\", stats_dict['75%'])\n", + " print(\"The maximum value of the column is:\", stats_dict['max'])" + ] + }, { - "data": { - "text/plain": [ - "{'duplicates': 0,\n", - " 'missing values': id 0.00\n", - " date 0.00\n", - " price 0.00\n", - " bedrooms 0.00\n", - " bathrooms 0.00\n", - " sqft_living 0.00\n", - " sqft_lot 0.00\n", - " floors 0.00\n", - " waterfront 11.00\n", - " view 0.29\n", - " condition 0.00\n", - " grade 0.00\n", - " sqft_above 0.00\n", - " sqft_basement 0.00\n", - " yr_built 0.00\n", - " yr_renovated 17.79\n", - " zipcode 0.00\n", - " lat 0.00\n", - " long 0.00\n", - " sqft_living15 0.00\n", - " sqft_lot15 0.00\n", - " dtype: float64}" + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": 
"stream", + "text": [ + "Descriptive Statistics for Column 'price':\n", + "The count of the column is: 21597.0\n", + "The mean of the column is: 540296.5735055795\n", + "The standard deviation of the column is: 367368.1401013936\n", + "The minimum value of the column is: 78000.0\n", + "The 25th percentile of the column is: 322000.0\n", + "The median of the column is: 450000.0\n", + "The 75th percentile of the column is: 645000.0\n", + "The maximum value of the column is: 7700000.0\n" + ] + } + ], + "source": [ + "descriptive_analytics(kings_data['price'])" ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "identify_issues(kings_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The examination indicates that there are no duplicate entries within the dataset, ensuring the integrity of the records. However, attention is warranted to address missing values present in certain columns. Specifically, the 'waterfront' feature exhibits 11% of null values, representing a negligible portion of the dataset. 
Similarly, the 'yr_renovated' feature shows a relatively higher percentage of missing values, accounting for approximately 17.79% of the dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Before making changes make a copy instead of overwriting data" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "house_data_clean = kings_data.copy()" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "# Changing the date to date time\n", - "house_data_clean['date'] = pd.to_datetime(house_data_clean['date'])\n", - "\n", - "# Extracting only the year from the column Date\n", - "house_data_clean.date = house_data_clean['date'].dt.year\n", - "\n", - "# Changing the dates for the year built \n", - "house_data_clean['yr_built'] = pd.to_datetime(house_data_clean['yr_built'],format='%Y').dt.year\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The above code converts the 'date' column data to only contain the year the house was sold, for the purpose of analysis we will use only the year since the changes month by month will be minor." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Dealing with the missing values" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "def missing_values(dataset):\n", - " # drop the rows from views\n", - " dataset.dropna(subset=['view'],inplace=True)\n", - "\n", - " # Filling the NaN values for waterfront with NO\n", - " dataset.waterfront.fillna('NO',inplace=True)\n", - " \n", - " # Dropping the yr_renovated column \n", - " dataset.drop('yr_renovated',axis=1,inplace=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "missing_values(house_data_clean)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "'yr_renovated' has the highest percentage of NaN values 17%. This will be dropped since it will not be used within our model inline with the business problem.\n", - "\n", - "'Waterfront' feature has 11% null values, this was filled with NO on the assumption that these cells were not filled since they lacked waterfronts\n", - "\n", - "For the 'View' column, the null values were dropped by row since the overall percentage impact would be minute" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ + }, { - "data": { - "text/plain": [ - "{'duplicates': 2,\n", - " 'missing values': id 0.0\n", - " date 0.0\n", - " price 0.0\n", - " bedrooms 0.0\n", - " bathrooms 0.0\n", - " sqft_living 0.0\n", - " sqft_lot 0.0\n", - " floors 0.0\n", - " waterfront 0.0\n", - " view 0.0\n", - " condition 0.0\n", - " grade 0.0\n", - " sqft_above 0.0\n", - " sqft_basement 0.0\n", - " yr_built 0.0\n", - " zipcode 0.0\n", - " lat 0.0\n", - " long 0.0\n", - " sqft_living15 0.0\n", - " sqft_lot15 0.0\n", - " dtype: float64}" + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see that the maximum price of a house is 7700000 dollars and the minimum price is 
78000 dollars\n", + "\n", + "There are 21597 prices regarding to the houses in the dataset\n", + "\n", + "Average price of a house is 540296.57 dollars" ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "identify_issues(house_data_clean)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ + }, { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
iddatepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontviewconditiongradesqft_abovesqft_basementyr_builtzipcodelatlongsqft_living15sqft_lot15
394718250690312014550000.041.75241084472.0NOGOODGood8 Good2060350.019369807447.6499-122.088252014789
2003886489001102014555000.032.50194032112.0NONONEAverage8 Good19400.020099802747.5644-122.09318803078
\n", - "
" + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Preperation\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 21597 entries, 0 to 21596\n", + "Data columns (total 21 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 id 21597 non-null int64 \n", + " 1 date 21597 non-null object \n", + " 2 price 21597 non-null float64\n", + " 3 bedrooms 21597 non-null int64 \n", + " 4 bathrooms 21597 non-null float64\n", + " 5 sqft_living 21597 non-null int64 \n", + " 6 sqft_lot 21597 non-null int64 \n", + " 7 floors 21597 non-null float64\n", + " 8 waterfront 19221 non-null object \n", + " 9 view 21534 non-null object \n", + " 10 condition 21597 non-null object \n", + " 11 grade 21597 non-null object \n", + " 12 sqft_above 21597 non-null int64 \n", + " 13 sqft_basement 21597 non-null object \n", + " 14 yr_built 21597 non-null int64 \n", + " 15 yr_renovated 17755 non-null float64\n", + " 16 zipcode 21597 non-null int64 \n", + " 17 lat 21597 non-null float64\n", + " 18 long 21597 non-null float64\n", + " 19 sqft_living15 21597 non-null int64 \n", + " 20 sqft_lot15 21597 non-null int64 \n", + "dtypes: float64(6), int64(9), object(6)\n", + "memory usage: 3.5+ MB\n" + ] + } + ], + "source": [ + "kings_data.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "def identify_issues(dataset):\n", + " # Identify missing values as a percentage of the whole dataset\n", + " missing_values = (dataset.isnull().sum())/len(dataset) * 100\n", + "\n", + " # Identify duplicates\n", + " duplicates = dataset.duplicated().sum()\n", + " \n", + " #return a dictionary \n", + " return {'duplicates': duplicates,\n", + " 'missing values': missing_values.round(2)} \n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + 
"metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'duplicates': 0,\n", + " 'missing values': id 0.00\n", + " date 0.00\n", + " price 0.00\n", + " bedrooms 0.00\n", + " bathrooms 0.00\n", + " sqft_living 0.00\n", + " sqft_lot 0.00\n", + " floors 0.00\n", + " waterfront 11.00\n", + " view 0.29\n", + " condition 0.00\n", + " grade 0.00\n", + " sqft_above 0.00\n", + " sqft_basement 0.00\n", + " yr_built 0.00\n", + " yr_renovated 17.79\n", + " zipcode 0.00\n", + " lat 0.00\n", + " long 0.00\n", + " sqft_living15 0.00\n", + " sqft_lot15 0.00\n", + " dtype: float64}" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } ], - "text/plain": [ - " id date price bedrooms bathrooms sqft_living sqft_lot \\\n", - "3947 1825069031 2014 550000.0 4 1.75 2410 8447 \n", - "20038 8648900110 2014 555000.0 3 2.50 1940 3211 \n", - "\n", - " floors waterfront view condition grade sqft_above sqft_basement \\\n", - "3947 2.0 NO GOOD Good 8 Good 2060 350.0 \n", - "20038 2.0 NO NONE Average 8 Good 1940 0.0 \n", - "\n", - " yr_built zipcode lat long sqft_living15 sqft_lot15 \n", - "3947 1936 98074 47.6499 -122.088 2520 14789 \n", - "20038 2009 98027 47.5644 -122.093 1880 3078 " + "source": [ + "identify_issues(kings_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The examination indicates that there are no duplicate entries within the dataset, ensuring the integrity of the records. However, attention is warranted to address missing values present in certain columns. Specifically, the 'waterfront' feature exhibits 11% of null values, representing a negligible portion of the dataset. 
Similarly, the 'yr_renovated' feature shows a relatively higher percentage of missing values, accounting for approximately 17.79% of the dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Before making changes make a copy instead of overwriting data" ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "house_data_clean = kings_data.copy()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# Changing the date to date time\n", + "house_data_clean['date'] = pd.to_datetime(house_data_clean['date'])\n", + "\n", + "# Extracting only the year from the column Date\n", + "house_data_clean.date = house_data_clean['date'].dt.year\n", + "\n", + "# Changing the dates for the year built \n", + "house_data_clean['yr_built'] = pd.to_datetime(house_data_clean['yr_built'],format='%Y').dt.year\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The above code converts the 'date' column data to only contain the year the house was sold, for the purpose of analysis we will use only the year since the changes month by month will be minor." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Dealing with the missing values" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "def missing_values(dataset):\n", + " # drop the rows from views\n", + " dataset.dropna(subset=['view'],inplace=True)\n", + "\n", + " # Filling the NaN values for waterfront with NO\n", + " dataset.waterfront.fillna('NO',inplace=True)\n", + " \n", + " # Dropping the yr_renovated column \n", + " dataset.drop('yr_renovated',axis=1,inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "missing_values(house_data_clean)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "'yr_renovated' has the highest percentage of NaN values 17%. This will be dropped since it will not be used within our model inline with the business problem.\n", + "\n", + "'Waterfront' feature has 11% null values, this was filled with NO on the assumption that these cells were not filled since they lacked waterfronts\n", + "\n", + "For the 'View' column, the null values were dropped by row since the overall percentage impact would be minute" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'duplicates': 2,\n", + " 'missing values': id 0.0\n", + " date 0.0\n", + " price 0.0\n", + " bedrooms 0.0\n", + " bathrooms 0.0\n", + " sqft_living 0.0\n", + " sqft_lot 0.0\n", + " floors 0.0\n", + " waterfront 0.0\n", + " view 0.0\n", + " condition 0.0\n", + " grade 0.0\n", + " sqft_above 0.0\n", + " sqft_basement 0.0\n", + " yr_built 0.0\n", + " zipcode 0.0\n", + " lat 0.0\n", + " long 0.0\n", + " sqft_living15 0.0\n", + " sqft_lot15 0.0\n", + " dtype: float64}" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "identify_issues(house_data_clean)" + ] + }, + 
{ + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddatepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontviewconditiongradesqft_abovesqft_basementyr_builtzipcodelatlongsqft_living15sqft_lot15
394718250690312014550000.041.75241084472.0NOGOODGood8 Good2060350.019369807447.6499-122.088252014789
2003886489001102014555000.032.50194032112.0NONONEAverage8 Good19400.020099802747.5644-122.09318803078
\n", + "
" + ], + "text/plain": [ + " id date price bedrooms bathrooms sqft_living sqft_lot \\\n", + "3947 1825069031 2014 550000.0 4 1.75 2410 8447 \n", + "20038 8648900110 2014 555000.0 3 2.50 1940 3211 \n", + "\n", + " floors waterfront view condition grade sqft_above sqft_basement \\\n", + "3947 2.0 NO GOOD Good 8 Good 2060 350.0 \n", + "20038 2.0 NO NONE Average 8 Good 1940 0.0 \n", + "\n", + " yr_built zipcode lat long sqft_living15 sqft_lot15 \n", + "3947 1936 98074 47.6499 -122.088 2520 14789 \n", + "20038 2009 98027 47.5644 -122.093 1880 3078 " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "house_data_clean[house_data_clean.duplicated()]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" } - ], - "source": [ - "house_data_clean[house_data_clean.duplicated()]" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - } - }, - "nbformat": 4, - "nbformat_minor": 2 + "nbformat": 4, + "nbformat_minor": 2 } From b05f711c493aa16e95ed47e02c83d8d582ed1acc Mon Sep 17 00:00:00 2001 From: clydeochieng <107258512+clydeochieng@users.noreply.github.com> Date: Tue, 30 Apr 2024 23:56:31 +0300 Subject: [PATCH 24/42] Update student.ipynb --- student.ipynb | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/student.ipynb b/student.ipynb index 3d526e43..6e366080 100644 
--- a/student.ipynb +++ b/student.ipynb @@ -6,8 +6,7 @@ "source": [ "## Final Project Submission\n", "\n", - "* Student name: Solphine Joseph, Grace Rotich, Mathew Kiprotich, Hilary Simiyu, Clyde Ochieng. \n", - "* Student name: Solphine Joseph, Grace Rotich, Mathew Kiprotich, Hilary Simiyu, Clyde Ochieng, Derrick Kiptoo.\n", + "* Student name: Solphine Joseph, Grace Rotich, Mathew Kiprotich, Hilary Simiyu, Clyde Ochieng.\n", "* Student pace: full time\n", "* Scheduled project review date/time: \n", "* Instructor name: Nikita \n", From 990054edf2f11aae7589f9f0b24f0079e57ea7e3 Mon Sep 17 00:00:00 2001 From: clydeochieng <107258512+clydeochieng@users.noreply.github.com> Date: Wed, 1 May 2024 10:18:15 +0300 Subject: [PATCH 25/42] Update student.ipynb --- student.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/student.ipynb b/student.ipynb index cea94b8e..bde1baf8 100644 --- a/student.ipynb +++ b/student.ipynb @@ -6,7 +6,7 @@ "source": [ "## Final Project Submission\n", "\n", - "* Student name: Solphine Joseph, Grace Rotich, Mathew Kiprotich, Hilary Simiyu, Clyde Ochieng. 
\n", + "* Student name: Solphine Joseph, Grace Rotich, Mathew Kiprotich, Hilary Simiyu, Clyde Ochieng, Derrick Kiptoo \n", "* Student pace: full time\n", "* Scheduled project review date/time: \n", "* Instructor name: Nikita \n", From 57e844e323b484dcd7257fad75cfa16dbc1972f1 Mon Sep 17 00:00:00 2001 From: clydeochieng <107258512+clydeochieng@users.noreply.github.com> Date: Wed, 1 May 2024 10:24:54 +0300 Subject: [PATCH 26/42] Update student.ipynb --- student.ipynb | 38 +++++++++++++++++++------------------- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/student.ipynb b/student.ipynb index 6e366080..f8c41695 100644 --- a/student.ipynb +++ b/student.ipynb @@ -95,7 +95,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 16, "metadata": {}, "outputs": [], "source": [ @@ -117,7 +117,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 17, "metadata": {}, "outputs": [], "source": [ @@ -142,7 +142,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 18, "metadata": {}, "outputs": [ { @@ -546,7 +546,7 @@ "[21597 rows x 21 columns]" ] }, - "execution_count": 3, + "execution_count": 18, "metadata": {}, "output_type": "execute_result" } @@ -568,7 +568,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 19, "metadata": {}, "outputs": [ { @@ -609,7 +609,7 @@ }, { "cell_type": "code", - "execution_count": 5, + "execution_count": 20, "metadata": {}, "outputs": [], "source": [ @@ -630,7 +630,7 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": 21, "metadata": {}, "outputs": [ { @@ -674,7 +674,7 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": 22, "metadata": {}, "outputs": [ { @@ -718,7 +718,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": 23, "metadata": {}, "outputs": [], "source": [ @@ -736,7 +736,7 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": 24, "metadata": {}, 
"outputs": [ { @@ -767,7 +767,7 @@ " dtype: float64}" ] }, - "execution_count": 9, + "execution_count": 24, "metadata": {}, "output_type": "execute_result" } @@ -792,7 +792,7 @@ }, { "cell_type": "code", - "execution_count": 10, + "execution_count": 25, "metadata": {}, "outputs": [], "source": [ @@ -801,7 +801,7 @@ }, { "cell_type": "code", - "execution_count": 11, + "execution_count": 26, "metadata": {}, "outputs": [], "source": [ @@ -831,7 +831,7 @@ }, { "cell_type": "code", - "execution_count": 12, + "execution_count": 27, "metadata": {}, "outputs": [], "source": [ @@ -848,7 +848,7 @@ }, { "cell_type": "code", - "execution_count": 13, + "execution_count": 28, "metadata": {}, "outputs": [], "source": [ @@ -868,7 +868,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 29, "metadata": {}, "outputs": [ { @@ -898,7 +898,7 @@ " dtype: float64}" ] }, - "execution_count": 14, + "execution_count": 29, "metadata": {}, "output_type": "execute_result" } @@ -909,7 +909,7 @@ }, { "cell_type": "code", - "execution_count": 15, + "execution_count": 30, "metadata": {}, "outputs": [ { @@ -1020,7 +1020,7 @@ "20038 2009 98027 47.5644 -122.093 1880 3078 " ] }, - "execution_count": 15, + "execution_count": 30, "metadata": {}, "output_type": "execute_result" } From aa1c14e5ff58f5b2840b8e60e2cf447ab8214c60 Mon Sep 17 00:00:00 2001 From: clydeochieng <107258512+clydeochieng@users.noreply.github.com> Date: Wed, 1 May 2024 10:34:20 +0300 Subject: [PATCH 27/42] Update student.ipynb --- student.ipynb | 138 +------------------------------------------------- 1 file changed, 1 insertion(+), 137 deletions(-) diff --git a/student.ipynb b/student.ipynb index 6fdc9263..27522862 100644 --- a/student.ipynb +++ b/student.ipynb @@ -6,148 +6,13 @@ "source": [ "## Final Project Submission\n", "\n", -Clyde - - "* Student name: Solphine Joseph, Grace Rotich, Mathew Kiprotich, Hilary Simiyu, Clyde Ochieng, Derrick Kiptoo \n", - main + "* Student name: Solphine Joseph, Grace Rotich, 
Mathew Kiprotich, Hilary Simiyu, Clyde Ochieng, Derrick Kiptoo. \n", "* Student pace: full time\n", "* Scheduled project review date/time: \n", "* Instructor name: Nikita \n", "* Blog post URL:\n" ] }, - Clyde - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Kings County Housing Analysis with Multiple Linear Regression" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Overview\n", - "\n", - "A real estate agency in Kingsway seeks to determine what are the contributing factors that affect the price of houses to make improvements where necessary. They want to employ an analytical approach rather than sentimental before arriving at a decision. Multilinear regression has been used for this project to understand how various features affect their pricing to better their services." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Business Problem\n", - "\n", - "In the face of market fluctuations and heightened competition within the real estate sector, our agency is grappling with pricing volatility, which poses significant challenges for our agents in devising effective business strategies. We seek strategic guidance to optimize our purchasing and selling endeavors, prioritizing informed decision-making to identify key areas of focus that promise maximum returns on investment." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Objectives\n", - "* To determine the key factors influencing house prices.\n", - "* To develop multilinear regression models to predict house prices based on relevant features.\n", - "* To use insights from the regression analysis to optimize pricing strategies for both purchasing and selling properties.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Hypothesis\n", - "* Null Hypothesis - There is no relationship between our independent variables and our dependent variable \n", - "\n", - "* Alternative Hypothesis - There is a relationship between our independent variables and our dependent variable" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Data Understanding:\n", - "\n", - "In this project, we utilized the King County House Sales dataset, which serves as the foundational dataset for our analysis. It was sourced Kaggle.The dataset encompasses comprehensive information regarding house sales within King County, Washington, USA. It comprises a diverse array of features, including the number of bedrooms, bathrooms, square footage, as well as geographical and pricing details of the properties sold. This dataset is frequently employed in data science and machine learning endeavors, particularly for predictive modeling tasks such as regression analysis aimed at forecasting house prices based on the provided features." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### King County Housing Data Columns \n", - "\n", - "The column names contained in column_names.md are:\n", - "* `id`: A unique identifier for each house sale.\n", - "* `date`: The date when the house was sold.\n", - "* `price`: The sale price of the house, serving as the target variable for predictive modeling.\n", - "* `bedrooms`, `bathrooms`, `sqft_living`, `sqft_lot`: Numerical features representing the number of bedrooms and bathrooms, as well as the living area and lot area of the house, respectively.\n", - "* `floors`: The number of floors in the house.\n", - "* `waterfront`, `view`, `condition`, `grade`: Categorical features describing aspects such as waterfront availability, property view, condition, and overall grade assigned to the housing unit.\n", - "* `yr_built`, `yr_renovated`: Year of construction and renovation of the house.\n", - "* `zipcode`, `lat`, `long`: Geographical features including ZIP code, latitude, and longitude coordinates.\n", - "* `sqft_above`, `sqft_basement`, `sqft_living15`, `sqft_lot15`: Additional numerical features providing details about the house's above-ground and basement square footage, as well as living area and lot area of the nearest 15 neighboring houses." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Data Loading\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "%matplotlib inline\n", - "import numpy as np\n", - "import pandas as pd\n", - "import scipy.stats as stats\n", - "import seaborn as sns\n", - "import statsmodels.api as sm\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Loading Data" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "# Creating a function that loads data and return it in a dataframe\n", - "def load_data(file_path):\n", - " house_data = pd.read_csv(file_path)\n", - "\n", - " #shape\n", - " shape = house_data.shape\n", - " print(f\"The dataset contains {shape[0]} houses with {shape[1]} features\")\n", - " print()\n", - " \n", - " #Data Types\n", - " data_types = house_data.dtypes\n", - " print(\"Columns and their data types:\")\n", - " for column, dtype in data_types.items():\n", - " print(f\"{column}: {dtype}\")\n", - " print()\n", - "\n", - " return house_data\n" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - { "cell_type": "markdown", "metadata": {}, @@ -256,7 +121,6 @@ Clyde { "cell_type": "code", "execution_count": 3, - main "metadata": {}, "outputs": [ { From cbb5de189d98ee4c56dc7720b876ef3fa82fabf4 Mon Sep 17 00:00:00 2001 From: clydeochieng <107258512+clydeochieng@users.noreply.github.com> Date: Wed, 1 May 2024 11:06:49 +0300 Subject: [PATCH 28/42] Update student.ipynb --- student.ipynb | 1118 ++++++++++++++++--------------------------------- 1 file changed, 352 insertions(+), 766 deletions(-) diff --git a/student.ipynb b/student.ipynb index 27522862..247e570c 100644 --- a/student.ipynb +++ b/student.ipynb @@ -342,6 +342,7 @@ " ...\n", " ...\n", " ...\n", + " ...\n", " \n", " \n", " 21592\n", @@ -425,7 +426,7 @@ " 1600\n", " 2388\n", 
" 2.0\n", - " NaN\n", + " NO\n", " NONE\n", " ...\n", " 8 Good\n", @@ -439,33 +440,9 @@ " 1410\n", " 1287\n", " \n", - " \n", - " 21596\n", - " 1523300157\n", - " 10/15/2014\n", - " 325000.0\n", - " 2\n", - " 0.75\n", - " 1020\n", - " 1076\n", - " 2.0\n", - " NO\n", - " NONE\n", - " ...\n", - " 7 Average\n", - " 1020\n", - " 0.0\n", - " 2008\n", - " 0.0\n", - " 98144\n", - " 47.5941\n", - " -122.299\n", - " 1020\n", - " 1357\n", - " \n", " \n", "\n", - "

21597 rows × 21 columns

\n", + "

21596 rows × 21 columns

\n", "" ], "text/plain": [ @@ -524,764 +501,373 @@ "[21597 rows x 21 columns]" ] }, -Clyde - "execution_count": 18, - - "execution_count": 3, - main + { + "cells": [ + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "load_data('data/kc_house_data.csv') # Assuming 'data' folder is in the same directory\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The dataset contains 21 columns, each representing a distinct feature, and 21,597 rows, with each row corresponding to a specific house sale entry.\n", + "\n", + "The dataset contains a mix of data types, including integers (int64), floating-point numbers (float64), and objects (strings). For instance, numerical features such as bedrooms, bathrooms, and sqft_living are represented as integers or floating-point numbers to facilitate mathematical computations, while categorical features like waterfront and view are stored as objects to accommodate text-based categories." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 19, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The dataset contains 21597 houses with 21 features\n", + "\n", + "Columns and their data types:\n", + "id: int64\n", + "date: object\n", + "price: float64\n", + "bedrooms: int64\n", + "bathrooms: float64\n", + "sqft_living: int64\n", + "sqft_lot: int64\n", + "floors: float64\n", + "waterfront: object\n", + "view: object\n", + "condition: object\n", + "grade: object\n", + "sqft_above: int64\n", + "sqft_basement: object\n", + "yr_built: int64\n", + "yr_renovated: float64\n", + "zipcode: int64\n", + "lat: float64\n", + "long: float64\n", + "sqft_living15: int64\n", + "sqft_lot15: int64\n", + "\n" + ] + } + ], + "source": [ + "kings_data = load_data('data/kc_house_data.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "# create a function that takes in a column and returns the column statistics as a dictionary\n", + "def descriptive_analytics(column):\n", + " stats_dict = column.describe().to_dict()\n", + " \n", + " print(\"Descriptive Statistics for Column '{}':\".format(column.name))\n", + " print(\"The count of the column is:\", stats_dict['count'])\n", + " print(\"The mean of the column is:\", stats_dict['mean'])\n", + " print(\"The standard deviation of the column is:\", stats_dict['std'])\n", + " print(\"The minimum value of the column is:\", stats_dict['min'])\n", + " print(\"The 25th percentile of the column is:\", stats_dict['25%'])\n", + " print(\"The median of the column is:\", stats_dict['50%'])\n", + " print(\"The 75th percentile of the column is:\", stats_dict['75%'])\n", + " print(\"The maximum value of the column is:\", stats_dict['max'])" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Descriptive Statistics for 
Column 'price':\n", + "The count of the column is: 21597.0\n", + "The mean of the column is: 540296.5735055795\n", + "The standard deviation of the column is: 367368.1401013936\n", + "The minimum value of the column is: 78000.0\n", + "The 25th percentile of the column is: 322000.0\n", + "The median of the column is: 450000.0\n", + "The 75th percentile of the column is: 645000.0\n", + "The maximum value of the column is: 7700000.0\n" + ] + } + ], + "source": [ + "descriptive_analytics(kings_data['price'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see that the maximum price of a house is $7,700,000 and the minimum price is $78,000.\n", + "\n", + "There are 21,597 prices regarding the houses in the dataset.\n", + "\n", + "The average price of a house is $540,296.57." + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5 + } + "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "load_data('data/kc_house_data.csv') # Assuming 'data' folder is in the same directory\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The dataset contains 21 columns, each representing a distinct feature, and 21,597 rows, with each row corresponding to a specific house sale entry.\n", - "\n", - "The dataset contains a mix of data types, including integers (int64), floating-point numbers (float64), and objects (strings). 
For instance, numerical features such as bedrooms, bathrooms, and sqft_living are represented as integers or floating-point numbers to facilitate mathematical computations, while categorical features like waterfront and view are stored as objects to accommodate text-based categories." - ] - }, - { - "cell_type": "code", - Clyde - "execution_count": 19, - - "execution_count": 4, - main - "metadata": {}, - "outputs": [ + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 21597 entries, 0 to 21596\n", + "Data columns (total 21 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 id 21597 non-null int64 \n", + " 1 date 21597 non-null object \n", + " 2 price 21597 non-null float64\n", + " 3 bedrooms 21597 non-null int64 \n", + " 4 bathrooms 21597 non-null float64\n", + " 5 sqft_living 21597 non-null int64 \n", + " 6 sqft_lot 21597 non-null int64 \n", + " 7 floors 21597 non-null float64\n", + " 8 waterfront 19221 non-null object \n", + " 9 view 21534 non-null object \n", + " 10 condition 21597 non-null object \n", + " 11 grade 21597 non-null object \n", + " 12 sqft_above 21597 non-null int64 \n", + " 13 sqft_basement 21597 non-null object \n", + " 14 yr_built 21597 non-null int64 \n", + " 15 yr_renovated 17755 non-null float64\n", + " 16 zipcode 21597 non-null int64 \n", + " 17 lat 21597 non-null float64\n", + " 18 long 21597 non-null float64\n", + " 19 sqft_living15 21597 non-null int64 \n", + " 20 sqft_lot15 21597 non-null int64 \n", + "dtypes: float64(6), int64(9), object(6)\n", + "memory usage: 3.5+ MB\n" + ] + } + ], + "source": [ + "kings_data.info()" + ] + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "The dataset contains 21597 houses with 21 features\n", + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "def identify_issues(dataset):\n", + " # Identify missing values as a percentage of the whole 
dataset\n", + " missing_values = (dataset.isnull().sum())/len(dataset) * 100\n", "\n", - "Columns and their data types:\n", - "id: int64\n", - "date: object\n", - "price: float64\n", - "bedrooms: int64\n", - "bathrooms: float64\n", - "sqft_living: int64\n", - "sqft_lot: int64\n", - "floors: float64\n", - "waterfront: object\n", - "view: object\n", - "condition: object\n", - "grade: object\n", - "sqft_above: int64\n", - "sqft_basement: object\n", - "yr_built: int64\n", - "yr_renovated: float64\n", - "zipcode: int64\n", - "lat: float64\n", - "long: float64\n", - "sqft_living15: int64\n", - "sqft_lot15: int64\n", - "\n" + " # Identify duplicates\n", + " duplicates = dataset.duplicated().sum()\n", + " \n", + " #return a dictionary \n", + " return {'duplicates': duplicates,\n", + " 'missing values': missing_values.round(2)} \n" ] - } - ], - "source": [ - "kings_data = load_data('data/kc_house_data.csv')" - ] - }, - { - "cell_type": "code", - Clyde - "execution_count": 20, - - "execution_count": 5, - main - "metadata": {}, - "outputs": [], - "source": [ - "#create a function that takes in a column and returns the column statistics as a dictionary\n", - "def descriptive_analytics(column):\n", - " stats_dict = column.describe().to_dict()\n", - " \n", - " print(\"Descriptive Statistics for Column '{}':\".format(column.name))\n", - " print(\"The count of the column is:\", stats_dict['count'])\n", - " print(\"The mean of the column is:\", stats_dict['mean'])\n", - " print(\"The standard deviation of the column is:\", stats_dict['std'])\n", - " print(\"The minimum value of the column is:\", stats_dict['min'])\n", - " print(\"The 25th percentile of the column is:\", stats_dict['25%'])\n", - " print(\"The median of the column is:\", stats_dict['50%'])\n", - " print(\"The 75th percentile of the column is:\", stats_dict['75%'])\n", - " print(\"The maximum value of the column is:\", stats_dict['max'])" - ] - }, - Clyde - { - "cell_type": "code", - "execution_count": 21, - 
"metadata": {}, - "outputs": [ + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "Descriptive Statistics for Column 'price':\n", - "The count of the column is: 21597.0\n", - "The mean of the column is: 540296.5735055795\n", - "The standard deviation of the column is: 367368.1401013936\n", - "The minimum value of the column is: 78000.0\n", - "The 25th percentile of the column is: 322000.0\n", - "The median of the column is: 450000.0\n", - "The 75th percentile of the column is: 645000.0\n", - "The maximum value of the column is: 7700000.0\n" + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'duplicates': 0,\n", + " 'missing values': id 0.00\n", + " date 0.00\n", + " price 0.00\n", + " bedrooms 0.00\n", + " bathrooms 0.00\n", + " sqft_living 0.00\n", + " sqft_lot 0.00\n", + " floors 0.00\n", + " waterfront 11.00\n", + " view 0.29\n", + " condition 0.00\n", + " grade 0.00\n", + " sqft_above 0.00\n", + " sqft_basement 0.00\n", + " yr_built 0.00\n", + " yr_renovated 17.79\n", + " zipcode 0.00\n", + " lat 0.00\n", + " long 0.00\n", + " sqft_living15 0.00\n", + " sqft_lot15 0.00\n", + " dtype: float64}" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "identify_issues(kings_data)" ] - } - ], - "source": [ - "descriptive_analytics(kings_data['price'])" - - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "Descriptive Statistics for Column 'price':\n", - "The count of the column is: 21597.0\n", - "The mean of the column is: 540296.5735055795\n", - "The standard deviation of the column is: 367368.1401013936\n", - "The minimum value of the column is: 78000.0\n", - "The 25th percentile of the column is: 322000.0\n", - "The median of the column is: 450000.0\n", - "The 75th percentile of the column is: 645000.0\n", - "The maximum value 
of the column is: 7700000.0\n" + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "house_data_clean = kings_data.copy()" ] - } - ], - "source": [ - "descriptive_analytics(kings_data['price'])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can see that the maximum price of a house is 7700000 dollars and the minimum price is 78000 dollars\n", - "\n", - "There are 21597 prices regarding to the houses in the dataset\n", - "\n", - "Average price of a house is 540296.57 dollars" - main - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - Clyde - "We can see that the maximum price of a house is 7700000 dollars and the minimum price is 78000 dollars\n", - "\n", - "There are 21597 prices regarding to the houses in the dataset\n", - "\n", - "Average price of a house is 540296.57 dollars" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Data Preperation\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - - "## Data Preperation\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - main - "metadata": {}, - "outputs": [ + }, { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "RangeIndex: 21597 entries, 0 to 21596\n", - "Data columns (total 21 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 id 21597 non-null int64 \n", - " 1 date 21597 non-null object \n", - " 2 price 21597 non-null float64\n", - " 3 bedrooms 21597 non-null int64 \n", - " 4 bathrooms 21597 non-null float64\n", - " 5 sqft_living 21597 non-null int64 \n", - " 6 sqft_lot 21597 non-null int64 \n", - " 7 floors 21597 non-null float64\n", - " 8 waterfront 19221 non-null object \n", - " 9 view 21534 non-null object \n", - " 10 condition 21597 non-null object \n", - " 11 grade 21597 non-null object \n", - " 12 sqft_above 21597 non-null int64 \n", - " 13 sqft_basement 21597 non-null 
object \n", - " 14 yr_built 21597 non-null int64 \n", - " 15 yr_renovated 17755 non-null float64\n", - " 16 zipcode 21597 non-null int64 \n", - " 17 lat 21597 non-null float64\n", - " 18 long 21597 non-null float64\n", - " 19 sqft_living15 21597 non-null int64 \n", - " 20 sqft_lot15 21597 non-null int64 \n", - "dtypes: float64(6), int64(9), object(6)\n", - "memory usage: 3.5+ MB\n" + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Changing the date to date time\n", + "house_data_clean['date'] = pd.to_datetime(house_data_clean['date'])\n", + "\n", + "# Extracting only the year from the column Date\n", + "house_data_clean.date = house_data_clean['date'].dt.year\n", + "\n", + "# Changing the dates for the year built \n", + "house_data_clean['yr_built'] = pd.to_datetime(house_data_clean['yr_built'],format='%Y').dt.year\n" ] - } - ], - "source": [ - "kings_data.info()" - ] - }, - Clyde - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [], - "source": [ - "def identify_issues(dataset):\n", - " # Identify missing values as a percentage of the whole dataset\n", - " missing_values = (dataset.isnull().sum())/len(dataset) * 100\n", - "\n", - " # Identify duplicates\n", - " duplicates = dataset.duplicated().sum()\n", - " \n", - " #return a dictionary \n", - " return {'duplicates': duplicates,\n", - " 'missing values': missing_values.round(2)} \n" - - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "def identify_issues(dataset):\n", - " # Identify missing values as a percentage of the whole dataset\n", - " missing_values = (dataset.isnull().sum())/len(dataset) * 100\n", - "\n", - " # Identify duplicates\n", - " duplicates = dataset.duplicated().sum()\n", - " \n", - " #return a dictionary \n", - " return {'duplicates': duplicates,\n", - " 'missing values': missing_values.round(2)} \n" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - 
"metadata": {}, - "outputs": [ + }, { - "data": { - "text/plain": [ - "{'duplicates': 0,\n", - " 'missing values': id 0.00\n", - " date 0.00\n", - " price 0.00\n", - " bedrooms 0.00\n", - " bathrooms 0.00\n", - " sqft_living 0.00\n", - " sqft_lot 0.00\n", - " floors 0.00\n", - " waterfront 11.00\n", - " view 0.29\n", - " condition 0.00\n", - " grade 0.00\n", - " sqft_above 0.00\n", - " sqft_basement 0.00\n", - " yr_built 0.00\n", - " yr_renovated 17.79\n", - " zipcode 0.00\n", - " lat 0.00\n", - " long 0.00\n", - " sqft_living15 0.00\n", - " sqft_lot15 0.00\n", - " dtype: float64}" - ] - }, - "execution_count": 9, + "cell_type": "code", + "execution_count": 6, "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "identify_issues(kings_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The examination indicates that there are no duplicate entries within the dataset, ensuring the integrity of the records. However, attention is warranted to address missing values present in certain columns. Specifically, the 'waterfront' feature exhibits 11% of null values, representing a negligible portion of the dataset. 
Similarly, the 'yr_renovated' feature shows a relatively higher percentage of missing values, accounting for approximately 17.79% of the dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Before making changes make a copy instead of overwriting data" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "house_data_clean = kings_data.copy()" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "# Changing the date to date time\n", - "house_data_clean['date'] = pd.to_datetime(house_data_clean['date'])\n", - "\n", - "# Extracting only the year from the column Date\n", - "house_data_clean.date = house_data_clean['date'].dt.year\n", - "\n", - "# Changing the dates for the year built \n", - "house_data_clean['yr_built'] = pd.to_datetime(house_data_clean['yr_built'],format='%Y').dt.year\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The above code converts the 'date' column data to only contain the year the house was sold, for the purpose of analysis we will use only the year since the changes month by month will be minor." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Dealing with the missing values" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "def missing_values(dataset):\n", - " # drop the rows from views\n", - " dataset.dropna(subset=['view'],inplace=True)\n", - "\n", - " # Filling the NaN values for waterfront with NO\n", - " dataset.waterfront.fillna('NO',inplace=True)\n", - " \n", - " # Dropping the yr_renovated column \n", - " dataset.drop('yr_renovated',axis=1,inplace=True)" - main - ] - }, - { - "cell_type": "code", - Clyde - "execution_count": 24, - - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "missing_values(house_data_clean)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "'yr_renovated' has the highest percentage of NaN values 17%. This will be dropped since it will not be used within our model inline with the business problem.\n", - "\n", - "'Waterfront' feature has 11% null values, this was filled with NO on the assumption that these cells were not filled since they lacked waterfronts\n", - "\n", - "For the 'View' column, the null values were dropped by row since the overall percentage impact would be minute" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - main - "metadata": {}, - "outputs": [ + "outputs": [], + "source": [ + "def missing_values(dataset):\n", + " # drop the rows from views\n", + " dataset.dropna(subset=['view'],inplace=True)\n", + "\n", + " # Filling the NaN values for waterfront with NO\n", + " dataset.waterfront.fillna('NO',inplace=True)\n", + " \n", + " # Dropping the yr_renovated column \n", + " dataset.drop('yr_renovated',axis=1,inplace=True)" + ] + }, { - "data": { - "text/plain": [ - Clyde - "{'duplicates': 0,\n", - " 'missing values': id 0.00\n", - " date 0.00\n", - " price 0.00\n", - " bedrooms 0.00\n", - " bathrooms 0.00\n", - " sqft_living 0.00\n", - " sqft_lot 0.00\n", 
- " floors 0.00\n", - " waterfront 11.00\n", - " view 0.29\n", - " condition 0.00\n", - " grade 0.00\n", - " sqft_above 0.00\n", - " sqft_basement 0.00\n", - " yr_built 0.00\n", - " yr_renovated 17.79\n", - " zipcode 0.00\n", - " lat 0.00\n", - " long 0.00\n", - " sqft_living15 0.00\n", - " sqft_lot15 0.00\n", - " dtype: float64}" - ] - }, - "execution_count": 24, - - "{'duplicates': 2,\n", - " 'missing values': id 0.0\n", - " date 0.0\n", - " price 0.0\n", - " bedrooms 0.0\n", - " bathrooms 0.0\n", - " sqft_living 0.0\n", - " sqft_lot 0.0\n", - " floors 0.0\n", - " waterfront 0.0\n", - " view 0.0\n", - " condition 0.0\n", - " grade 0.0\n", - " sqft_above 0.0\n", - " sqft_basement 0.0\n", - " yr_built 0.0\n", - " zipcode 0.0\n", - " lat 0.0\n", - " long 0.0\n", - " sqft_living15 0.0\n", - " sqft_lot15 0.0\n", - " dtype: float64}" - ] - }, - "execution_count": 14, - main + "cell_type": "code", + "execution_count": 7, "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - Clyde - "identify_issues(kings_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The examination indicates that there are no duplicate entries within the dataset, ensuring the integrity of the records. However, attention is warranted to address missing values present in certain columns. Specifically, the 'waterfront' feature exhibits 11% of null values, representing a negligible portion of the dataset. 
Similarly, the 'yr_renovated' feature shows a relatively higher percentage of missing values, accounting for approximately 17.79% of the dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Before making changes make a copy instead of overwriting data" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [], - "source": [ - "house_data_clean = kings_data.copy()" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [], - "source": [ - "# Changing the date to date time\n", - "house_data_clean['date'] = pd.to_datetime(house_data_clean['date'])\n", - "\n", - "# Extracting only the year from the column Date\n", - "house_data_clean.date = house_data_clean['date'].dt.year\n", - "\n", - "# Changing the dates for the year built \n", - "house_data_clean['yr_built'] = pd.to_datetime(house_data_clean['yr_built'],format='%Y').dt.year\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The above code converts the 'date' column data to only contain the year the house was sold, for the purpose of analysis we will use only the year since the changes month by month will be minor." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Dealing with the missing values" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [], - "source": [ - "def missing_values(dataset):\n", - " # drop the rows from views\n", - " dataset.dropna(subset=['view'],inplace=True)\n", - "\n", - " # Filling the NaN values for waterfront with NO\n", - " dataset.waterfront.fillna('NO',inplace=True)\n", - " \n", - " # Dropping the yr_renovated column \n", - " dataset.drop('yr_renovated',axis=1,inplace=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [], - "source": [ - "missing_values(house_data_clean)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "'yr_renovated' has the highest percentage of NaN values 17%. This will be dropped since it will not be used within our model inline with the business problem.\n", - "\n", - "'Waterfront' feature has 11% null values, this was filled with NO on the assumption that these cells were not filled since they lacked waterfronts\n", - "\n", - "For the 'View' column, the null values were dropped by row since the overall percentage impact would be minute" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [ + "outputs": [], + "source": [ + "missing_values(house_data_clean)" + ] + }, { - "data": { - "text/plain": [ - "{'duplicates': 2,\n", - " 'missing values': id 0.0\n", - " date 0.0\n", - " price 0.0\n", - " bedrooms 0.0\n", - " bathrooms 0.0\n", - " sqft_living 0.0\n", - " sqft_lot 0.0\n", - " floors 0.0\n", - " waterfront 0.0\n", - " view 0.0\n", - " condition 0.0\n", - " grade 0.0\n", - " sqft_above 0.0\n", - " sqft_basement 0.0\n", - " yr_built 0.0\n", - " zipcode 0.0\n", - " lat 0.0\n", - " long 0.0\n", - " sqft_living15 0.0\n", - " sqft_lot15 0.0\n", - " dtype: float64}" - ] - }, - "execution_count": 29, + "cell_type": "code", + "execution_count": 8, 
"metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "identify_issues(house_data_clean)" - ] - }, - { - "cell_type": "code", - "execution_count": 30, - - "identify_issues(house_data_clean)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - main - "metadata": {}, - "outputs": [ + "outputs": [ + { + "data": { + "text/plain": [ + "{'duplicates': 0,\n", + " 'missing values': id 0.00\n", + " date 0.00\n", + " price 0.00\n", + " bedrooms 0.00\n", + " bathrooms 0.00\n", + " sqft_living 0.00\n", + " sqft_lot 0.00\n", + " floors 0.00\n", + " waterfront 0.00\n", + " view 0.00\n", + " condition 0.00\n", + " grade 0.00\n", + " sqft_above 0.00\n", + " sqft_basement 0.00\n", + " yr_built 0.00\n", + " zipcode 0.00\n", + " lat 0.00\n", + " long 0.00\n", + " sqft_living15 0.00\n", + " sqft_lot15 0.00\n", + " dtype: float64}" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "identify_issues(house_data_clean)" + ] + }, { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
iddatepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontviewconditiongradesqft_abovesqft_basementyr_builtzipcodelatlongsqft_living15sqft_lot15
394718250690312014550000.041.75241084472.0NOGOODGood8 Good2060350.019369807447.6499-122.088252014789
2003886489001102014555000.032.50194032112.0NONONEAverage8 Good19400.020099802747.5644-122.09318803078
\n", - "
" - ], - "text/plain": [ - " id date price bedrooms bathrooms sqft_living sqft_lot \\\n", - "3947 1825069031 2014 550000.0 4 1.75 2410 8447 \n", - "20038 8648900110 2014 555000.0 3 2.50 1940 3211 \n", - "\n", - " floors waterfront view condition grade sqft_above sqft_basement \\\n", - "3947 2.0 NO GOOD Good 8 Good 2060 350.0 \n", - "20038 2.0 NO NONE Average 8 Good 1940 0.0 \n", - "\n", - " yr_built zipcode lat long sqft_living15 sqft_lot15 \n", - "3947 1936 98074 47.6499 -122.088 2520 14789 \n", - "20038 2009 98027 47.5644 -122.093 1880 3078 " - ] - }, - Clyde - "execution_count": 30, - - "execution_count": 15, - main + "cell_type": "code", + "execution_count": 9, "metadata": {}, - "output_type": "execute_result" + "outputs": [], + "source": [ + "# Checking for duplicates in the dataset\n", + "house_data_clean[house_data_clean.duplicated()]" + ] } ], - "source": [ - "house_data_clean[house_data_clean.duplicated()]" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.11" + } }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - } - }, - "nbformat": 4, - "nbformat_minor": 2 -} + "nbformat": 4, + "nbformat_minor": 5 + } \ No newline at end of file From 90359262269e6daaee8bbcd4a12e41f0a9f9a634 Mon Sep 17 00:00:00 2001 From: clydeochieng <107258512+clydeochieng@users.noreply.github.com> Date: Wed, 1 May 2024 11:09:22 +0300 Subject: [PATCH 29/42] 
Update student.ipynb --- student.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/student.ipynb b/student.ipynb index 247e570c..40941025 100644 --- a/student.ipynb +++ b/student.ipynb @@ -501,7 +501,7 @@ "[21597 rows x 21 columns]" ] }, - { + "cells": [ { "cell_type": "code", From da626e5a98ef9ce6374de659e6f8b27748dc2cd0 Mon Sep 17 00:00:00 2001 From: clydeochieng <107258512+clydeochieng@users.noreply.github.com> Date: Wed, 1 May 2024 11:13:05 +0300 Subject: [PATCH 30/42] Update student.ipynb --- student.ipynb | 451 +++++++++++++++++++++++++------------------------- 1 file changed, 229 insertions(+), 222 deletions(-) diff --git a/student.ipynb b/student.ipynb index 40941025..ab38df0d 100644 --- a/student.ipynb +++ b/student.ipynb @@ -645,229 +645,236 @@ "nbformat_minor": 5 } - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "RangeIndex: 21597 entries, 0 to 21596\n", - "Data columns (total 21 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 id 21597 non-null int64 \n", - " 1 date 21597 non-null object \n", - " 2 price 21597 non-null float64\n", - " 3 bedrooms 21597 non-null int64 \n", - " 4 bathrooms 21597 non-null float64\n", - " 5 sqft_living 21597 non-null int64 \n", - " 6 sqft_lot 21597 non-null int64 \n", - " 7 floors 21597 non-null float64\n", - " 8 waterfront 19221 non-null object \n", - " 9 view 21534 non-null object \n", - " 10 condition 21597 non-null object \n", - " 11 grade 21597 non-null object \n", - " 12 sqft_above 21597 non-null int64 \n", - " 13 sqft_basement 21597 non-null object \n", - " 14 yr_built 21597 non-null int64 \n", - " 15 yr_renovated 17755 non-null float64\n", - " 16 zipcode 21597 non-null int64 \n", - " 17 lat 21597 non-null float64\n", - " 18 long 21597 non-null float64\n", - " 19 sqft_living15 21597 non-null int64 \n", - " 20 sqft_lot15 21597 non-null int64 \n", - "dtypes: float64(6), int64(9), 
object(6)\n", - "memory usage: 3.5+ MB\n" - ] - } - ], - "source": [ - "kings_data.info()" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "def identify_issues(dataset):\n", - " # Identify missing values as a percentage of the whole dataset\n", - " missing_values = (dataset.isnull().sum())/len(dataset) * 100\n", - "\n", - " # Identify duplicates\n", - " duplicates = dataset.duplicated().sum()\n", - " \n", - " #return a dictionary \n", - " return {'duplicates': duplicates,\n", - " 'missing values': missing_values.round(2)} \n" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'duplicates': 0,\n", - " 'missing values': id 0.00\n", - " date 0.00\n", - " price 0.00\n", - " bedrooms 0.00\n", - " bathrooms 0.00\n", - " sqft_living 0.00\n", - " sqft_lot 0.00\n", - " floors 0.00\n", - " waterfront 11.00\n", - " view 0.29\n", - " condition 0.00\n", - " grade 0.00\n", - " sqft_above 0.00\n", - " sqft_basement 0.00\n", - " yr_built 0.00\n", - " yr_renovated 17.79\n", - " zipcode 0.00\n", - " lat 0.00\n", - " long 0.00\n", - " sqft_living15 0.00\n", - " sqft_lot15 0.00\n", - " dtype: float64}" + { + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 21597 entries, 0 to 21596\n", + "Data columns (total 21 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 id 21597 non-null int64 \n", + " 1 date 21597 non-null object \n", + " 2 price 21597 non-null float64\n", + " 3 bedrooms 21597 non-null int64 \n", + " 4 bathrooms 21597 non-null float64\n", + " 5 sqft_living 21597 non-null int64 \n", + " 6 sqft_lot 21597 non-null int64 \n", + " 7 floors 21597 non-null float64\n", + " 8 waterfront 19221 non-null object \n", + " 9 view 21534 non-null object \n", + " 10 condition 
21597 non-null object \n", + " 11 grade 21597 non-null object \n", + " 12 sqft_above 21597 non-null int64 \n", + " 13 sqft_basement 21597 non-null object \n", + " 14 yr_built 21597 non-null int64 \n", + " 15 yr_renovated 17755 non-null float64\n", + " 16 zipcode 21597 non-null int64 \n", + " 17 lat 21597 non-null float64\n", + " 18 long 21597 non-null float64\n", + " 19 sqft_living15 21597 non-null int64 \n", + " 20 sqft_lot15 21597 non-null int64 \n", + "dtypes: float64(6), int64(9), object(6)\n", + "memory usage: 3.5+ MB\n" + ] + } + ], + "source": [ + "kings_data.info()" ] }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "identify_issues(kings_data)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "house_data_clean = kings_data.copy()" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "# Changing the date to date time\n", - "house_data_clean['date'] = pd.to_datetime(house_data_clean['date'])\n", - "\n", - "# Extracting only the year from the column Date\n", - "house_data_clean.date = house_data_clean['date'].dt.year\n", - "\n", - "# Changing the dates for the year built \n", - "house_data_clean['yr_built'] = pd.to_datetime(house_data_clean['yr_built'],format='%Y').dt.year\n" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "def missing_values(dataset):\n", - " # drop the rows from views\n", - " dataset.dropna(subset=['view'],inplace=True)\n", - "\n", - " # Filling the NaN values for waterfront with NO\n", - " dataset.waterfront.fillna('NO',inplace=True)\n", - " \n", - " # Dropping the yr_renovated column \n", - " dataset.drop('yr_renovated',axis=1,inplace=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "missing_values(house_data_clean)" - ] - }, - { - 
"cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'duplicates': 0,\n", - " 'missing values': id 0.00\n", - " date 0.00\n", - " price 0.00\n", - " bedrooms 0.00\n", - " bathrooms 0.00\n", - " sqft_living 0.00\n", - " sqft_lot 0.00\n", - " floors 0.00\n", - " waterfront 0.00\n", - " view 0.00\n", - " condition 0.00\n", - " grade 0.00\n", - " sqft_above 0.00\n", - " sqft_basement 0.00\n", - " yr_built 0.00\n", - " zipcode 0.00\n", - " lat 0.00\n", - " long 0.00\n", - " sqft_living15 0.00\n", - " sqft_lot15 0.00\n", - " dtype: float64}" + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "def identify_issues(dataset):\n", + " # Identify missing values as a percentage of the whole dataset\n", + " missing_values = (dataset.isnull().sum())/len(dataset) * 100\n", + "\n", + " # Identify duplicates\n", + " duplicates = dataset.duplicated().sum()\n", + " \n", + " #return a dictionary \n", + " return {'duplicates': duplicates,\n", + " 'missing values': missing_values.round(2)} \n" ] }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "identify_issues(house_data_clean)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "# Checking for duplicates in the dataset\n", - "house_data_clean[house_data_clean.duplicated()]" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.11" - } - }, - "nbformat": 4, - "nbformat_minor": 5 - } \ No newline at end of file + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + 
"data": { + "text/plain": [ + "{'duplicates': 0,\n", + " 'missing values': {\n", + " 'id': 0.0,\n", + " 'date': 0.0,\n", + " 'price': 0.0,\n", + " 'bedrooms': 0.0,\n", + " 'bathrooms': 0.0,\n", + " 'sqft_living': 0.0,\n", + " 'sqft_lot': 0.0,\n", + " 'floors': 0.0,\n", + " 'waterfront': 11.0,\n", + " 'view': 0.29,\n", + " 'condition': 0.0,\n", + " 'grade': 0.0,\n", + " 'sqft_above': 0.0,\n", + " 'sqft_basement': 0.0,\n", + " 'yr_built': 0.0,\n", + " 'yr_renovated': 17.79,\n", + " 'zipcode': 0.0,\n", + " 'lat': 0.0,\n", + " 'long': 0.0,\n", + " 'sqft_living15': 0.0,\n", + " 'sqft_lot15': 0.0\n", + " }}" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "identify_issues(kings_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "house_data_clean = kings_data.copy()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Changing the date to date time\n", + "house_data_clean['date'] = pd.to_datetime(house_data_clean['date'])\n", + "\n", + "# Extracting only the year from the column Date\n", + "house_data_clean.date = house_data_clean['date'].dt.year\n", + "\n", + "# Changing the dates for the year built \n", + "house_data_clean['yr_built'] = pd.to_datetime(house_data_clean['yr_built'],format='%Y').dt.year\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "def missing_values(dataset):\n", + " # drop the rows from views\n", + " dataset.dropna(subset=['view'],inplace=True)\n", + "\n", + " # Filling the NaN values for waterfront with NO\n", + " dataset.waterfront.fillna('NO',inplace=True)\n", + " \n", + " # Dropping the yr_renovated column \n", + " dataset.drop('yr_renovated',axis=1,inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + 
"missing_values(house_data_clean)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'duplicates': 0,\n", + " 'missing values': id 0.00\n", + " date 0.00\n", + " price 0.00\n", + " bedrooms 0.00\n", + " bathrooms 0.00\n", + " sqft_living 0.00\n", + " sqft_lot 0.00\n", + " floors 0.00\n", + " waterfront 0.00\n", + " view 0.00\n", + " condition 0.00\n", + " grade 0.00\n", + " sqft_above 0.00\n", + " sqft_basement 0.00\n", + " yr_built 0.00\n", + " zipcode 0.00\n", + " lat 0.00\n", + " long 0.00\n", + " sqft_living15 0.00\n", + " sqft_lot15 0.00\n", + " dtype: float64}" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "identify_issues(house_data_clean)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# Checking for duplicates in the dataset\n", + "house_data_clean[house_data_clean.duplicated()]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 + } + \ No newline at end of file From 48e99978650e729af9d4901d484b9c7babe585fb Mon Sep 17 00:00:00 2001 From: clydeochieng <107258512+clydeochieng@users.noreply.github.com> Date: Wed, 1 May 2024 11:14:21 +0300 Subject: [PATCH 31/42] Update student.ipynb --- student.ipynb | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/student.ipynb b/student.ipynb index ab38df0d..b8e4f974 100644 --- a/student.ipynb +++ b/student.ipynb @@ -644,8 +644,7 @@ "nbformat": 4, "nbformat_minor": 5 } - - { + { "cells": [ { "cell_type": "code", From 
571eda3d3a01ed7d6f054fcc00212bce4d186755 Mon Sep 17 00:00:00 2001 From: clydeochieng <107258512+clydeochieng@users.noreply.github.com> Date: Wed, 1 May 2024 11:30:59 +0300 Subject: [PATCH 32/42] Update student.ipynb --- student.ipynb | 666 ++++++++++++++++++++++++-------------------------- 1 file changed, 323 insertions(+), 343 deletions(-) diff --git a/student.ipynb b/student.ipynb index b8e4f974..0893adb3 100644 --- a/student.ipynb +++ b/student.ipynb @@ -533,347 +533,327 @@ "output_type": "stream", "text": [ "The dataset contains 21597 houses with 21 features\n", - "\n", - "Columns and their data types:\n", - "id: int64\n", - "date: object\n", - "price: float64\n", - "bedrooms: int64\n", - "bathrooms: float64\n", - "sqft_living: int64\n", - "sqft_lot: int64\n", - "floors: float64\n", - "waterfront: object\n", - "view: object\n", - "condition: object\n", - "grade: object\n", - "sqft_above: int64\n", - "sqft_basement: object\n", - "yr_built: int64\n", - "yr_renovated: float64\n", - "zipcode: int64\n", - "lat: float64\n", - "long: float64\n", - "sqft_living15: int64\n", - "sqft_lot15: int64\n", - "\n" - ] - } - ], - "source": [ - "kings_data = load_data('data/kc_house_data.csv')" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [], - "source": [ - "# create a function that takes in a column and returns the column statistics as a dictionary\n", - "def descriptive_analytics(column):\n", - " stats_dict = column.describe().to_dict()\n", - " \n", - " print(\"Descriptive Statistics for Column '{}':\".format(column.name))\n", - " print(\"The count of the column is:\", stats_dict['count'])\n", - " print(\"The mean of the column is:\", stats_dict['mean'])\n", - " print(\"The standard deviation of the column is:\", stats_dict['std'])\n", - " print(\"The minimum value of the column is:\", stats_dict['min'])\n", - " print(\"The 25th percentile of the column is:\", stats_dict['25%'])\n", - " print(\"The median of the column is:\", 
stats_dict['50%'])\n", - " print(\"The 75th percentile of the column is:\", stats_dict['75%'])\n", - " print(\"The maximum value of the column is:\", stats_dict['max'])" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Descriptive Statistics for Column 'price':\n", - "The count of the column is: 21597.0\n", - "The mean of the column is: 540296.5735055795\n", - "The standard deviation of the column is: 367368.1401013936\n", - "The minimum value of the column is: 78000.0\n", - "The 25th percentile of the column is: 322000.0\n", - "The median of the column is: 450000.0\n", - "The 75th percentile of the column is: 645000.0\n", - "The maximum value of the column is: 7700000.0\n" - ] - } - ], - "source": [ - "descriptive_analytics(kings_data['price'])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can see that the maximum price of a house is $7,700,000 and the minimum price is $78,000.\n", - "\n", - "There are 21,597 prices regarding the houses in the dataset.\n", - "\n", - "The average price of a house is $540,296.57." 
- ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.1" - } - }, - "nbformat": 4, - "nbformat_minor": 5 - } - { - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "RangeIndex: 21597 entries, 0 to 21596\n", - "Data columns (total 21 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 id 21597 non-null int64 \n", - " 1 date 21597 non-null object \n", - " 2 price 21597 non-null float64\n", - " 3 bedrooms 21597 non-null int64 \n", - " 4 bathrooms 21597 non-null float64\n", - " 5 sqft_living 21597 non-null int64 \n", - " 6 sqft_lot 21597 non-null int64 \n", - " 7 floors 21597 non-null float64\n", - " 8 waterfront 19221 non-null object \n", - " 9 view 21534 non-null object \n", - " 10 condition 21597 non-null object \n", - " 11 grade 21597 non-null object \n", - " 12 sqft_above 21597 non-null int64 \n", - " 13 sqft_basement 21597 non-null object \n", - " 14 yr_built 21597 non-null int64 \n", - " 15 yr_renovated 17755 non-null float64\n", - " 16 zipcode 21597 non-null int64 \n", - " 17 lat 21597 non-null float64\n", - " 18 long 21597 non-null float64\n", - " 19 sqft_living15 21597 non-null int64 \n", - " 20 sqft_lot15 21597 non-null int64 \n", - "dtypes: float64(6), int64(9), object(6)\n", - "memory usage: 3.5+ MB\n" - ] - } - ], - "source": [ - "kings_data.info()" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "def identify_issues(dataset):\n", - " # Identify missing values as a percentage of the whole dataset\n", - " 
missing_values = (dataset.isnull().sum())/len(dataset) * 100\n", - "\n", - " # Identify duplicates\n", - " duplicates = dataset.duplicated().sum()\n", - " \n", - " #return a dictionary \n", - " return {'duplicates': duplicates,\n", - " 'missing values': missing_values.round(2)} \n" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'duplicates': 0,\n", - " 'missing values': {\n", - " 'id': 0.0,\n", - " 'date': 0.0,\n", - " 'price': 0.0,\n", - " 'bedrooms': 0.0,\n", - " 'bathrooms': 0.0,\n", - " 'sqft_living': 0.0,\n", - " 'sqft_lot': 0.0,\n", - " 'floors': 0.0,\n", - " 'waterfront': 11.0,\n", - " 'view': 0.29,\n", - " 'condition': 0.0,\n", - " 'grade': 0.0,\n", - " 'sqft_above': 0.0,\n", - " 'sqft_basement': 0.0,\n", - " 'yr_built': 0.0,\n", - " 'yr_renovated': 17.79,\n", - " 'zipcode': 0.0,\n", - " 'lat': 0.0,\n", - " 'long': 0.0,\n", - " 'sqft_living15': 0.0,\n", - " 'sqft_lot15': 0.0\n", - " }}" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "identify_issues(kings_data)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "house_data_clean = kings_data.copy()" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "# Changing the date to date time\n", - "house_data_clean['date'] = pd.to_datetime(house_data_clean['date'])\n", - "\n", - "# Extracting only the year from the column Date\n", - "house_data_clean.date = house_data_clean['date'].dt.year\n", - "\n", - "# Changing the dates for the year built \n", - "house_data_clean['yr_built'] = pd.to_datetime(house_data_clean['yr_built'],format='%Y').dt.year\n" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "def missing_values(dataset):\n", - " # drop the rows from views\n", - " 
dataset.dropna(subset=['view'],inplace=True)\n", - "\n", - " # Filling the NaN values for waterfront with NO\n", - " dataset.waterfront.fillna('NO',inplace=True)\n", - " \n", - " # Dropping the yr_renovated column \n", - " dataset.drop('yr_renovated',axis=1,inplace=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "missing_values(house_data_clean)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'duplicates': 0,\n", - " 'missing values': id 0.00\n", - " date 0.00\n", - " price 0.00\n", - " bedrooms 0.00\n", - " bathrooms 0.00\n", - " sqft_living 0.00\n", - " sqft_lot 0.00\n", - " floors 0.00\n", - " waterfront 0.00\n", - " view 0.00\n", - " condition 0.00\n", - " grade 0.00\n", - " sqft_above 0.00\n", - " sqft_basement 0.00\n", - " yr_built 0.00\n", - " zipcode 0.00\n", - " lat 0.00\n", - " long 0.00\n", - " sqft_living15 0.00\n", - " sqft_lot15 0.00\n", - " dtype: float64}" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "identify_issues(house_data_clean)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "# Checking for duplicates in the dataset\n", - "house_data_clean[house_data_clean.duplicated()]" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.11" - } - }, - "nbformat": 4, - "nbformat_minor": 5 - } +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "kings_data = load_data('data/kc_house_data.csv')" + ] + 
}, + { + "cell_type": "code", + "execution_count": 20, + "metadata": {}, + "outputs": [], + "source": [ + "# create a function that takes in a column and returns the column statistics as a dictionary\n", + "def descriptive_analytics(column):\n", + " stats_dict = column.describe().to_dict()\n", + " \n", + " print(\"Descriptive Statistics for Column '{}':\".format(column.name))\n", + " print(\"The count of the column is:\", stats_dict['count'])\n", + " print(\"The mean of the column is:\", stats_dict['mean'])\n", + " print(\"The standard deviation of the column is:\", stats_dict['std'])\n", + " print(\"The minimum value of the column is:\", stats_dict['min'])\n", + " print(\"The 25th percentile of the column is:\", stats_dict['25%'])\n", + " print(\"The median of the column is:\", stats_dict['50%'])\n", + " print(\"The 75th percentile of the column is:\", stats_dict['75%'])\n", + " print(\"The maximum value of the column is:\", stats_dict['max'])" + ] + }, + { + "cell_type": "code", + "execution_count": 21, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Descriptive Statistics for Column 'price':\n", + "The count of the column is: 21597.0\n", + "The mean of the column is: 540296.5735055795\n", + "The standard deviation of the column is: 367368.1401013936\n", + "The minimum value of the column is: 78000.0\n", + "The 25th percentile of the column is: 322000.0\n", + "The median of the column is: 450000.0\n", + "The 75th percentile of the column is: 645000.0\n", + "The maximum value of the column is: 7700000.0\n" + ] + } + ], + "source": [ + "descriptive_analytics(kings_data['price'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see that the maximum price of a house is $7,700,000 and the minimum price is $78,000.\n", + "\n", + "There are 21,597 prices regarding the houses in the dataset.\n", + "\n", + "The average price of a house is $540,296.57." 
+ ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.9.1" + } + }, + "nbformat": 4, + "nbformat_minor": 5, +"cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 21597 entries, 0 to 21596\n", + "Data columns (total 21 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 id 21597 non-null int64 \n", + " 1 date 21597 non-null object \n", + " 2 price 21597 non-null float64\n", + " 3 bedrooms 21597 non-null int64 \n", + " 4 bathrooms 21597 non-null float64\n", + " 5 sqft_living 21597 non-null int64 \n", + " 6 sqft_lot 21597 non-null int64 \n", + " 7 floors 21597 non-null float64\n", + " 8 waterfront 19221 non-null object \n", + " 9 view 21534 non-null object \n", + " 10 condition 21597 non-null object \n", + " 11 grade 21597 non-null object \n", + " 12 sqft_above 21597 non-null int64 \n", + " 13 sqft_basement 21597 non-null object \n", + " 14 yr_built 21597 non-null int64 \n", + " 15 yr_renovated 17755 non-null float64\n", + " 16 zipcode 21597 non-null int64 \n", + " 17 lat 21597 non-null float64\n", + " 18 long 21597 non-null float64\n", + " 19 sqft_living15 21597 non-null int64 \n", + " 20 sqft_lot15 21597 non-null int64 \n", + "dtypes: float64(6), int64(9), object(6)\n", + "memory usage: 3.5+ MB\n" + ] + } + ], + "source": [ + "kings_data.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": {}, + "outputs": [], + "source": [ + "def identify_issues(dataset):\n", + " # Identify missing values as a percentage of the whole dataset\n", + " missing_values = 
(dataset.isnull().sum())/len(dataset) * 100\n", + "\n", + " # Identify duplicates\n", + " duplicates = dataset.duplicated().sum()\n", + " \n", + " #return a dictionary \n", + " return {'duplicates': duplicates,\n", + " 'missing values': missing_values.round(2)} \n" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'duplicates': 0,\n", + " 'missing values': {\n", + " 'id': 0.0,\n", + " 'date': 0.0,\n", + " 'price': 0.0,\n", + " 'bedrooms': 0.0,\n", + " 'bathrooms': 0.0,\n", + " 'sqft_living': 0.0,\n", + " 'sqft_lot': 0.0,\n", + " 'floors': 0.0,\n", + " 'waterfront': 11.0,\n", + " 'view': 0.29,\n", + " 'condition': 0.0,\n", + " 'grade': 0.0,\n", + " 'sqft_above': 0.0,\n", + " 'sqft_basement': 0.0,\n", + " 'yr_built': 0.0,\n", + " 'yr_renovated': 17.79,\n", + " 'zipcode': 0.0,\n", + " 'lat': 0.0,\n", + " 'long': 0.0,\n", + " 'sqft_living15': 0.0,\n", + " 'sqft_lot15': 0.0\n", + " }}" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "identify_issues(kings_data)" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [], + "source": [ + "house_data_clean = kings_data.copy()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "# Changing the date to date time\n", + "house_data_clean['date'] = pd.to_datetime(house_data_clean['date'])\n", + "\n", + "# Extracting only the year from the column Date\n", + "house_data_clean.date = house_data_clean['date'].dt.year\n", + "\n", + "# Changing the dates for the year built \n", + "house_data_clean['yr_built'] = pd.to_datetime(house_data_clean['yr_built'],format='%Y').dt.year\n" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [], + "source": [ + "def missing_values(dataset):\n", + " # drop the rows from views\n", + " 
dataset.dropna(subset=['view'],inplace=True)\n", + "\n", + " # Filling the NaN values for waterfront with NO\n", + " dataset.waterfront.fillna('NO',inplace=True)\n", + " \n", + " # Dropping the yr_renovated column \n", + " dataset.drop('yr_renovated',axis=1,inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [], + "source": [ + "missing_values(house_data_clean)" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'duplicates': 0,\n", + " 'missing values': id 0.00\n", + " date 0.00\n", + " price 0.00\n", + " bedrooms 0.00\n", + " bathrooms 0.00\n", + " sqft_living 0.00\n", + " sqft_lot 0.00\n", + " floors 0.00\n", + " waterfront 0.00\n", + " view 0.00\n", + " condition 0.00\n", + " grade 0.00\n", + " sqft_above 0.00\n", + " sqft_basement 0.00\n", + " yr_built 0.00\n", + " zipcode 0.00\n", + " lat 0.00\n", + " long 0.00\n", + " sqft_living15 0.00\n", + " sqft_lot15 0.00\n", + " dtype: float64}" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "identify_issues(house_data_clean)" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [], + "source": [ + "# Checking for duplicates in the dataset\n", + "house_data_clean[house_data_clean.duplicated()]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.8.11" + } + }, + "nbformat": 4, + "nbformat_minor": 5 +} + + \ No newline at end of file From 0edfa118235af2f6329f710be238452ff730db5b Mon Sep 17 00:00:00 2001 From: clydeochieng <107258512+clydeochieng@users.noreply.github.com> Date: 
Wed, 1 May 2024 11:32:17 +0300 Subject: [PATCH 33/42] Update student.ipynb --- student.ipynb | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/student.ipynb b/student.ipynb index 0893adb3..c9d9d88b 100644 --- a/student.ipynb +++ b/student.ipynb @@ -853,7 +853,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} - - - \ No newline at end of file +} \ No newline at end of file From 9d605250e480b913d8a7e552b6a90206d811e08f Mon Sep 17 00:00:00 2001 From: clydeochieng <107258512+clydeochieng@users.noreply.github.com> Date: Wed, 1 May 2024 11:43:49 +0300 Subject: [PATCH 34/42] Update student.ipynb --- student.ipynb | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/student.ipynb b/student.ipynb index c9d9d88b..82d7ee49 100644 --- a/student.ipynb +++ b/student.ipynb @@ -827,11 +827,12 @@ "metadata": {}, "outputs": [], "source": [ - "# Checking for duplicates in the dataset\n", - "house_data_clean[house_data_clean.duplicated()]" - ] - } - ], + "# Checking for duplicates in the dataset\n", + "house_data_clean[house_data_clean.duplicated()]" + ] +}, +{ +"cells": [], "metadata": { "kernelspec": { "display_name": "Python 3", @@ -847,7 +848,7 @@ "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", + "pygments_lexer": "ipython3", "version": "3.8.11" } }, From 14adaad0364e6d4495b633c371b72cc1c352a3d6 Mon Sep 17 00:00:00 2001 From: clydeochieng <107258512+clydeochieng@users.noreply.github.com> Date: Wed, 1 May 2024 11:45:00 +0300 Subject: [PATCH 35/42] Update student.ipynb --- student.ipynb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/student.ipynb b/student.ipynb index 82d7ee49..7aa3d364 100644 --- a/student.ipynb +++ b/student.ipynb @@ -854,4 +854,4 @@ }, "nbformat": 4, "nbformat_minor": 5 -} \ No newline at end of file +}, \ No newline at end of file From 4275e4e074c4b71e4d9493cb2357b7011a1332cf Mon Sep 17 00:00:00 2001 From: clydeochieng 
<107258512+clydeochieng@users.noreply.github.com> Date: Wed, 1 May 2024 11:52:56 +0300 Subject: [PATCH 36/42] Update student.ipynb --- student.ipynb | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/student.ipynb b/student.ipynb index 6fdc9263..59bab0ab 100644 --- a/student.ipynb +++ b/student.ipynb @@ -6,24 +6,20 @@ "source": [ "## Final Project Submission\n", "\n", -Clyde - "* Student name: Solphine Joseph, Grace Rotich, Mathew Kiprotich, Hilary Simiyu, Clyde Ochieng, Derrick Kiptoo \n", - main - "* Student pace: full time\n", + "* Student pace: full time\n", "* Scheduled project review date/time: \n", "* Instructor name: Nikita \n", "* Blog post URL:\n" ] }, - Clyde { "cell_type": "markdown", "metadata": {}, "source": [ "# Kings County Housing Analysis with Multiple Linear Regression" ] - }, + } { "cell_type": "markdown", "metadata": {}, From 11616b1c966613b440f2144e2b3a8d18708bf2a3 Mon Sep 17 00:00:00 2001 From: clydeochieng <107258512+clydeochieng@users.noreply.github.com> Date: Wed, 1 May 2024 11:54:28 +0300 Subject: [PATCH 37/42] Update student.ipynb --- student.ipynb | 1515 ++++++++++++++++++------------------------------- 1 file changed, 564 insertions(+), 951 deletions(-) diff --git a/student.ipynb b/student.ipynb index 59bab0ab..4a29c2f7 100644 --- a/student.ipynb +++ b/student.ipynb @@ -6,144 +6,13 @@ "source": [ "## Final Project Submission\n", "\n", - "* Student name: Solphine Joseph, Grace Rotich, Mathew Kiprotich, Hilary Simiyu, Clyde Ochieng, Derrick Kiptoo \n", - "* Student pace: full time\n", + "* Student name: Solphine Joseph, Grace Rotich, Mathew Kiprotich, Hilary Simiyu, Clyde Ochieng. 
\n", + "* Student pace: full time\n", "* Scheduled project review date/time: \n", "* Instructor name: Nikita \n", "* Blog post URL:\n" ] }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "# Kings County Housing Analysis with Multiple Linear Regression" - ] - } - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Overview\n", - "\n", - "A real estate agency in Kingsway seeks to determine what are the contributing factors that affect the price of houses to make improvements where necessary. They want to employ an analytical approach rather than sentimental before arriving at a decision. Multilinear regression has been used for this project to understand how various features affect their pricing to better their services." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Business Problem\n", - "\n", - "In the face of market fluctuations and heightened competition within the real estate sector, our agency is grappling with pricing volatility, which poses significant challenges for our agents in devising effective business strategies. We seek strategic guidance to optimize our purchasing and selling endeavors, prioritizing informed decision-making to identify key areas of focus that promise maximum returns on investment." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Objectives\n", - "* To determine the key factors influencing house prices.\n", - "* To develop multilinear regression models to predict house prices based on relevant features.\n", - "* To use insights from the regression analysis to optimize pricing strategies for both purchasing and selling properties.\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Hypothesis\n", - "* Null Hypothesis - There is no relationship between our independent variables and our dependent variable \n", - "\n", - "* Alternative Hypothesis - There is a relationship between our independent variables and our dependent variable" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Data Understanding:\n", - "\n", - "In this project, we utilized the King County House Sales dataset, which serves as the foundational dataset for our analysis. It was sourced Kaggle.The dataset encompasses comprehensive information regarding house sales within King County, Washington, USA. It comprises a diverse array of features, including the number of bedrooms, bathrooms, square footage, as well as geographical and pricing details of the properties sold. This dataset is frequently employed in data science and machine learning endeavors, particularly for predictive modeling tasks such as regression analysis aimed at forecasting house prices based on the provided features." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### King County Housing Data Columns \n", - "\n", - "The column names contained in column_names.md are:\n", - "* `id`: A unique identifier for each house sale.\n", - "* `date`: The date when the house was sold.\n", - "* `price`: The sale price of the house, serving as the target variable for predictive modeling.\n", - "* `bedrooms`, `bathrooms`, `sqft_living`, `sqft_lot`: Numerical features representing the number of bedrooms and bathrooms, as well as the living area and lot area of the house, respectively.\n", - "* `floors`: The number of floors in the house.\n", - "* `waterfront`, `view`, `condition`, `grade`: Categorical features describing aspects such as waterfront availability, property view, condition, and overall grade assigned to the housing unit.\n", - "* `yr_built`, `yr_renovated`: Year of construction and renovation of the house.\n", - "* `zipcode`, `lat`, `long`: Geographical features including ZIP code, latitude, and longitude coordinates.\n", - "* `sqft_above`, `sqft_basement`, `sqft_living15`, `sqft_lot15`: Additional numerical features providing details about the house's above-ground and basement square footage, as well as living area and lot area of the nearest 15 neighboring houses." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Data Loading\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 16, - "metadata": {}, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "%matplotlib inline\n", - "import numpy as np\n", - "import pandas as pd\n", - "import scipy.stats as stats\n", - "import seaborn as sns\n", - "import statsmodels.api as sm\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "### Loading Data" - ] - }, - { - "cell_type": "code", - "execution_count": 17, - "metadata": {}, - "outputs": [], - "source": [ - "# Creating a function that loads data and return it in a dataframe\n", - "def load_data(file_path):\n", - " house_data = pd.read_csv(file_path)\n", - "\n", - " #shape\n", - " shape = house_data.shape\n", - " print(f\"The dataset contains {shape[0]} houses with {shape[1]} features\")\n", - " print()\n", - " \n", - " #Data Types\n", - " data_types = house_data.dtypes\n", - " print(\"Columns and their data types:\")\n", - " for column, dtype in data_types.items():\n", - " print(f\"{column}: {dtype}\")\n", - " print()\n", - "\n", - " return house_data\n" - ] - }, - { - "cell_type": "code", - "execution_count": 18, - { "cell_type": "markdown", "metadata": {}, @@ -252,7 +121,6 @@ { "cell_type": "code", "execution_count": 3, - main "metadata": {}, "outputs": [ { @@ -599,821 +467,566 @@ "\n", "

21597 rows × 21 columns

\n", "" - ], - "text/plain": [ - " id date price bedrooms bathrooms sqft_living \\\n", - "0 7129300520 10/13/2014 221900.0 3 1.00 1180 \n", - "1 6414100192 12/9/2014 538000.0 3 2.25 2570 \n", - "2 5631500400 2/25/2015 180000.0 2 1.00 770 \n", - "3 2487200875 12/9/2014 604000.0 4 3.00 1960 \n", - "4 1954400510 2/18/2015 510000.0 3 2.00 1680 \n", - "... ... ... ... ... ... ... \n", - "21592 263000018 5/21/2014 360000.0 3 2.50 1530 \n", - "21593 6600060120 2/23/2015 400000.0 4 2.50 2310 \n", - "21594 1523300141 6/23/2014 402101.0 2 0.75 1020 \n", - "21595 291310100 1/16/2015 400000.0 3 2.50 1600 \n", - "21596 1523300157 10/15/2014 325000.0 2 0.75 1020 \n", - "\n", - " sqft_lot floors waterfront view ... grade sqft_above \\\n", - "0 5650 1.0 NaN NONE ... 7 Average 1180 \n", - "1 7242 2.0 NO NONE ... 7 Average 2170 \n", - "2 10000 1.0 NO NONE ... 6 Low Average 770 \n", - "3 5000 1.0 NO NONE ... 7 Average 1050 \n", - "4 8080 1.0 NO NONE ... 8 Good 1680 \n", - "... ... ... ... ... ... ... ... \n", - "21592 1131 3.0 NO NONE ... 8 Good 1530 \n", - "21593 5813 2.0 NO NONE ... 8 Good 2310 \n", - "21594 1350 2.0 NO NONE ... 7 Average 1020 \n", - "21595 2388 2.0 NaN NONE ... 8 Good 1600 \n", - "21596 1076 2.0 NO NONE ... 7 Average 1020 \n", - "\n", - " sqft_basement yr_built yr_renovated zipcode lat long \\\n", - "0 0.0 1955 0.0 98178 47.5112 -122.257 \n", - "1 400.0 1951 1991.0 98125 47.7210 -122.319 \n", - "2 0.0 1933 NaN 98028 47.7379 -122.233 \n", - "3 910.0 1965 0.0 98136 47.5208 -122.393 \n", - "4 0.0 1987 0.0 98074 47.6168 -122.045 \n", - "... ... ... ... ... ... ... \n", - "21592 0.0 2009 0.0 98103 47.6993 -122.346 \n", - "21593 0.0 2014 0.0 98146 47.5107 -122.362 \n", - "21594 0.0 2009 0.0 98144 47.5944 -122.299 \n", - "21595 0.0 2004 0.0 98027 47.5345 -122.069 \n", - "21596 0.0 2008 0.0 98144 47.5941 -122.299 \n", - "\n", - " sqft_living15 sqft_lot15 \n", - "0 1340 5650 \n", - "1 1690 7639 \n", - "2 2720 8062 \n", - "3 1360 5000 \n", - "4 1800 7503 \n", - "... ... ... 
\n", - "21592 1530 1509 \n", - "21593 1830 7200 \n", - "21594 1020 2007 \n", - "21595 1410 1287 \n", - "21596 1020 1357 \n", - "\n", - "[21597 rows x 21 columns]" - ] - }, -Clyde - "execution_count": 18, - - "execution_count": 3, - main - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "load_data('data/kc_house_data.csv') # Assuming 'data' folder is in the same directory\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The dataset contains 21 columns, each representing a distinct feature, and 21,597 rows, with each row corresponding to a specific house sale entry.\n", - "\n", - "The dataset contains a mix of data types, including integers (int64), floating-point numbers (float64), and objects (strings). For instance, numerical features such as bedrooms, bathrooms, and sqft_living are represented as integers or floating-point numbers to facilitate mathematical computations, while categorical features like waterfront and view are stored as objects to accommodate text-based categories." 
- ] - }, - { - "cell_type": "code", - Clyde - "execution_count": 19, - - "execution_count": 4, - main - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The dataset contains 21597 houses with 21 features\n", - "\n", - "Columns and their data types:\n", - "id: int64\n", - "date: object\n", - "price: float64\n", - "bedrooms: int64\n", - "bathrooms: float64\n", - "sqft_living: int64\n", - "sqft_lot: int64\n", - "floors: float64\n", - "waterfront: object\n", - "view: object\n", - "condition: object\n", - "grade: object\n", - "sqft_above: int64\n", - "sqft_basement: object\n", - "yr_built: int64\n", - "yr_renovated: float64\n", - "zipcode: int64\n", - "lat: float64\n", - "long: float64\n", - "sqft_living15: int64\n", - "sqft_lot15: int64\n", - "\n" - ] - } - ], - "source": [ - "kings_data = load_data('data/kc_house_data.csv')" - ] - }, - { - "cell_type": "code", - Clyde - "execution_count": 20, - - "execution_count": 5, - main - "metadata": {}, - "outputs": [], - "source": [ - "#create a function that takes in a column and returns the column statistics as a dictionary\n", - "def descriptive_analytics(column):\n", - " stats_dict = column.describe().to_dict()\n", - " \n", - " print(\"Descriptive Statistics for Column '{}':\".format(column.name))\n", - " print(\"The count of the column is:\", stats_dict['count'])\n", - " print(\"The mean of the column is:\", stats_dict['mean'])\n", - " print(\"The standard deviation of the column is:\", stats_dict['std'])\n", - " print(\"The minimum value of the column is:\", stats_dict['min'])\n", - " print(\"The 25th percentile of the column is:\", stats_dict['25%'])\n", - " print(\"The median of the column is:\", stats_dict['50%'])\n", - " print(\"The 75th percentile of the column is:\", stats_dict['75%'])\n", - " print(\"The maximum value of the column is:\", stats_dict['max'])" - ] - }, - Clyde - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - 
"name": "stdout", - "output_type": "stream", - "text": [ - "Descriptive Statistics for Column 'price':\n", - "The count of the column is: 21597.0\n", - "The mean of the column is: 540296.5735055795\n", - "The standard deviation of the column is: 367368.1401013936\n", - "The minimum value of the column is: 78000.0\n", - "The 25th percentile of the column is: 322000.0\n", - "The median of the column is: 450000.0\n", - "The 75th percentile of the column is: 645000.0\n", - "The maximum value of the column is: 7700000.0\n" - ] - } - ], - "source": [ - "descriptive_analytics(kings_data['price'])" - - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Descriptive Statistics for Column 'price':\n", - "The count of the column is: 21597.0\n", - "The mean of the column is: 540296.5735055795\n", - "The standard deviation of the column is: 367368.1401013936\n", - "The minimum value of the column is: 78000.0\n", - "The 25th percentile of the column is: 322000.0\n", - "The median of the column is: 450000.0\n", - "The 75th percentile of the column is: 645000.0\n", - "The maximum value of the column is: 7700000.0\n" - ] - } - ], - "source": [ - "descriptive_analytics(kings_data['price'])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can see that the maximum price of a house is 7700000 dollars and the minimum price is 78000 dollars\n", - "\n", - "There are 21597 prices regarding to the houses in the dataset\n", - "\n", - "Average price of a house is 540296.57 dollars" - main - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - Clyde - "We can see that the maximum price of a house is 7700000 dollars and the minimum price is 78000 dollars\n", - "\n", - "There are 21597 prices regarding to the houses in the dataset\n", - "\n", - "Average price of a house is 540296.57 dollars" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - 
"source": [ - "## Data Preperation\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 22, - - "## Data Preperation\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - main - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "RangeIndex: 21597 entries, 0 to 21596\n", - "Data columns (total 21 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 id 21597 non-null int64 \n", - " 1 date 21597 non-null object \n", - " 2 price 21597 non-null float64\n", - " 3 bedrooms 21597 non-null int64 \n", - " 4 bathrooms 21597 non-null float64\n", - " 5 sqft_living 21597 non-null int64 \n", - " 6 sqft_lot 21597 non-null int64 \n", - " 7 floors 21597 non-null float64\n", - " 8 waterfront 19221 non-null object \n", - " 9 view 21534 non-null object \n", - " 10 condition 21597 non-null object \n", - " 11 grade 21597 non-null object \n", - " 12 sqft_above 21597 non-null int64 \n", - " 13 sqft_basement 21597 non-null object \n", - " 14 yr_built 21597 non-null int64 \n", - " 15 yr_renovated 17755 non-null float64\n", - " 16 zipcode 21597 non-null int64 \n", - " 17 lat 21597 non-null float64\n", - " 18 long 21597 non-null float64\n", - " 19 sqft_living15 21597 non-null int64 \n", - " 20 sqft_lot15 21597 non-null int64 \n", - "dtypes: float64(6), int64(9), object(6)\n", - "memory usage: 3.5+ MB\n" - ] - } - ], - "source": [ - "kings_data.info()" - ] - }, - Clyde - { - "cell_type": "code", - "execution_count": 23, - "metadata": {}, - "outputs": [], - "source": [ - "def identify_issues(dataset):\n", - " # Identify missing values as a percentage of the whole dataset\n", - " missing_values = (dataset.isnull().sum())/len(dataset) * 100\n", - "\n", - " # Identify duplicates\n", - " duplicates = dataset.duplicated().sum()\n", - " \n", - " #return a dictionary \n", - " return {'duplicates': duplicates,\n", - " 'missing values': missing_values.round(2)} \n" - - { - 
"cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "def identify_issues(dataset):\n", - " # Identify missing values as a percentage of the whole dataset\n", - " missing_values = (dataset.isnull().sum())/len(dataset) * 100\n", - "\n", - " # Identify duplicates\n", - " duplicates = dataset.duplicated().sum()\n", - " \n", - " #return a dictionary \n", - " return {'duplicates': duplicates,\n", - " 'missing values': missing_values.round(2)} \n" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'duplicates': 0,\n", - " 'missing values': id 0.00\n", - " date 0.00\n", - " price 0.00\n", - " bedrooms 0.00\n", - " bathrooms 0.00\n", - " sqft_living 0.00\n", - " sqft_lot 0.00\n", - " floors 0.00\n", - " waterfront 11.00\n", - " view 0.29\n", - " condition 0.00\n", - " grade 0.00\n", - " sqft_above 0.00\n", - " sqft_basement 0.00\n", - " yr_built 0.00\n", - " yr_renovated 17.79\n", - " zipcode 0.00\n", - " lat 0.00\n", - " long 0.00\n", - " sqft_living15 0.00\n", - " sqft_lot15 0.00\n", - " dtype: float64}" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "identify_issues(kings_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The examination indicates that there are no duplicate entries within the dataset, ensuring the integrity of the records. However, attention is warranted to address missing values present in certain columns. Specifically, the 'waterfront' feature exhibits 11% of null values, representing a negligible portion of the dataset. 
Similarly, the 'yr_renovated' feature shows a relatively higher percentage of missing values, accounting for approximately 17.79% of the dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Before making changes make a copy instead of overwriting data" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "house_data_clean = kings_data.copy()" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "# Changing the date to date time\n", - "house_data_clean['date'] = pd.to_datetime(house_data_clean['date'])\n", - "\n", - "# Extracting only the year from the column Date\n", - "house_data_clean.date = house_data_clean['date'].dt.year\n", - "\n", - "# Changing the dates for the year built \n", - "house_data_clean['yr_built'] = pd.to_datetime(house_data_clean['yr_built'],format='%Y').dt.year\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The above code converts the 'date' column data to only contain the year the house was sold, for the purpose of analysis we will use only the year since the changes month by month will be minor." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Dealing with the missing values" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "def missing_values(dataset):\n", - " # drop the rows from views\n", - " dataset.dropna(subset=['view'],inplace=True)\n", - "\n", - " # Filling the NaN values for waterfront with NO\n", - " dataset.waterfront.fillna('NO',inplace=True)\n", - " \n", - " # Dropping the yr_renovated column \n", - " dataset.drop('yr_renovated',axis=1,inplace=True)" - main - ] - }, - { - "cell_type": "code", - Clyde - "execution_count": 24, - - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "missing_values(house_data_clean)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "'yr_renovated' has the highest percentage of NaN values 17%. This will be dropped since it will not be used within our model inline with the business problem.\n", - "\n", - "'Waterfront' feature has 11% null values, this was filled with NO on the assumption that these cells were not filled since they lacked waterfronts\n", - "\n", - "For the 'View' column, the null values were dropped by row since the overall percentage impact would be minute" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - main - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - Clyde - "{'duplicates': 0,\n", - " 'missing values': id 0.00\n", - " date 0.00\n", - " price 0.00\n", - " bedrooms 0.00\n", - " bathrooms 0.00\n", - " sqft_living 0.00\n", - " sqft_lot 0.00\n", - " floors 0.00\n", - " waterfront 11.00\n", - " view 0.29\n", - " condition 0.00\n", - " grade 0.00\n", - " sqft_above 0.00\n", - " sqft_basement 0.00\n", - " yr_built 0.00\n", - " yr_renovated 17.79\n", - " zipcode 0.00\n", - " lat 0.00\n", - " long 0.00\n", - " sqft_living15 0.00\n", - " sqft_lot15 0.00\n", - " dtype: float64}" - ] - }, - "execution_count": 24, - - 
"{'duplicates': 2,\n", - " 'missing values': id 0.0\n", - " date 0.0\n", - " price 0.0\n", - " bedrooms 0.0\n", - " bathrooms 0.0\n", - " sqft_living 0.0\n", - " sqft_lot 0.0\n", - " floors 0.0\n", - " waterfront 0.0\n", - " view 0.0\n", - " condition 0.0\n", - " grade 0.0\n", - " sqft_above 0.0\n", - " sqft_basement 0.0\n", - " yr_built 0.0\n", - " zipcode 0.0\n", - " lat 0.0\n", - " long 0.0\n", - " sqft_living15 0.0\n", - " sqft_lot15 0.0\n", - " dtype: float64}" - ] - }, - "execution_count": 14, - main - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - Clyde - "identify_issues(kings_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The examination indicates that there are no duplicate entries within the dataset, ensuring the integrity of the records. However, attention is warranted to address missing values present in certain columns. Specifically, the 'waterfront' feature exhibits 11% of null values, representing a negligible portion of the dataset. 
Similarly, the 'yr_renovated' feature shows a relatively higher percentage of missing values, accounting for approximately 17.79% of the dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Before making changes make a copy instead of overwriting data" - ] - }, - { - "cell_type": "code", - "execution_count": 25, - "metadata": {}, - "outputs": [], - "source": [ - "house_data_clean = kings_data.copy()" - ] - }, - { - "cell_type": "code", - "execution_count": 26, - "metadata": {}, - "outputs": [], - "source": [ - "# Changing the date to date time\n", - "house_data_clean['date'] = pd.to_datetime(house_data_clean['date'])\n", - "\n", - "# Extracting only the year from the column Date\n", - "house_data_clean.date = house_data_clean['date'].dt.year\n", - "\n", - "# Changing the dates for the year built \n", - "house_data_clean['yr_built'] = pd.to_datetime(house_data_clean['yr_built'],format='%Y').dt.year\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The above code converts the 'date' column data to only contain the year the house was sold, for the purpose of analysis we will use only the year since the changes month by month will be minor." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Dealing with the missing values" - ] - }, - { - "cell_type": "code", - "execution_count": 27, - "metadata": {}, - "outputs": [], - "source": [ - "def missing_values(dataset):\n", - " # drop the rows from views\n", - " dataset.dropna(subset=['view'],inplace=True)\n", - "\n", - " # Filling the NaN values for waterfront with NO\n", - " dataset.waterfront.fillna('NO',inplace=True)\n", - " \n", - " # Dropping the yr_renovated column \n", - " dataset.drop('yr_renovated',axis=1,inplace=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 28, - "metadata": {}, - "outputs": [], - "source": [ - "missing_values(house_data_clean)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "'yr_renovated' has the highest percentage of NaN values 17%. This will be dropped since it will not be used within our model inline with the business problem.\n", - "\n", - "'Waterfront' feature has 11% null values, this was filled with NO on the assumption that these cells were not filled since they lacked waterfronts\n", - "\n", - "For the 'View' column, the null values were dropped by row since the overall percentage impact would be minute" - ] - }, - { - "cell_type": "code", - "execution_count": 29, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'duplicates': 2,\n", - " 'missing values': id 0.0\n", - " date 0.0\n", - " price 0.0\n", - " bedrooms 0.0\n", - " bathrooms 0.0\n", - " sqft_living 0.0\n", - " sqft_lot 0.0\n", - " floors 0.0\n", - " waterfront 0.0\n", - " view 0.0\n", - " condition 0.0\n", - " grade 0.0\n", - " sqft_above 0.0\n", - " sqft_basement 0.0\n", - " yr_built 0.0\n", - " zipcode 0.0\n", - " lat 0.0\n", - " long 0.0\n", - " sqft_living15 0.0\n", - " sqft_lot15 0.0\n", - " dtype: float64}" - ] - }, - "execution_count": 29, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "identify_issues(house_data_clean)" - ] - }, - 
{ - "cell_type": "code", - "execution_count": 30, - - "identify_issues(house_data_clean)" - ] - }, - { - "cell_type": "code", - "execution_count": 15, - main - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
iddatepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontviewconditiongradesqft_abovesqft_basementyr_builtzipcodelatlongsqft_living15sqft_lot15
394718250690312014550000.041.75241084472.0NOGOODGood8 Good2060350.019369807447.6499-122.088252014789
2003886489001102014555000.032.50194032112.0NONONEAverage8 Good19400.020099802747.5644-122.09318803078
\n", - "
" - ], - "text/plain": [ - " id date price bedrooms bathrooms sqft_living sqft_lot \\\n", - "3947 1825069031 2014 550000.0 4 1.75 2410 8447 \n", - "20038 8648900110 2014 555000.0 3 2.50 1940 3211 \n", - "\n", - " floors waterfront view condition grade sqft_above sqft_basement \\\n", - "3947 2.0 NO GOOD Good 8 Good 2060 350.0 \n", - "20038 2.0 NO NONE Average 8 Good 1940 0.0 \n", - "\n", - " yr_built zipcode lat long sqft_living15 sqft_lot15 \n", - "3947 1936 98074 47.6499 -122.088 2520 14789 \n", - "20038 2009 98027 47.5644 -122.093 1880 3078 " - ] - }, - Clyde - "execution_count": 30, - - "execution_count": 15, - main - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "house_data_clean[house_data_clean.duplicated()]" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - } - }, - "nbformat": 4, - "nbformat_minor": 2 + ], + "text/plain": [ + " id date price bedrooms bathrooms sqft_living \\\n", + "0 7129300520 10/13/2014 221900.0 3 1.00 1180 \n", + "1 6414100192 12/9/2014 538000.0 3 2.25 2570 \n", + "2 5631500400 2/25/2015 180000.0 2 1.00 770 \n", + "3 2487200875 12/9/2014 604000.0 4 3.00 1960 \n", + "4 1954400510 2/18/2015 510000.0 3 2.00 1680 \n", + "... ... ... ... ... ... ... \n", + "21592 263000018 5/21/2014 360000.0 3 2.50 1530 \n", + "21593 6600060120 2/23/2015 400000.0 4 2.50 2310 \n", + "21594 1523300141 6/23/2014 402101.0 2 0.75 1020 \n", + "21595 291310100 1/16/2015 400000.0 3 2.50 1600 \n", + "21596 1523300157 10/15/2014 325000.0 2 0.75 1020 \n", + "\n", + " sqft_lot floors waterfront view ... grade sqft_above \\\n", + "0 5650 1.0 NaN NONE ... 7 Average 1180 \n", + "1 7242 2.0 NO NONE ... 
7 Average 2170 \n", + "2 10000 1.0 NO NONE ... 6 Low Average 770 \n", + "3 5000 1.0 NO NONE ... 7 Average 1050 \n", + "4 8080 1.0 NO NONE ... 8 Good 1680 \n", + "... ... ... ... ... ... ... ... \n", + "21592 1131 3.0 NO NONE ... 8 Good 1530 \n", + "21593 5813 2.0 NO NONE ... 8 Good 2310 \n", + "21594 1350 2.0 NO NONE ... 7 Average 1020 \n", + "21595 2388 2.0 NaN NONE ... 8 Good 1600 \n", + "21596 1076 2.0 NO NONE ... 7 Average 1020 \n", + "\n", + " sqft_basement yr_built yr_renovated zipcode lat long \\\n", + "0 0.0 1955 0.0 98178 47.5112 -122.257 \n", + "1 400.0 1951 1991.0 98125 47.7210 -122.319 \n", + "2 0.0 1933 NaN 98028 47.7379 -122.233 \n", + "3 910.0 1965 0.0 98136 47.5208 -122.393 \n", + "4 0.0 1987 0.0 98074 47.6168 -122.045 \n", + "... ... ... ... ... ... ... \n", + "21592 0.0 2009 0.0 98103 47.6993 -122.346 \n", + "21593 0.0 2014 0.0 98146 47.5107 -122.362 \n", + "21594 0.0 2009 0.0 98144 47.5944 -122.299 \n", + "21595 0.0 2004 0.0 98027 47.5345 -122.069 \n", + "21596 0.0 2008 0.0 98144 47.5941 -122.299 \n", + "\n", + " sqft_living15 sqft_lot15 \n", + "0 1340 5650 \n", + "1 1690 7639 \n", + "2 2720 8062 \n", + "3 1360 5000 \n", + "4 1800 7503 \n", + "... ... ... \n", + "21592 1530 1509 \n", + "21593 1830 7200 \n", + "21594 1020 2007 \n", + "21595 1410 1287 \n", + "21596 1020 1357 \n", + "\n", + "[21597 rows x 21 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "load_data('data/kc_house_data.csv') # Assuming 'data' folder is in the same directory\n", + "\n", + "\n" + ] +}, +{ + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The dataset contains 21 columns, each representing a distinct feature, and 21,597 rows, with each row corresponding to a specific house sale entry.\n", + "\n", + "The dataset contains a mix of data types, including integers (int64), floating-point numbers (float64), and objects (strings). 
For instance, numerical features such as bedrooms, bathrooms, and sqft_living are represented as integers or floating-point numbers to facilitate mathematical computations, while categorical features like waterfront and view are stored as objects to accommodate text-based categories." + ] +}, +{ + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The dataset contains 21597 houses with 21 features\n", + "\n", + "Columns and their data types:\n", + "id: int64\n", + "date: object\n", + "price: float64\n", + "bedrooms: int64\n", + "bathrooms: float64\n", + "sqft_living: int64\n", + "sqft_lot: int64\n", + "floors: float64\n", + "waterfront: object\n", + "view: object\n", + "condition: object\n", + "grade: object\n", + "sqft_above: int64\n", + "sqft_basement: object\n", + "yr_built: int64\n", + "yr_renovated: float64\n", + "zipcode: int64\n", + "lat: float64\n", + "long: float64\n", + "sqft_living15: int64\n", + "sqft_lot15: int64\n", + "\n" + ] + } + ], + "source": [ + "kings_data = load_data('data/kc_house_data.csv')" + ] +}, +{ + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "#create a function that takes in a column and returns the column statistics as a dictionary\n", + "def descriptive_analytics(column):\n", + " stats_dict = column.describe().to_dict()\n", + " \n", + " print(\"Descriptive Statistics for Column '{}':\".format(column.name))\n", + " print(\"The count of the column is:\", stats_dict['count'])\n", + " print(\"The mean of the column is:\", stats_dict['mean'])\n", + " print(\"The standard deviation of the column is:\", stats_dict['std'])\n", + " print(\"The minimum value of the column is:\", stats_dict['min'])\n", + " print(\"The 25th percentile of the column is:\", stats_dict['25%'])\n", + " print(\"The median of the column is:\", stats_dict['50%'])\n", + " print(\"The 75th percentile of the column is:\", 
stats_dict['75%'])\n", + " print(\"The maximum value of the column is:\", stats_dict['max'])" + ] +}, +{ + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Descriptive Statistics for Column 'price':\n", + "The count of the column is: 21597.0\n", + "The mean of the column is: 540296.5735055795\n", + "The standard deviation of the column is: 367368.1401013936\n", + "The minimum value of the column is: 78000.0\n", + "The 25th percentile of the column is: 322000.0\n", + "The median of the column is: 450000.0\n", + "The 75th percentile of the column is: 645000.0\n", + "The maximum value of the column is: 7700000.0\n" + ] + } + ], + "source": [ + "descriptive_analytics(kings_data['price'])" + ] +}, +{ + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see that the maximum price of a house is 7700000 dollars and the minimum price is 78000 dollars\n", + "\n", + "There are 21597 prices regarding to the houses in the dataset\n", + "\n", + "Average price of a house is 540296.57 dollars" + ] +}, +{ + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Preperation\n", + "\n" + ] +}, +{ + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 21597 entries, 0 to 21596\n", + "Data columns (total 21 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 id 21597 non-null int64 \n", + " 1 date 21597 non-null object \n", + " 2 price 21597 non-null float64\n", + " 3 bedrooms 21597 non-null int64 \n", + " 4 bathrooms 21597 non-null float64\n", + " 5 sqft_living 21597 non-null int64 \n", + " 6 sqft_lot 21597 non-null int64 \n", + " 7 floors 21597 non-null float64\n", + " 8 waterfront 19221 non-null object \n", + " 9 view 21534 non-null object \n", + " 10 condition 21597 non-null object \n", + " 11 grade 21597 
non-null object \n", + " 12 sqft_above 21597 non-null int64 \n", + " 13 sqft_basement 21597 non-null object \n", + " 14 yr_built 21597 non-null int64 \n", + " 15 yr_renovated 17755 non-null float64\n", + " 16 zipcode 21597 non-null int64 \n", + " 17 lat 21597 non-null float64\n", + " 18 long 21597 non-null float64\n", + " 19 sqft_living15 21597 non-null int64 \n", + " 20 sqft_lot15 21597 non-null int64 \n", + "dtypes: float64(6), int64(9), object(6)\n", + "memory usage: 3.5+ MB\n" + ] + } + ], + "source": [ + "kings_data.info()" + ] +}, +{ + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "def identify_issues(dataset):\n", + " # Identify missing values as a percentage of the whole dataset\n", + " missing_values = (dataset.isnull().sum())/len(dataset) * 100\n", + "\n", + " # Identify duplicates\n", + " duplicates = dataset.duplicated().sum()\n", + " \n", + " #return a dictionary \n", + " return {'duplicates': duplicates,\n", + " 'missing values': missing_values.round(2)} \n" + ] +}, +{ + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'duplicates': 0,\n", + " 'missing values': id 0.00\n", + " date 0.00\n", + " price 0.00\n", + " bedrooms 0.00\n", + " bathrooms 0.00\n", + " sqft_living 0.00\n", + " sqft_lot 0.00\n", + " floors 0.00\n", + " waterfront 11.00\n", + " view 0.29\n", + " condition 0.00\n", + " grade 0.00\n", + " sqft_above 0.00\n", + " sqft_basement 0.00\n", + " yr_built 0.00\n", + " yr_renovated 17.79\n", + " zipcode 0.00\n", + " lat 0.00\n", + " long 0.00\n", + " sqft_living15 0.00\n", + " sqft_lot15 0.00\n", + " dtype: float64}" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "identify_issues(kings_data)" + ] +}, +{ + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The examination indicates that there are no duplicate entries within the dataset, ensuring the 
integrity of the records. However, attention is warranted to address missing values present in certain columns. Specifically, the 'waterfront' feature exhibits 11% of null values, representing a negligible portion of the dataset. Similarly, the 'yr_renovated' feature shows a relatively higher percentage of missing values, accounting for approximately 17.79% of the dataset" + ] +}, +{ + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Before making changes make a copy instead of overwriting data" + ] +}, +{ + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "house_data_clean = kings_data.copy()" + ] +}, +{ + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# Changing the date to date time\n", + "house_data_clean['date'] = pd.to_datetime(house_data_clean['date'])\n", + "\n", + "# Extracting only the year from the column Date\n", + "house_data_clean.date = house_data_clean['date'].dt.year\n", + "\n", + "# Changing the dates for the year built \n", + "house_data_clean['yr_built'] = pd.to_datetime(house_data_clean['yr_built'],format='%Y').dt.year\n" + ] +}, +{ + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The above code converts the 'date' column data to only contain the year the house was sold, for the purpose of analysis we will use only the year since the changes month by month will be minor." 
+ ] +}, +{ + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Dealing with the missing values" + ] +}, +{ + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "def missing_values(dataset):\n", + " # drop the rows from views\n", + " dataset.dropna(subset=['view'],inplace=True)\n", + "\n", + " # Filling the NaN values for waterfront with NO\n", + " dataset.waterfront.fillna('NO',inplace=True)\n", + " \n", + " # Dropping the yr_renovated column \n", + " dataset.drop('yr_renovated',axis=1,inplace=True)" + ] +}, +{ + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "missing_values(house_data_clean)" + ] +}, +{ + "cell_type": "markdown", + "metadata": {}, + "source": [ + "'yr_renovated' has the highest percentage of NaN values 17%. This will be dropped since it will not be used within our model inline with the business problem.\n", + "\n", + "'Waterfront' feature has 11% null values, this was filled with NO on the assumption that these cells were not filled since they lacked waterfronts\n", + "\n", + "For the 'View' column, the null values were dropped by row since the overall percentage impact would be minute" + ] +}, +{ + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'duplicates': 2,\n", + " 'missing values': id 0.0\n", + " date 0.0\n", + " price 0.0\n", + " bedrooms 0.0\n", + " bathrooms 0.0\n", + " sqft_living 0.0\n", + " sqft_lot 0.0\n", + " floors 0.0\n", + " waterfront 0.0\n", + " view 0.0\n", + " condition 0.0\n", + " grade 0.0\n", + " sqft_above 0.0\n", + " sqft_basement 0.0\n", + " yr_built 0.0\n", + " zipcode 0.0\n", + " lat 0.0\n", + " long 0.0\n", + " sqft_living15 0.0\n", + " sqft_lot15 0.0\n", + " dtype: float64}" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "identify_issues(house_data_clean)" + ] +}, +{ + 
"cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddatepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontviewconditiongradesqft_abovesqft_basementyr_builtzipcodelatlongsqft_living15sqft_lot15
394718250690312014550000.041.75241084472.0NOGOODGood8 Good2060350.019369807447.6499-122.088252014789
2003886489001102014555000.032.50194032112.0NONONEAverage8 Good19400.020099802747.5644-122.09318803078
\n", + "
" + ], + "text/plain": [ + " id date price bedrooms bathrooms sqft_living sqft_lot \\\n", + "3947 1825069031 2014 550000.0 4 1.75 2410 8447 \n", + "20038 8648900110 2014 555000.0 3 2.50 1940 3211 \n", + "\n", + " floors waterfront view condition grade sqft_above sqft_basement \\\n", + "3947 2.0 NO GOOD Good 8 Good 2060 350.0 \n", + "20038 2.0 NO NONE Average 8 Good 1940 0.0 \n", + "\n", + " yr_built zipcode lat long sqft_living15 sqft_lot15 \n", + "3947 1936 98074 47.6499 -122.088 2520 14789 \n", + "20038 2009 98027 47.5644 -122.093 1880 3078 " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "house_data_clean[house_data_clean.duplicated()]" + ] +} +], +"metadata": { +"kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" +}, +"language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" +} +}, +"nbformat": 4, +"nbformat_minor": 2 } From 7d271f1f0b6a80bf4a1fba855a89a1a60b1a94e1 Mon Sep 17 00:00:00 2001 From: clydeochieng <107258512+clydeochieng@users.noreply.github.com> Date: Wed, 1 May 2024 11:54:48 +0300 Subject: [PATCH 38/42] Update student.ipynb --- student.ipynb | 999 +++++++++++++++++++++++++++++--------------------- 1 file changed, 587 insertions(+), 412 deletions(-) diff --git a/student.ipynb b/student.ipynb index 7aa3d364..3789754d 100644 --- a/student.ipynb +++ b/student.ipynb @@ -6,7 +6,7 @@ "source": [ "## Final Project Submission\n", "\n", - "* Student name: Solphine Joseph, Grace Rotich, Mathew Kiprotich, Hilary Simiyu, Clyde Ochieng, Derrick Kiptoo. \n", + "* Student name: Solphine Joseph, Grace Rotich, Mathew Kiprotich, Hilary Simiyu, Clyde Ochieng. 
\n", "* Student pace: full time\n", "* Scheduled project review date/time: \n", "* Instructor name: Nikita \n", @@ -342,7 +342,6 @@ " ...\n", " ...\n", " ...\n", - " ...\n", " \n", " \n", " 21592\n", @@ -426,7 +425,7 @@ " 1600\n", " 2388\n", " 2.0\n", - " NO\n", + " NaN\n", " NONE\n", " ...\n", " 8 Good\n", @@ -440,418 +439,594 @@ " 1410\n", " 1287\n", " \n", + " \n", + " 21596\n", + " 1523300157\n", + " 10/15/2014\n", + " 325000.0\n", + " 2\n", + " 0.75\n", + " 1020\n", + " 1076\n", + " 2.0\n", + " NO\n", + " NONE\n", + " ...\n", + " 7 Average\n", + " 1020\n", + " 0.0\n", + " 2008\n", + " 0.0\n", + " 98144\n", + " 47.5941\n", + " -122.299\n", + " 1020\n", + " 1357\n", + " \n", " \n", "\n", - "

21596 rows × 21 columns

\n", + "

21597 rows × 21 columns

\n", "" - ], - "text/plain": [ - " id date price bedrooms bathrooms sqft_living \\\n", - "0 7129300520 10/13/2014 221900.0 3 1.00 1180 \n", - "1 6414100192 12/9/2014 538000.0 3 2.25 2570 \n", - "2 5631500400 2/25/2015 180000.0 2 1.00 770 \n", - "3 2487200875 12/9/2014 604000.0 4 3.00 1960 \n", - "4 1954400510 2/18/2015 510000.0 3 2.00 1680 \n", - "... ... ... ... ... ... ... \n", - "21592 263000018 5/21/2014 360000.0 3 2.50 1530 \n", - "21593 6600060120 2/23/2015 400000.0 4 2.50 2310 \n", - "21594 1523300141 6/23/2014 402101.0 2 0.75 1020 \n", - "21595 291310100 1/16/2015 400000.0 3 2.50 1600 \n", - "21596 1523300157 10/15/2014 325000.0 2 0.75 1020 \n", - "\n", - " sqft_lot floors waterfront view ... grade sqft_above \\\n", - "0 5650 1.0 NaN NONE ... 7 Average 1180 \n", - "1 7242 2.0 NO NONE ... 7 Average 2170 \n", - "2 10000 1.0 NO NONE ... 6 Low Average 770 \n", - "3 5000 1.0 NO NONE ... 7 Average 1050 \n", - "4 8080 1.0 NO NONE ... 8 Good 1680 \n", - "... ... ... ... ... ... ... ... \n", - "21592 1131 3.0 NO NONE ... 8 Good 1530 \n", - "21593 5813 2.0 NO NONE ... 8 Good 2310 \n", - "21594 1350 2.0 NO NONE ... 7 Average 1020 \n", - "21595 2388 2.0 NaN NONE ... 8 Good 1600 \n", - "21596 1076 2.0 NO NONE ... 7 Average 1020 \n", - "\n", - " sqft_basement yr_built yr_renovated zipcode lat long \\\n", - "0 0.0 1955 0.0 98178 47.5112 -122.257 \n", - "1 400.0 1951 1991.0 98125 47.7210 -122.319 \n", - "2 0.0 1933 NaN 98028 47.7379 -122.233 \n", - "3 910.0 1965 0.0 98136 47.5208 -122.393 \n", - "4 0.0 1987 0.0 98074 47.6168 -122.045 \n", - "... ... ... ... ... ... ... \n", - "21592 0.0 2009 0.0 98103 47.6993 -122.346 \n", - "21593 0.0 2014 0.0 98146 47.5107 -122.362 \n", - "21594 0.0 2009 0.0 98144 47.5944 -122.299 \n", - "21595 0.0 2004 0.0 98027 47.5345 -122.069 \n", - "21596 0.0 2008 0.0 98144 47.5941 -122.299 \n", - "\n", - " sqft_living15 sqft_lot15 \n", - "0 1340 5650 \n", - "1 1690 7639 \n", - "2 2720 8062 \n", - "3 1360 5000 \n", - "4 1800 7503 \n", - "... ... ... 
\n", - "21592 1530 1509 \n", - "21593 1830 7200 \n", - "21594 1020 2007 \n", - "21595 1410 1287 \n", - "21596 1020 1357 \n", - "\n", - "[21597 rows x 21 columns]" - ] - }, - - "cells": [ - { - "cell_type": "code", - "execution_count": 18, - "metadata": {}, - "outputs": [], - "source": [ - "load_data('data/kc_house_data.csv') # Assuming 'data' folder is in the same directory\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The dataset contains 21 columns, each representing a distinct feature, and 21,597 rows, with each row corresponding to a specific house sale entry.\n", - "\n", - "The dataset contains a mix of data types, including integers (int64), floating-point numbers (float64), and objects (strings). For instance, numerical features such as bedrooms, bathrooms, and sqft_living are represented as integers or floating-point numbers to facilitate mathematical computations, while categorical features like waterfront and view are stored as objects to accommodate text-based categories." - ] - }, - { - "cell_type": "code", - "execution_count": 19, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The dataset contains 21597 houses with 21 features\n", + ], + "text/plain": [ + " id date price bedrooms bathrooms sqft_living \\\n", + "0 7129300520 10/13/2014 221900.0 3 1.00 1180 \n", + "1 6414100192 12/9/2014 538000.0 3 2.25 2570 \n", + "2 5631500400 2/25/2015 180000.0 2 1.00 770 \n", + "3 2487200875 12/9/2014 604000.0 4 3.00 1960 \n", + "4 1954400510 2/18/2015 510000.0 3 2.00 1680 \n", + "... ... ... ... ... ... ... \n", + "21592 263000018 5/21/2014 360000.0 3 2.50 1530 \n", + "21593 6600060120 2/23/2015 400000.0 4 2.50 2310 \n", + "21594 1523300141 6/23/2014 402101.0 2 0.75 1020 \n", + "21595 291310100 1/16/2015 400000.0 3 2.50 1600 \n", + "21596 1523300157 10/15/2014 325000.0 2 0.75 1020 \n", + "\n", + " sqft_lot floors waterfront view ... 
grade sqft_above \\\n", + "0 5650 1.0 NaN NONE ... 7 Average 1180 \n", + "1 7242 2.0 NO NONE ... 7 Average 2170 \n", + "2 10000 1.0 NO NONE ... 6 Low Average 770 \n", + "3 5000 1.0 NO NONE ... 7 Average 1050 \n", + "4 8080 1.0 NO NONE ... 8 Good 1680 \n", + "... ... ... ... ... ... ... ... \n", + "21592 1131 3.0 NO NONE ... 8 Good 1530 \n", + "21593 5813 2.0 NO NONE ... 8 Good 2310 \n", + "21594 1350 2.0 NO NONE ... 7 Average 1020 \n", + "21595 2388 2.0 NaN NONE ... 8 Good 1600 \n", + "21596 1076 2.0 NO NONE ... 7 Average 1020 \n", + "\n", + " sqft_basement yr_built yr_renovated zipcode lat long \\\n", + "0 0.0 1955 0.0 98178 47.5112 -122.257 \n", + "1 400.0 1951 1991.0 98125 47.7210 -122.319 \n", + "2 0.0 1933 NaN 98028 47.7379 -122.233 \n", + "3 910.0 1965 0.0 98136 47.5208 -122.393 \n", + "4 0.0 1987 0.0 98074 47.6168 -122.045 \n", + "... ... ... ... ... ... ... \n", + "21592 0.0 2009 0.0 98103 47.6993 -122.346 \n", + "21593 0.0 2014 0.0 98146 47.5107 -122.362 \n", + "21594 0.0 2009 0.0 98144 47.5944 -122.299 \n", + "21595 0.0 2004 0.0 98027 47.5345 -122.069 \n", + "21596 0.0 2008 0.0 98144 47.5941 -122.299 \n", + "\n", + " sqft_living15 sqft_lot15 \n", + "0 1340 5650 \n", + "1 1690 7639 \n", + "2 2720 8062 \n", + "3 1360 5000 \n", + "4 1800 7503 \n", + "... ... ... 
\n", + "21592 1530 1509 \n", + "21593 1830 7200 \n", + "21594 1020 2007 \n", + "21595 1410 1287 \n", + "21596 1020 1357 \n", + "\n", + "[21597 rows x 21 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "load_data('data/kc_house_data.csv') # Assuming 'data' folder is in the same directory\n", + "\n", + "\n" + ] +}, { - "cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "kings_data = load_data('data/kc_house_data.csv')" - ] - }, - { - "cell_type": "code", - "execution_count": 20, - "metadata": {}, - "outputs": [], - "source": [ - "# create a function that takes in a column and returns the column statistics as a dictionary\n", - "def descriptive_analytics(column):\n", - " stats_dict = column.describe().to_dict()\n", - " \n", - " print(\"Descriptive Statistics for Column '{}':\".format(column.name))\n", - " print(\"The count of the column is:\", stats_dict['count'])\n", - " print(\"The mean of the column is:\", stats_dict['mean'])\n", - " print(\"The standard deviation of the column is:\", stats_dict['std'])\n", - " print(\"The minimum value of the column is:\", stats_dict['min'])\n", - " print(\"The 25th percentile of the column is:\", stats_dict['25%'])\n", - " print(\"The median of the column is:\", stats_dict['50%'])\n", - " print(\"The 75th percentile of the column is:\", stats_dict['75%'])\n", - " print(\"The maximum value of the column is:\", stats_dict['max'])" - ] - }, - { - "cell_type": "code", - "execution_count": 21, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Descriptive Statistics for Column 'price':\n", - "The count of the column is: 21597.0\n", - "The mean of the column is: 540296.5735055795\n", - "The standard deviation of the column is: 367368.1401013936\n", - "The minimum value of the column is: 78000.0\n", - "The 25th percentile of the column is: 322000.0\n", - 
"The median of the column is: 450000.0\n", - "The 75th percentile of the column is: 645000.0\n", - "The maximum value of the column is: 7700000.0\n" - ] - } - ], - "source": [ - "descriptive_analytics(kings_data['price'])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can see that the maximum price of a house is $7,700,000 and the minimum price is $78,000.\n", - "\n", - "There are 21,597 prices regarding the houses in the dataset.\n", - "\n", - "The average price of a house is $540,296.57." - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.9.1" - } - }, - "nbformat": 4, - "nbformat_minor": 5, -"cells": [ - { - "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "RangeIndex: 21597 entries, 0 to 21596\n", - "Data columns (total 21 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 id 21597 non-null int64 \n", - " 1 date 21597 non-null object \n", - " 2 price 21597 non-null float64\n", - " 3 bedrooms 21597 non-null int64 \n", - " 4 bathrooms 21597 non-null float64\n", - " 5 sqft_living 21597 non-null int64 \n", - " 6 sqft_lot 21597 non-null int64 \n", - " 7 floors 21597 non-null float64\n", - " 8 waterfront 19221 non-null object \n", - " 9 view 21534 non-null object \n", - " 10 condition 21597 non-null object \n", - " 11 grade 21597 non-null object \n", - " 12 sqft_above 21597 non-null int64 \n", - " 13 sqft_basement 21597 non-null object \n", - " 14 yr_built 21597 non-null int64 \n", - " 15 yr_renovated 17755 non-null float64\n", - " 16 zipcode 21597 non-null int64 \n", - " 17 
lat 21597 non-null float64\n", - " 18 long 21597 non-null float64\n", - " 19 sqft_living15 21597 non-null int64 \n", - " 20 sqft_lot15 21597 non-null int64 \n", - "dtypes: float64(6), int64(9), object(6)\n", - "memory usage: 3.5+ MB\n" - ] - } - ], - "source": [ - "kings_data.info()" - ] - }, - { - "cell_type": "code", - "execution_count": 2, - "metadata": {}, - "outputs": [], - "source": [ - "def identify_issues(dataset):\n", - " # Identify missing values as a percentage of the whole dataset\n", - " missing_values = (dataset.isnull().sum())/len(dataset) * 100\n", - "\n", - " # Identify duplicates\n", - " duplicates = dataset.duplicated().sum()\n", - " \n", - " #return a dictionary \n", - " return {'duplicates': duplicates,\n", - " 'missing values': missing_values.round(2)} \n" - ] - }, - { - "cell_type": "code", - "execution_count": 3, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'duplicates': 0,\n", - " 'missing values': {\n", - " 'id': 0.0,\n", - " 'date': 0.0,\n", - " 'price': 0.0,\n", - " 'bedrooms': 0.0,\n", - " 'bathrooms': 0.0,\n", - " 'sqft_living': 0.0,\n", - " 'sqft_lot': 0.0,\n", - " 'floors': 0.0,\n", - " 'waterfront': 11.0,\n", - " 'view': 0.29,\n", - " 'condition': 0.0,\n", - " 'grade': 0.0,\n", - " 'sqft_above': 0.0,\n", - " 'sqft_basement': 0.0,\n", - " 'yr_built': 0.0,\n", - " 'yr_renovated': 17.79,\n", - " 'zipcode': 0.0,\n", - " 'lat': 0.0,\n", - " 'long': 0.0,\n", - " 'sqft_living15': 0.0,\n", - " 'sqft_lot15': 0.0\n", - " }}" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "identify_issues(kings_data)" - ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [], - "source": [ - "house_data_clean = kings_data.copy()" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "# Changing the date to date time\n", - "house_data_clean['date'] = 
pd.to_datetime(house_data_clean['date'])\n", - "\n", - "# Extracting only the year from the column Date\n", - "house_data_clean.date = house_data_clean['date'].dt.year\n", - "\n", - "# Changing the dates for the year built \n", - "house_data_clean['yr_built'] = pd.to_datetime(house_data_clean['yr_built'],format='%Y').dt.year\n" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [], - "source": [ - "def missing_values(dataset):\n", - " # drop the rows from views\n", - " dataset.dropna(subset=['view'],inplace=True)\n", - "\n", - " # Filling the NaN values for waterfront with NO\n", - " dataset.waterfront.fillna('NO',inplace=True)\n", - " \n", - " # Dropping the yr_renovated column \n", - " dataset.drop('yr_renovated',axis=1,inplace=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [], - "source": [ - "missing_values(house_data_clean)" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'duplicates': 0,\n", - " 'missing values': id 0.00\n", - " date 0.00\n", - " price 0.00\n", - " bedrooms 0.00\n", - " bathrooms 0.00\n", - " sqft_living 0.00\n", - " sqft_lot 0.00\n", - " floors 0.00\n", - " waterfront 0.00\n", - " view 0.00\n", - " condition 0.00\n", - " grade 0.00\n", - " sqft_above 0.00\n", - " sqft_basement 0.00\n", - " yr_built 0.00\n", - " zipcode 0.00\n", - " lat 0.00\n", - " long 0.00\n", - " sqft_living15 0.00\n", - " sqft_lot15 0.00\n", - " dtype: float64}" - ] - }, - "execution_count": 8, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "identify_issues(house_data_clean)" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [], - "source": [ - "# Checking for duplicates in the dataset\n", - "house_data_clean[house_data_clean.duplicated()]" - ] + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The dataset contains 21 
columns, each representing a distinct feature, and 21,597 rows, with each row corresponding to a specific house sale entry.\n", + "\n", + "The dataset contains a mix of data types, including integers (int64), floating-point numbers (float64), and objects (strings). For instance, numerical features such as bedrooms, bathrooms, and sqft_living are represented as integers or floating-point numbers to facilitate mathematical computations, while categorical features like waterfront and view are stored as objects to accommodate text-based categories." + ] }, { -"cells": [], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.8.11" - } - }, - "nbformat": 4, - "nbformat_minor": 5 -}, \ No newline at end of file + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The dataset contains 21597 houses with 21 features\n", + "\n", + "Columns and their data types:\n", + "id: int64\n", + "date: object\n", + "price: float64\n", + "bedrooms: int64\n", + "bathrooms: float64\n", + "sqft_living: int64\n", + "sqft_lot: int64\n", + "floors: float64\n", + "waterfront: object\n", + "view: object\n", + "condition: object\n", + "grade: object\n", + "sqft_above: int64\n", + "sqft_basement: object\n", + "yr_built: int64\n", + "yr_renovated: float64\n", + "zipcode: int64\n", + "lat: float64\n", + "long: float64\n", + "sqft_living15: int64\n", + "sqft_lot15: int64\n", + "\n" + ] + } + ], + "source": [ + "kings_data = load_data('data/kc_house_data.csv')" + ] +}, +{ + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "#create a function that takes in a column 
and returns the column statistics as a dictionary\n", + "def descriptive_analytics(column):\n", + " stats_dict = column.describe().to_dict()\n", + " \n", + " print(\"Descriptive Statistics for Column '{}':\".format(column.name))\n", + " print(\"The count of the column is:\", stats_dict['count'])\n", + " print(\"The mean of the column is:\", stats_dict['mean'])\n", + " print(\"The standard deviation of the column is:\", stats_dict['std'])\n", + " print(\"The minimum value of the column is:\", stats_dict['min'])\n", + " print(\"The 25th percentile of the column is:\", stats_dict['25%'])\n", + " print(\"The median of the column is:\", stats_dict['50%'])\n", + " print(\"The 75th percentile of the column is:\", stats_dict['75%'])\n", + " print(\"The maximum value of the column is:\", stats_dict['max'])" + ] +}, +{ + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Descriptive Statistics for Column 'price':\n", + "The count of the column is: 21597.0\n", + "The mean of the column is: 540296.5735055795\n", + "The standard deviation of the column is: 367368.1401013936\n", + "The minimum value of the column is: 78000.0\n", + "The 25th percentile of the column is: 322000.0\n", + "The median of the column is: 450000.0\n", + "The 75th percentile of the column is: 645000.0\n", + "The maximum value of the column is: 7700000.0\n" + ] + } + ], + "source": [ + "descriptive_analytics(kings_data['price'])" + ] +}, +{ + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see that the maximum price of a house is 7700000 dollars and the minimum price is 78000 dollars\n", + "\n", + "There are 21597 prices regarding to the houses in the dataset\n", + "\n", + "Average price of a house is 540296.57 dollars" + ] +}, +{ + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Preperation\n", + "\n" + ] +}, +{ + "cell_type": "code", + "execution_count": 7, + "metadata": {}, 
+ "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 21597 entries, 0 to 21596\n", + "Data columns (total 21 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 id 21597 non-null int64 \n", + " 1 date 21597 non-null object \n", + " 2 price 21597 non-null float64\n", + " 3 bedrooms 21597 non-null int64 \n", + " 4 bathrooms 21597 non-null float64\n", + " 5 sqft_living 21597 non-null int64 \n", + " 6 sqft_lot 21597 non-null int64 \n", + " 7 floors 21597 non-null float64\n", + " 8 waterfront 19221 non-null object \n", + " 9 view 21534 non-null object \n", + " 10 condition 21597 non-null object \n", + " 11 grade 21597 non-null object \n", + " 12 sqft_above 21597 non-null int64 \n", + " 13 sqft_basement 21597 non-null object \n", + " 14 yr_built 21597 non-null int64 \n", + " 15 yr_renovated 17755 non-null float64\n", + " 16 zipcode 21597 non-null int64 \n", + " 17 lat 21597 non-null float64\n", + " 18 long 21597 non-null float64\n", + " 19 sqft_living15 21597 non-null int64 \n", + " 20 sqft_lot15 21597 non-null int64 \n", + "dtypes: float64(6), int64(9), object(6)\n", + "memory usage: 3.5+ MB\n" + ] + } + ], + "source": [ + "kings_data.info()" + ] +}, +{ + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "def identify_issues(dataset):\n", + " # Identify missing values as a percentage of the whole dataset\n", + " missing_values = (dataset.isnull().sum())/len(dataset) * 100\n", + "\n", + " # Identify duplicates\n", + " duplicates = dataset.duplicated().sum()\n", + " \n", + " #return a dictionary \n", + " return {'duplicates': duplicates,\n", + " 'missing values': missing_values.round(2)} \n" + ] +}, +{ + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'duplicates': 0,\n", + " 'missing values': id 0.00\n", + " date 0.00\n", + " price 0.00\n", + " bedrooms 0.00\n", + " 
bathrooms 0.00\n", + " sqft_living 0.00\n", + " sqft_lot 0.00\n", + " floors 0.00\n", + " waterfront 11.00\n", + " view 0.29\n", + " condition 0.00\n", + " grade 0.00\n", + " sqft_above 0.00\n", + " sqft_basement 0.00\n", + " yr_built 0.00\n", + " yr_renovated 17.79\n", + " zipcode 0.00\n", + " lat 0.00\n", + " long 0.00\n", + " sqft_living15 0.00\n", + " sqft_lot15 0.00\n", + " dtype: float64}" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "identify_issues(kings_data)" + ] +}, +{ + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The examination indicates that there are no duplicate entries within the dataset, ensuring the integrity of the records. However, attention is warranted to address missing values present in certain columns. Specifically, the 'waterfront' feature exhibits 11% of null values, representing a negligible portion of the dataset. Similarly, the 'yr_renovated' feature shows a relatively higher percentage of missing values, accounting for approximately 17.79% of the dataset" + ] +}, +{ + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Before making changes make a copy instead of overwriting data" + ] +}, +{ + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "house_data_clean = kings_data.copy()" + ] +}, +{ + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# Changing the date to date time\n", + "house_data_clean['date'] = pd.to_datetime(house_data_clean['date'])\n", + "\n", + "# Extracting only the year from the column Date\n", + "house_data_clean.date = house_data_clean['date'].dt.year\n", + "\n", + "# Changing the dates for the year built \n", + "house_data_clean['yr_built'] = pd.to_datetime(house_data_clean['yr_built'],format='%Y').dt.year\n" + ] +}, +{ + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The above code converts the 'date' column 
data to only contain the year the house was sold, for the purpose of analysis we will use only the year since the changes month by month will be minor." + ] +}, +{ + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Dealing with the missing values" + ] +}, +{ + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "def missing_values(dataset):\n", + " # drop the rows from views\n", + " dataset.dropna(subset=['view'],inplace=True)\n", + "\n", + " # Filling the NaN values for waterfront with NO\n", + " dataset.waterfront.fillna('NO',inplace=True)\n", + " \n", + " # Dropping the yr_renovated column \n", + " dataset.drop('yr_renovated',axis=1,inplace=True)" + ] +}, +{ + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "missing_values(house_data_clean)" + ] +}, +{ + "cell_type": "markdown", + "metadata": {}, + "source": [ + "'yr_renovated' has the highest percentage of NaN values 17%. This will be dropped since it will not be used within our model inline with the business problem.\n", + "\n", + "'Waterfront' feature has 11% null values, this was filled with NO on the assumption that these cells were not filled since they lacked waterfronts\n", + "\n", + "For the 'View' column, the null values were dropped by row since the overall percentage impact would be minute" + ] +}, +{ + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'duplicates': 2,\n", + " 'missing values': id 0.0\n", + " date 0.0\n", + " price 0.0\n", + " bedrooms 0.0\n", + " bathrooms 0.0\n", + " sqft_living 0.0\n", + " sqft_lot 0.0\n", + " floors 0.0\n", + " waterfront 0.0\n", + " view 0.0\n", + " condition 0.0\n", + " grade 0.0\n", + " sqft_above 0.0\n", + " sqft_basement 0.0\n", + " yr_built 0.0\n", + " zipcode 0.0\n", + " lat 0.0\n", + " long 0.0\n", + " sqft_living15 0.0\n", + " sqft_lot15 0.0\n", + " dtype: float64}" + ] + }, + 
"execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "identify_issues(house_data_clean)" + ] +}, +{ + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddatepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontviewconditiongradesqft_abovesqft_basementyr_builtzipcodelatlongsqft_living15sqft_lot15
394718250690312014550000.041.75241084472.0NOGOODGood8 Good2060350.019369807447.6499-122.088252014789
2003886489001102014555000.032.50194032112.0NONONEAverage8 Good19400.020099802747.5644-122.09318803078
\n", + "
" + ], + "text/plain": [ + " id date price bedrooms bathrooms sqft_living sqft_lot \\\n", + "3947 1825069031 2014 550000.0 4 1.75 2410 8447 \n", + "20038 8648900110 2014 555000.0 3 2.50 1940 3211 \n", + "\n", + " floors waterfront view condition grade sqft_above sqft_basement \\\n", + "3947 2.0 NO GOOD Good 8 Good 2060 350.0 \n", + "20038 2.0 NO NONE Average 8 Good 1940 0.0 \n", + "\n", + " yr_built zipcode lat long sqft_living15 sqft_lot15 \n", + "3947 1936 98074 47.6499 -122.088 2520 14789 \n", + "20038 2009 98027 47.5644 -122.093 1880 3078 " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "house_data_clean[house_data_clean.duplicated()]" + ] +} +], +"metadata": { +"kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" +}, +"language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" +} +}, +"nbformat": 4, +"nbformat_minor": 2 +} \ No newline at end of file From 100b23763477054240c5bc616607ac3214b4aece Mon Sep 17 00:00:00 2001 From: clydeochieng <107258512+clydeochieng@users.noreply.github.com> Date: Wed, 1 May 2024 11:55:26 +0300 Subject: [PATCH 39/42] Update student.ipynb --- student.ipynb | 1126 ++++++++++++++++++++++++------------------------- 1 file changed, 563 insertions(+), 563 deletions(-) diff --git a/student.ipynb b/student.ipynb index 3789754d..93177751 100644 --- a/student.ipynb +++ b/student.ipynb @@ -6,7 +6,7 @@ "source": [ "## Final Project Submission\n", "\n", - "* Student name: Solphine Joseph, Grace Rotich, Mathew Kiprotich, Hilary Simiyu, Clyde Ochieng. \n", + "* Student name: Solphine Joseph, Grace Rotich, Mathew Kiprotich, Hilary Simiyu, Clyde Ochieng, Derrick Kiptoo. 
\n", "* Student pace: full time\n", "* Scheduled project review date/time: \n", "* Instructor name: Nikita \n", @@ -467,566 +467,566 @@ "\n", "

21597 rows × 21 columns

\n", "" - ], - "text/plain": [ - " id date price bedrooms bathrooms sqft_living \\\n", - "0 7129300520 10/13/2014 221900.0 3 1.00 1180 \n", - "1 6414100192 12/9/2014 538000.0 3 2.25 2570 \n", - "2 5631500400 2/25/2015 180000.0 2 1.00 770 \n", - "3 2487200875 12/9/2014 604000.0 4 3.00 1960 \n", - "4 1954400510 2/18/2015 510000.0 3 2.00 1680 \n", - "... ... ... ... ... ... ... \n", - "21592 263000018 5/21/2014 360000.0 3 2.50 1530 \n", - "21593 6600060120 2/23/2015 400000.0 4 2.50 2310 \n", - "21594 1523300141 6/23/2014 402101.0 2 0.75 1020 \n", - "21595 291310100 1/16/2015 400000.0 3 2.50 1600 \n", - "21596 1523300157 10/15/2014 325000.0 2 0.75 1020 \n", - "\n", - " sqft_lot floors waterfront view ... grade sqft_above \\\n", - "0 5650 1.0 NaN NONE ... 7 Average 1180 \n", - "1 7242 2.0 NO NONE ... 7 Average 2170 \n", - "2 10000 1.0 NO NONE ... 6 Low Average 770 \n", - "3 5000 1.0 NO NONE ... 7 Average 1050 \n", - "4 8080 1.0 NO NONE ... 8 Good 1680 \n", - "... ... ... ... ... ... ... ... \n", - "21592 1131 3.0 NO NONE ... 8 Good 1530 \n", - "21593 5813 2.0 NO NONE ... 8 Good 2310 \n", - "21594 1350 2.0 NO NONE ... 7 Average 1020 \n", - "21595 2388 2.0 NaN NONE ... 8 Good 1600 \n", - "21596 1076 2.0 NO NONE ... 7 Average 1020 \n", - "\n", - " sqft_basement yr_built yr_renovated zipcode lat long \\\n", - "0 0.0 1955 0.0 98178 47.5112 -122.257 \n", - "1 400.0 1951 1991.0 98125 47.7210 -122.319 \n", - "2 0.0 1933 NaN 98028 47.7379 -122.233 \n", - "3 910.0 1965 0.0 98136 47.5208 -122.393 \n", - "4 0.0 1987 0.0 98074 47.6168 -122.045 \n", - "... ... ... ... ... ... ... \n", - "21592 0.0 2009 0.0 98103 47.6993 -122.346 \n", - "21593 0.0 2014 0.0 98146 47.5107 -122.362 \n", - "21594 0.0 2009 0.0 98144 47.5944 -122.299 \n", - "21595 0.0 2004 0.0 98027 47.5345 -122.069 \n", - "21596 0.0 2008 0.0 98144 47.5941 -122.299 \n", - "\n", - " sqft_living15 sqft_lot15 \n", - "0 1340 5650 \n", - "1 1690 7639 \n", - "2 2720 8062 \n", - "3 1360 5000 \n", - "4 1800 7503 \n", - "... ... ... 
\n", - "21592 1530 1509 \n", - "21593 1830 7200 \n", - "21594 1020 2007 \n", - "21595 1410 1287 \n", - "21596 1020 1357 \n", - "\n", - "[21597 rows x 21 columns]" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "load_data('data/kc_house_data.csv') # Assuming 'data' folder is in the same directory\n", - "\n", - "\n" - ] -}, -{ - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The dataset contains 21 columns, each representing a distinct feature, and 21,597 rows, with each row corresponding to a specific house sale entry.\n", - "\n", - "The dataset contains a mix of data types, including integers (int64), floating-point numbers (float64), and objects (strings). For instance, numerical features such as bedrooms, bathrooms, and sqft_living are represented as integers or floating-point numbers to facilitate mathematical computations, while categorical features like waterfront and view are stored as objects to accommodate text-based categories." 
- ] -}, -{ - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The dataset contains 21597 houses with 21 features\n", - "\n", - "Columns and their data types:\n", - "id: int64\n", - "date: object\n", - "price: float64\n", - "bedrooms: int64\n", - "bathrooms: float64\n", - "sqft_living: int64\n", - "sqft_lot: int64\n", - "floors: float64\n", - "waterfront: object\n", - "view: object\n", - "condition: object\n", - "grade: object\n", - "sqft_above: int64\n", - "sqft_basement: object\n", - "yr_built: int64\n", - "yr_renovated: float64\n", - "zipcode: int64\n", - "lat: float64\n", - "long: float64\n", - "sqft_living15: int64\n", - "sqft_lot15: int64\n", - "\n" - ] - } - ], - "source": [ - "kings_data = load_data('data/kc_house_data.csv')" - ] -}, -{ - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "#create a function that takes in a column and returns the column statistics as a dictionary\n", - "def descriptive_analytics(column):\n", - " stats_dict = column.describe().to_dict()\n", - " \n", - " print(\"Descriptive Statistics for Column '{}':\".format(column.name))\n", - " print(\"The count of the column is:\", stats_dict['count'])\n", - " print(\"The mean of the column is:\", stats_dict['mean'])\n", - " print(\"The standard deviation of the column is:\", stats_dict['std'])\n", - " print(\"The minimum value of the column is:\", stats_dict['min'])\n", - " print(\"The 25th percentile of the column is:\", stats_dict['25%'])\n", - " print(\"The median of the column is:\", stats_dict['50%'])\n", - " print(\"The 75th percentile of the column is:\", stats_dict['75%'])\n", - " print(\"The maximum value of the column is:\", stats_dict['max'])" - ] -}, -{ - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Descriptive Statistics for Column 
'price':\n", - "The count of the column is: 21597.0\n", - "The mean of the column is: 540296.5735055795\n", - "The standard deviation of the column is: 367368.1401013936\n", - "The minimum value of the column is: 78000.0\n", - "The 25th percentile of the column is: 322000.0\n", - "The median of the column is: 450000.0\n", - "The 75th percentile of the column is: 645000.0\n", - "The maximum value of the column is: 7700000.0\n" - ] - } - ], - "source": [ - "descriptive_analytics(kings_data['price'])" - ] -}, -{ - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can see that the maximum price of a house is 7700000 dollars and the minimum price is 78000 dollars\n", - "\n", - "There are 21597 prices regarding to the houses in the dataset\n", - "\n", - "Average price of a house is 540296.57 dollars" - ] -}, -{ - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Data Preperation\n", - "\n" - ] -}, -{ - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "RangeIndex: 21597 entries, 0 to 21596\n", - "Data columns (total 21 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 id 21597 non-null int64 \n", - " 1 date 21597 non-null object \n", - " 2 price 21597 non-null float64\n", - " 3 bedrooms 21597 non-null int64 \n", - " 4 bathrooms 21597 non-null float64\n", - " 5 sqft_living 21597 non-null int64 \n", - " 6 sqft_lot 21597 non-null int64 \n", - " 7 floors 21597 non-null float64\n", - " 8 waterfront 19221 non-null object \n", - " 9 view 21534 non-null object \n", - " 10 condition 21597 non-null object \n", - " 11 grade 21597 non-null object \n", - " 12 sqft_above 21597 non-null int64 \n", - " 13 sqft_basement 21597 non-null object \n", - " 14 yr_built 21597 non-null int64 \n", - " 15 yr_renovated 17755 non-null float64\n", - " 16 zipcode 21597 non-null int64 \n", - " 17 lat 21597 non-null float64\n", - " 
18 long 21597 non-null float64\n", - " 19 sqft_living15 21597 non-null int64 \n", - " 20 sqft_lot15 21597 non-null int64 \n", - "dtypes: float64(6), int64(9), object(6)\n", - "memory usage: 3.5+ MB\n" - ] - } - ], - "source": [ - "kings_data.info()" - ] -}, -{ - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "def identify_issues(dataset):\n", - " # Identify missing values as a percentage of the whole dataset\n", - " missing_values = (dataset.isnull().sum())/len(dataset) * 100\n", - "\n", - " # Identify duplicates\n", - " duplicates = dataset.duplicated().sum()\n", - " \n", - " #return a dictionary \n", - " return {'duplicates': duplicates,\n", - " 'missing values': missing_values.round(2)} \n" - ] -}, -{ - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'duplicates': 0,\n", - " 'missing values': id 0.00\n", - " date 0.00\n", - " price 0.00\n", - " bedrooms 0.00\n", - " bathrooms 0.00\n", - " sqft_living 0.00\n", - " sqft_lot 0.00\n", - " floors 0.00\n", - " waterfront 11.00\n", - " view 0.29\n", - " condition 0.00\n", - " grade 0.00\n", - " sqft_above 0.00\n", - " sqft_basement 0.00\n", - " yr_built 0.00\n", - " yr_renovated 17.79\n", - " zipcode 0.00\n", - " lat 0.00\n", - " long 0.00\n", - " sqft_living15 0.00\n", - " sqft_lot15 0.00\n", - " dtype: float64}" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "identify_issues(kings_data)" - ] -}, -{ - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The examination indicates that there are no duplicate entries within the dataset, ensuring the integrity of the records. However, attention is warranted to address missing values present in certain columns. Specifically, the 'waterfront' feature exhibits 11% of null values, representing a negligible portion of the dataset. 
Similarly, the 'yr_renovated' feature shows a relatively higher percentage of missing values, accounting for approximately 17.79% of the dataset" - ] -}, -{ - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Before making changes make a copy instead of overwriting data" - ] -}, -{ - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "house_data_clean = kings_data.copy()" - ] -}, -{ - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "# Changing the date to date time\n", - "house_data_clean['date'] = pd.to_datetime(house_data_clean['date'])\n", - "\n", - "# Extracting only the year from the column Date\n", - "house_data_clean.date = house_data_clean['date'].dt.year\n", - "\n", - "# Changing the dates for the year built \n", - "house_data_clean['yr_built'] = pd.to_datetime(house_data_clean['yr_built'],format='%Y').dt.year\n" - ] -}, -{ - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The above code converts the 'date' column data to only contain the year the house was sold, for the purpose of analysis we will use only the year since the changes month by month will be minor." 
- ] -}, -{ - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Dealing with the missing values" - ] -}, -{ - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "def missing_values(dataset):\n", - " # drop the rows from views\n", - " dataset.dropna(subset=['view'],inplace=True)\n", - "\n", - " # Filling the NaN values for waterfront with NO\n", - " dataset.waterfront.fillna('NO',inplace=True)\n", - " \n", - " # Dropping the yr_renovated column \n", - " dataset.drop('yr_renovated',axis=1,inplace=True)" - ] -}, -{ - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "missing_values(house_data_clean)" - ] -}, -{ - "cell_type": "markdown", - "metadata": {}, - "source": [ - "'yr_renovated' has the highest percentage of NaN values 17%. This will be dropped since it will not be used within our model inline with the business problem.\n", - "\n", - "'Waterfront' feature has 11% null values, this was filled with NO on the assumption that these cells were not filled since they lacked waterfronts\n", - "\n", - "For the 'View' column, the null values were dropped by row since the overall percentage impact would be minute" - ] -}, -{ - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'duplicates': 2,\n", - " 'missing values': id 0.0\n", - " date 0.0\n", - " price 0.0\n", - " bedrooms 0.0\n", - " bathrooms 0.0\n", - " sqft_living 0.0\n", - " sqft_lot 0.0\n", - " floors 0.0\n", - " waterfront 0.0\n", - " view 0.0\n", - " condition 0.0\n", - " grade 0.0\n", - " sqft_above 0.0\n", - " sqft_basement 0.0\n", - " yr_built 0.0\n", - " zipcode 0.0\n", - " lat 0.0\n", - " long 0.0\n", - " sqft_living15 0.0\n", - " sqft_lot15 0.0\n", - " dtype: float64}" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "identify_issues(house_data_clean)" - ] -}, -{ - 
"cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
iddatepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontviewconditiongradesqft_abovesqft_basementyr_builtzipcodelatlongsqft_living15sqft_lot15
394718250690312014550000.041.75241084472.0NOGOODGood8 Good2060350.019369807447.6499-122.088252014789
2003886489001102014555000.032.50194032112.0NONONEAverage8 Good19400.020099802747.5644-122.09318803078
\n", - "
" - ], - "text/plain": [ - " id date price bedrooms bathrooms sqft_living sqft_lot \\\n", - "3947 1825069031 2014 550000.0 4 1.75 2410 8447 \n", - "20038 8648900110 2014 555000.0 3 2.50 1940 3211 \n", - "\n", - " floors waterfront view condition grade sqft_above sqft_basement \\\n", - "3947 2.0 NO GOOD Good 8 Good 2060 350.0 \n", - "20038 2.0 NO NONE Average 8 Good 1940 0.0 \n", - "\n", - " yr_built zipcode lat long sqft_living15 sqft_lot15 \n", - "3947 1936 98074 47.6499 -122.088 2520 14789 \n", - "20038 2009 98027 47.5644 -122.093 1880 3078 " - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "house_data_clean[house_data_clean.duplicated()]" - ] -} -], -"metadata": { -"kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" -}, -"language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" + ], + "text/plain": [ + " id date price bedrooms bathrooms sqft_living \\\n", + "0 7129300520 10/13/2014 221900.0 3 1.00 1180 \n", + "1 6414100192 12/9/2014 538000.0 3 2.25 2570 \n", + "2 5631500400 2/25/2015 180000.0 2 1.00 770 \n", + "3 2487200875 12/9/2014 604000.0 4 3.00 1960 \n", + "4 1954400510 2/18/2015 510000.0 3 2.00 1680 \n", + "... ... ... ... ... ... ... \n", + "21592 263000018 5/21/2014 360000.0 3 2.50 1530 \n", + "21593 6600060120 2/23/2015 400000.0 4 2.50 2310 \n", + "21594 1523300141 6/23/2014 402101.0 2 0.75 1020 \n", + "21595 291310100 1/16/2015 400000.0 3 2.50 1600 \n", + "21596 1523300157 10/15/2014 325000.0 2 0.75 1020 \n", + "\n", + " sqft_lot floors waterfront view ... grade sqft_above \\\n", + "0 5650 1.0 NaN NONE ... 7 Average 1180 \n", + "1 7242 2.0 NO NONE ... 7 Average 2170 \n", + "2 10000 1.0 NO NONE ... 6 Low Average 770 \n", + "3 5000 1.0 NO NONE ... 
7 Average 1050 \n", + "4 8080 1.0 NO NONE ... 8 Good 1680 \n", + "... ... ... ... ... ... ... ... \n", + "21592 1131 3.0 NO NONE ... 8 Good 1530 \n", + "21593 5813 2.0 NO NONE ... 8 Good 2310 \n", + "21594 1350 2.0 NO NONE ... 7 Average 1020 \n", + "21595 2388 2.0 NaN NONE ... 8 Good 1600 \n", + "21596 1076 2.0 NO NONE ... 7 Average 1020 \n", + "\n", + " sqft_basement yr_built yr_renovated zipcode lat long \\\n", + "0 0.0 1955 0.0 98178 47.5112 -122.257 \n", + "1 400.0 1951 1991.0 98125 47.7210 -122.319 \n", + "2 0.0 1933 NaN 98028 47.7379 -122.233 \n", + "3 910.0 1965 0.0 98136 47.5208 -122.393 \n", + "4 0.0 1987 0.0 98074 47.6168 -122.045 \n", + "... ... ... ... ... ... ... \n", + "21592 0.0 2009 0.0 98103 47.6993 -122.346 \n", + "21593 0.0 2014 0.0 98146 47.5107 -122.362 \n", + "21594 0.0 2009 0.0 98144 47.5944 -122.299 \n", + "21595 0.0 2004 0.0 98027 47.5345 -122.069 \n", + "21596 0.0 2008 0.0 98144 47.5941 -122.299 \n", + "\n", + " sqft_living15 sqft_lot15 \n", + "0 1340 5650 \n", + "1 1690 7639 \n", + "2 2720 8062 \n", + "3 1360 5000 \n", + "4 1800 7503 \n", + "... ... ... \n", + "21592 1530 1509 \n", + "21593 1830 7200 \n", + "21594 1020 2007 \n", + "21595 1410 1287 \n", + "21596 1020 1357 \n", + "\n", + "[21597 rows x 21 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "load_data('data/kc_house_data.csv') # Assuming 'data' folder is in the same directory\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The dataset contains 21 columns, each representing a distinct feature, and 21,597 rows, with each row corresponding to a specific house sale entry.\n", + "\n", + "The dataset contains a mix of data types, including integers (int64), floating-point numbers (float64), and objects (strings). 
For instance, numerical features such as bedrooms, bathrooms, and sqft_living are represented as integers or floating-point numbers to facilitate mathematical computations, while categorical features like waterfront and view are stored as objects to accommodate text-based categories." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The dataset contains 21597 houses with 21 features\n", + "\n", + "Columns and their data types:\n", + "id: int64\n", + "date: object\n", + "price: float64\n", + "bedrooms: int64\n", + "bathrooms: float64\n", + "sqft_living: int64\n", + "sqft_lot: int64\n", + "floors: float64\n", + "waterfront: object\n", + "view: object\n", + "condition: object\n", + "grade: object\n", + "sqft_above: int64\n", + "sqft_basement: object\n", + "yr_built: int64\n", + "yr_renovated: float64\n", + "zipcode: int64\n", + "lat: float64\n", + "long: float64\n", + "sqft_living15: int64\n", + "sqft_lot15: int64\n", + "\n" + ] + } + ], + "source": [ + "kings_data = load_data('data/kc_house_data.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "#create a function that takes in a column and returns the column statistics as a dictionary\n", + "def descriptive_analytics(column):\n", + " stats_dict = column.describe().to_dict()\n", + " \n", + " print(\"Descriptive Statistics for Column '{}':\".format(column.name))\n", + " print(\"The count of the column is:\", stats_dict['count'])\n", + " print(\"The mean of the column is:\", stats_dict['mean'])\n", + " print(\"The standard deviation of the column is:\", stats_dict['std'])\n", + " print(\"The minimum value of the column is:\", stats_dict['min'])\n", + " print(\"The 25th percentile of the column is:\", stats_dict['25%'])\n", + " print(\"The median of the column is:\", stats_dict['50%'])\n", + " print(\"The 75th percentile of the column is:\", 
stats_dict['75%'])\n", + " print(\"The maximum value of the column is:\", stats_dict['max'])" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Descriptive Statistics for Column 'price':\n", + "The count of the column is: 21597.0\n", + "The mean of the column is: 540296.5735055795\n", + "The standard deviation of the column is: 367368.1401013936\n", + "The minimum value of the column is: 78000.0\n", + "The 25th percentile of the column is: 322000.0\n", + "The median of the column is: 450000.0\n", + "The 75th percentile of the column is: 645000.0\n", + "The maximum value of the column is: 7700000.0\n" + ] + } + ], + "source": [ + "descriptive_analytics(kings_data['price'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see that the maximum price of a house is 7700000 dollars and the minimum price is 78000 dollars\n", + "\n", + "There are 21597 prices regarding to the houses in the dataset\n", + "\n", + "Average price of a house is 540296.57 dollars" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Preperation\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 21597 entries, 0 to 21596\n", + "Data columns (total 21 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 id 21597 non-null int64 \n", + " 1 date 21597 non-null object \n", + " 2 price 21597 non-null float64\n", + " 3 bedrooms 21597 non-null int64 \n", + " 4 bathrooms 21597 non-null float64\n", + " 5 sqft_living 21597 non-null int64 \n", + " 6 sqft_lot 21597 non-null int64 \n", + " 7 floors 21597 non-null float64\n", + " 8 waterfront 19221 non-null object \n", + " 9 view 21534 non-null object \n", + " 10 condition 21597 non-null object \n", + " 11 
grade 21597 non-null object \n", + " 12 sqft_above 21597 non-null int64 \n", + " 13 sqft_basement 21597 non-null object \n", + " 14 yr_built 21597 non-null int64 \n", + " 15 yr_renovated 17755 non-null float64\n", + " 16 zipcode 21597 non-null int64 \n", + " 17 lat 21597 non-null float64\n", + " 18 long 21597 non-null float64\n", + " 19 sqft_living15 21597 non-null int64 \n", + " 20 sqft_lot15 21597 non-null int64 \n", + "dtypes: float64(6), int64(9), object(6)\n", + "memory usage: 3.5+ MB\n" + ] + } + ], + "source": [ + "kings_data.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "def identify_issues(dataset):\n", + " # Identify missing values as a percentage of the whole dataset\n", + " missing_values = (dataset.isnull().sum())/len(dataset) * 100\n", + "\n", + " # Identify duplicates\n", + " duplicates = dataset.duplicated().sum()\n", + " \n", + " #return a dictionary \n", + " return {'duplicates': duplicates,\n", + " 'missing values': missing_values.round(2)} \n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'duplicates': 0,\n", + " 'missing values': id 0.00\n", + " date 0.00\n", + " price 0.00\n", + " bedrooms 0.00\n", + " bathrooms 0.00\n", + " sqft_living 0.00\n", + " sqft_lot 0.00\n", + " floors 0.00\n", + " waterfront 11.00\n", + " view 0.29\n", + " condition 0.00\n", + " grade 0.00\n", + " sqft_above 0.00\n", + " sqft_basement 0.00\n", + " yr_built 0.00\n", + " yr_renovated 17.79\n", + " zipcode 0.00\n", + " lat 0.00\n", + " long 0.00\n", + " sqft_living15 0.00\n", + " sqft_lot15 0.00\n", + " dtype: float64}" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "identify_issues(kings_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The examination indicates that there are no duplicate entries within the dataset, 
ensuring the integrity of the records. However, attention is warranted to address missing values present in certain columns. Specifically, the 'waterfront' feature exhibits 11% of null values, representing a negligible portion of the dataset. Similarly, the 'yr_renovated' feature shows a relatively higher percentage of missing values, accounting for approximately 17.79% of the dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Before making changes make a copy instead of overwriting data" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "house_data_clean = kings_data.copy()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# Changing the date to date time\n", + "house_data_clean['date'] = pd.to_datetime(house_data_clean['date'])\n", + "\n", + "# Extracting only the year from the column Date\n", + "house_data_clean.date = house_data_clean['date'].dt.year\n", + "\n", + "# Changing the dates for the year built \n", + "house_data_clean['yr_built'] = pd.to_datetime(house_data_clean['yr_built'],format='%Y').dt.year\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The above code converts the 'date' column data to only contain the year the house was sold, for the purpose of analysis we will use only the year since the changes month by month will be minor." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Dealing with the missing values" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "def missing_values(dataset):\n", + " # drop the rows from views\n", + " dataset.dropna(subset=['view'],inplace=True)\n", + "\n", + " # Filling the NaN values for waterfront with NO\n", + " dataset.waterfront.fillna('NO',inplace=True)\n", + " \n", + " # Dropping the yr_renovated column \n", + " dataset.drop('yr_renovated',axis=1,inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "missing_values(house_data_clean)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "'yr_renovated' has the highest percentage of NaN values 17%. This will be dropped since it will not be used within our model inline with the business problem.\n", + "\n", + "'Waterfront' feature has 11% null values, this was filled with NO on the assumption that these cells were not filled since they lacked waterfronts\n", + "\n", + "For the 'View' column, the null values were dropped by row since the overall percentage impact would be minute" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'duplicates': 2,\n", + " 'missing values': id 0.0\n", + " date 0.0\n", + " price 0.0\n", + " bedrooms 0.0\n", + " bathrooms 0.0\n", + " sqft_living 0.0\n", + " sqft_lot 0.0\n", + " floors 0.0\n", + " waterfront 0.0\n", + " view 0.0\n", + " condition 0.0\n", + " grade 0.0\n", + " sqft_above 0.0\n", + " sqft_basement 0.0\n", + " yr_built 0.0\n", + " zipcode 0.0\n", + " lat 0.0\n", + " long 0.0\n", + " sqft_living15 0.0\n", + " sqft_lot15 0.0\n", + " dtype: float64}" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "identify_issues(house_data_clean)" + ] + }, + 
{ + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddatepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontviewconditiongradesqft_abovesqft_basementyr_builtzipcodelatlongsqft_living15sqft_lot15
394718250690312014550000.041.75241084472.0NOGOODGood8 Good2060350.019369807447.6499-122.088252014789
2003886489001102014555000.032.50194032112.0NONONEAverage8 Good19400.020099802747.5644-122.09318803078
\n", + "
" + ], + "text/plain": [ + " id date price bedrooms bathrooms sqft_living sqft_lot \\\n", + "3947 1825069031 2014 550000.0 4 1.75 2410 8447 \n", + "20038 8648900110 2014 555000.0 3 2.50 1940 3211 \n", + "\n", + " floors waterfront view condition grade sqft_above sqft_basement \\\n", + "3947 2.0 NO GOOD Good 8 Good 2060 350.0 \n", + "20038 2.0 NO NONE Average 8 Good 1940 0.0 \n", + "\n", + " yr_built zipcode lat long sqft_living15 sqft_lot15 \n", + "3947 1936 98074 47.6499 -122.088 2520 14789 \n", + "20038 2009 98027 47.5644 -122.093 1880 3078 " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "house_data_clean[house_data_clean.duplicated()]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 } -}, -"nbformat": 4, -"nbformat_minor": 2 -} \ No newline at end of file From 2353438227239a869e32885e97d7b037bd7e9089 Mon Sep 17 00:00:00 2001 From: clydeochieng <107258512+clydeochieng@users.noreply.github.com> Date: Wed, 1 May 2024 12:00:03 +0300 Subject: [PATCH 40/42] Update student.ipynb --- student.ipynb | 1150 +++++++++++++++++++++++++------------------------ 1 file changed, 586 insertions(+), 564 deletions(-) diff --git a/student.ipynb b/student.ipynb index 4a29c2f7..24625738 100644 --- a/student.ipynb +++ b/student.ipynb @@ -6,7 +6,7 @@ "source": [ "## Final Project Submission\n", "\n", - "* Student name: Solphine Joseph, Grace Rotich, Mathew Kiprotich, Hilary Simiyu, Clyde Ochieng. \n", + "* Student name: Solphine Joseph, Grace Rotich, Mathew Kiprotich, Hilary Simiyu, Clyde Ochieng, Derrick Kiptoo. 
\n", "* Student pace: full time\n", "* Scheduled project review date/time: \n", "* Instructor name: Nikita \n", @@ -93,9 +93,31 @@ "\n" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Import Necessary Libraries" + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "import numpy as np\n", + "import pandas as pd\n", + "import scipy.stats as stats\n", + "import seaborn as sns\n", + "import statsmodels.api as sm" + ] + }, + { + "cell_type": "code", + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -467,566 +489,566 @@ "\n", "

21597 rows × 21 columns

\n", "" - ], - "text/plain": [ - " id date price bedrooms bathrooms sqft_living \\\n", - "0 7129300520 10/13/2014 221900.0 3 1.00 1180 \n", - "1 6414100192 12/9/2014 538000.0 3 2.25 2570 \n", - "2 5631500400 2/25/2015 180000.0 2 1.00 770 \n", - "3 2487200875 12/9/2014 604000.0 4 3.00 1960 \n", - "4 1954400510 2/18/2015 510000.0 3 2.00 1680 \n", - "... ... ... ... ... ... ... \n", - "21592 263000018 5/21/2014 360000.0 3 2.50 1530 \n", - "21593 6600060120 2/23/2015 400000.0 4 2.50 2310 \n", - "21594 1523300141 6/23/2014 402101.0 2 0.75 1020 \n", - "21595 291310100 1/16/2015 400000.0 3 2.50 1600 \n", - "21596 1523300157 10/15/2014 325000.0 2 0.75 1020 \n", - "\n", - " sqft_lot floors waterfront view ... grade sqft_above \\\n", - "0 5650 1.0 NaN NONE ... 7 Average 1180 \n", - "1 7242 2.0 NO NONE ... 7 Average 2170 \n", - "2 10000 1.0 NO NONE ... 6 Low Average 770 \n", - "3 5000 1.0 NO NONE ... 7 Average 1050 \n", - "4 8080 1.0 NO NONE ... 8 Good 1680 \n", - "... ... ... ... ... ... ... ... \n", - "21592 1131 3.0 NO NONE ... 8 Good 1530 \n", - "21593 5813 2.0 NO NONE ... 8 Good 2310 \n", - "21594 1350 2.0 NO NONE ... 7 Average 1020 \n", - "21595 2388 2.0 NaN NONE ... 8 Good 1600 \n", - "21596 1076 2.0 NO NONE ... 7 Average 1020 \n", - "\n", - " sqft_basement yr_built yr_renovated zipcode lat long \\\n", - "0 0.0 1955 0.0 98178 47.5112 -122.257 \n", - "1 400.0 1951 1991.0 98125 47.7210 -122.319 \n", - "2 0.0 1933 NaN 98028 47.7379 -122.233 \n", - "3 910.0 1965 0.0 98136 47.5208 -122.393 \n", - "4 0.0 1987 0.0 98074 47.6168 -122.045 \n", - "... ... ... ... ... ... ... \n", - "21592 0.0 2009 0.0 98103 47.6993 -122.346 \n", - "21593 0.0 2014 0.0 98146 47.5107 -122.362 \n", - "21594 0.0 2009 0.0 98144 47.5944 -122.299 \n", - "21595 0.0 2004 0.0 98027 47.5345 -122.069 \n", - "21596 0.0 2008 0.0 98144 47.5941 -122.299 \n", - "\n", - " sqft_living15 sqft_lot15 \n", - "0 1340 5650 \n", - "1 1690 7639 \n", - "2 2720 8062 \n", - "3 1360 5000 \n", - "4 1800 7503 \n", - "... ... ... 
\n", - "21592 1530 1509 \n", - "21593 1830 7200 \n", - "21594 1020 2007 \n", - "21595 1410 1287 \n", - "21596 1020 1357 \n", - "\n", - "[21597 rows x 21 columns]" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "load_data('data/kc_house_data.csv') # Assuming 'data' folder is in the same directory\n", - "\n", - "\n" - ] -}, -{ - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The dataset contains 21 columns, each representing a distinct feature, and 21,597 rows, with each row corresponding to a specific house sale entry.\n", - "\n", - "The dataset contains a mix of data types, including integers (int64), floating-point numbers (float64), and objects (strings). For instance, numerical features such as bedrooms, bathrooms, and sqft_living are represented as integers or floating-point numbers to facilitate mathematical computations, while categorical features like waterfront and view are stored as objects to accommodate text-based categories." 
- ] -}, -{ - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The dataset contains 21597 houses with 21 features\n", - "\n", - "Columns and their data types:\n", - "id: int64\n", - "date: object\n", - "price: float64\n", - "bedrooms: int64\n", - "bathrooms: float64\n", - "sqft_living: int64\n", - "sqft_lot: int64\n", - "floors: float64\n", - "waterfront: object\n", - "view: object\n", - "condition: object\n", - "grade: object\n", - "sqft_above: int64\n", - "sqft_basement: object\n", - "yr_built: int64\n", - "yr_renovated: float64\n", - "zipcode: int64\n", - "lat: float64\n", - "long: float64\n", - "sqft_living15: int64\n", - "sqft_lot15: int64\n", - "\n" - ] - } - ], - "source": [ - "kings_data = load_data('data/kc_house_data.csv')" - ] -}, -{ - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "#create a function that takes in a column and returns the column statistics as a dictionary\n", - "def descriptive_analytics(column):\n", - " stats_dict = column.describe().to_dict()\n", - " \n", - " print(\"Descriptive Statistics for Column '{}':\".format(column.name))\n", - " print(\"The count of the column is:\", stats_dict['count'])\n", - " print(\"The mean of the column is:\", stats_dict['mean'])\n", - " print(\"The standard deviation of the column is:\", stats_dict['std'])\n", - " print(\"The minimum value of the column is:\", stats_dict['min'])\n", - " print(\"The 25th percentile of the column is:\", stats_dict['25%'])\n", - " print(\"The median of the column is:\", stats_dict['50%'])\n", - " print(\"The 75th percentile of the column is:\", stats_dict['75%'])\n", - " print(\"The maximum value of the column is:\", stats_dict['max'])" - ] -}, -{ - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Descriptive Statistics for Column 
'price':\n", - "The count of the column is: 21597.0\n", - "The mean of the column is: 540296.5735055795\n", - "The standard deviation of the column is: 367368.1401013936\n", - "The minimum value of the column is: 78000.0\n", - "The 25th percentile of the column is: 322000.0\n", - "The median of the column is: 450000.0\n", - "The 75th percentile of the column is: 645000.0\n", - "The maximum value of the column is: 7700000.0\n" - ] - } - ], - "source": [ - "descriptive_analytics(kings_data['price'])" - ] -}, -{ - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can see that the maximum price of a house is 7700000 dollars and the minimum price is 78000 dollars\n", - "\n", - "There are 21597 prices regarding to the houses in the dataset\n", - "\n", - "Average price of a house is 540296.57 dollars" - ] -}, -{ - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Data Preperation\n", - "\n" - ] -}, -{ - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "RangeIndex: 21597 entries, 0 to 21596\n", - "Data columns (total 21 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 id 21597 non-null int64 \n", - " 1 date 21597 non-null object \n", - " 2 price 21597 non-null float64\n", - " 3 bedrooms 21597 non-null int64 \n", - " 4 bathrooms 21597 non-null float64\n", - " 5 sqft_living 21597 non-null int64 \n", - " 6 sqft_lot 21597 non-null int64 \n", - " 7 floors 21597 non-null float64\n", - " 8 waterfront 19221 non-null object \n", - " 9 view 21534 non-null object \n", - " 10 condition 21597 non-null object \n", - " 11 grade 21597 non-null object \n", - " 12 sqft_above 21597 non-null int64 \n", - " 13 sqft_basement 21597 non-null object \n", - " 14 yr_built 21597 non-null int64 \n", - " 15 yr_renovated 17755 non-null float64\n", - " 16 zipcode 21597 non-null int64 \n", - " 17 lat 21597 non-null float64\n", - " 
18 long 21597 non-null float64\n", - " 19 sqft_living15 21597 non-null int64 \n", - " 20 sqft_lot15 21597 non-null int64 \n", - "dtypes: float64(6), int64(9), object(6)\n", - "memory usage: 3.5+ MB\n" - ] - } - ], - "source": [ - "kings_data.info()" - ] -}, -{ - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "def identify_issues(dataset):\n", - " # Identify missing values as a percentage of the whole dataset\n", - " missing_values = (dataset.isnull().sum())/len(dataset) * 100\n", - "\n", - " # Identify duplicates\n", - " duplicates = dataset.duplicated().sum()\n", - " \n", - " #return a dictionary \n", - " return {'duplicates': duplicates,\n", - " 'missing values': missing_values.round(2)} \n" - ] -}, -{ - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'duplicates': 0,\n", - " 'missing values': id 0.00\n", - " date 0.00\n", - " price 0.00\n", - " bedrooms 0.00\n", - " bathrooms 0.00\n", - " sqft_living 0.00\n", - " sqft_lot 0.00\n", - " floors 0.00\n", - " waterfront 11.00\n", - " view 0.29\n", - " condition 0.00\n", - " grade 0.00\n", - " sqft_above 0.00\n", - " sqft_basement 0.00\n", - " yr_built 0.00\n", - " yr_renovated 17.79\n", - " zipcode 0.00\n", - " lat 0.00\n", - " long 0.00\n", - " sqft_living15 0.00\n", - " sqft_lot15 0.00\n", - " dtype: float64}" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "identify_issues(kings_data)" - ] -}, -{ - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The examination indicates that there are no duplicate entries within the dataset, ensuring the integrity of the records. However, attention is warranted to address missing values present in certain columns. Specifically, the 'waterfront' feature exhibits 11% of null values, representing a negligible portion of the dataset. 
Similarly, the 'yr_renovated' feature shows a relatively higher percentage of missing values, accounting for approximately 17.79% of the dataset" - ] -}, -{ - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Before making changes make a copy instead of overwriting data" - ] -}, -{ - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "house_data_clean = kings_data.copy()" - ] -}, -{ - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "# Changing the date to date time\n", - "house_data_clean['date'] = pd.to_datetime(house_data_clean['date'])\n", - "\n", - "# Extracting only the year from the column Date\n", - "house_data_clean.date = house_data_clean['date'].dt.year\n", - "\n", - "# Changing the dates for the year built \n", - "house_data_clean['yr_built'] = pd.to_datetime(house_data_clean['yr_built'],format='%Y').dt.year\n" - ] -}, -{ - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The above code converts the 'date' column data to only contain the year the house was sold, for the purpose of analysis we will use only the year since the changes month by month will be minor." 
- ] -}, -{ - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Dealing with the missing values" - ] -}, -{ - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "def missing_values(dataset):\n", - " # drop the rows from views\n", - " dataset.dropna(subset=['view'],inplace=True)\n", - "\n", - " # Filling the NaN values for waterfront with NO\n", - " dataset.waterfront.fillna('NO',inplace=True)\n", - " \n", - " # Dropping the yr_renovated column \n", - " dataset.drop('yr_renovated',axis=1,inplace=True)" - ] -}, -{ - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "missing_values(house_data_clean)" - ] -}, -{ - "cell_type": "markdown", - "metadata": {}, - "source": [ - "'yr_renovated' has the highest percentage of NaN values 17%. This will be dropped since it will not be used within our model inline with the business problem.\n", - "\n", - "'Waterfront' feature has 11% null values, this was filled with NO on the assumption that these cells were not filled since they lacked waterfronts\n", - "\n", - "For the 'View' column, the null values were dropped by row since the overall percentage impact would be minute" - ] -}, -{ - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'duplicates': 2,\n", - " 'missing values': id 0.0\n", - " date 0.0\n", - " price 0.0\n", - " bedrooms 0.0\n", - " bathrooms 0.0\n", - " sqft_living 0.0\n", - " sqft_lot 0.0\n", - " floors 0.0\n", - " waterfront 0.0\n", - " view 0.0\n", - " condition 0.0\n", - " grade 0.0\n", - " sqft_above 0.0\n", - " sqft_basement 0.0\n", - " yr_built 0.0\n", - " zipcode 0.0\n", - " lat 0.0\n", - " long 0.0\n", - " sqft_living15 0.0\n", - " sqft_lot15 0.0\n", - " dtype: float64}" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "identify_issues(house_data_clean)" - ] -}, -{ - 
"cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
iddatepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontviewconditiongradesqft_abovesqft_basementyr_builtzipcodelatlongsqft_living15sqft_lot15
394718250690312014550000.041.75241084472.0NOGOODGood8 Good2060350.019369807447.6499-122.088252014789
2003886489001102014555000.032.50194032112.0NONONEAverage8 Good19400.020099802747.5644-122.09318803078
\n", - "
" - ], - "text/plain": [ - " id date price bedrooms bathrooms sqft_living sqft_lot \\\n", - "3947 1825069031 2014 550000.0 4 1.75 2410 8447 \n", - "20038 8648900110 2014 555000.0 3 2.50 1940 3211 \n", - "\n", - " floors waterfront view condition grade sqft_above sqft_basement \\\n", - "3947 2.0 NO GOOD Good 8 Good 2060 350.0 \n", - "20038 2.0 NO NONE Average 8 Good 1940 0.0 \n", - "\n", - " yr_built zipcode lat long sqft_living15 sqft_lot15 \n", - "3947 1936 98074 47.6499 -122.088 2520 14789 \n", - "20038 2009 98027 47.5644 -122.093 1880 3078 " - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "house_data_clean[house_data_clean.duplicated()]" - ] -} -], -"metadata": { -"kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" -}, -"language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" -} -}, -"nbformat": 4, -"nbformat_minor": 2 + ], + "text/plain": [ + " id date price bedrooms bathrooms sqft_living \\\n", + "0 7129300520 10/13/2014 221900.0 3 1.00 1180 \n", + "1 6414100192 12/9/2014 538000.0 3 2.25 2570 \n", + "2 5631500400 2/25/2015 180000.0 2 1.00 770 \n", + "3 2487200875 12/9/2014 604000.0 4 3.00 1960 \n", + "4 1954400510 2/18/2015 510000.0 3 2.00 1680 \n", + "... ... ... ... ... ... ... \n", + "21592 263000018 5/21/2014 360000.0 3 2.50 1530 \n", + "21593 6600060120 2/23/2015 400000.0 4 2.50 2310 \n", + "21594 1523300141 6/23/2014 402101.0 2 0.75 1020 \n", + "21595 291310100 1/16/2015 400000.0 3 2.50 1600 \n", + "21596 1523300157 10/15/2014 325000.0 2 0.75 1020 \n", + "\n", + " sqft_lot floors waterfront view ... grade sqft_above \\\n", + "0 5650 1.0 NaN NONE ... 7 Average 1180 \n", + "1 7242 2.0 NO NONE ... 7 Average 2170 \n", + "2 10000 1.0 NO NONE ... 
6 Low Average 770 \n", + "3 5000 1.0 NO NONE ... 7 Average 1050 \n", + "4 8080 1.0 NO NONE ... 8 Good 1680 \n", + "... ... ... ... ... ... ... ... \n", + "21592 1131 3.0 NO NONE ... 8 Good 1530 \n", + "21593 5813 2.0 NO NONE ... 8 Good 2310 \n", + "21594 1350 2.0 NO NONE ... 7 Average 1020 \n", + "21595 2388 2.0 NaN NONE ... 8 Good 1600 \n", + "21596 1076 2.0 NO NONE ... 7 Average 1020 \n", + "\n", + " sqft_basement yr_built yr_renovated zipcode lat long \\\n", + "0 0.0 1955 0.0 98178 47.5112 -122.257 \n", + "1 400.0 1951 1991.0 98125 47.7210 -122.319 \n", + "2 0.0 1933 NaN 98028 47.7379 -122.233 \n", + "3 910.0 1965 0.0 98136 47.5208 -122.393 \n", + "4 0.0 1987 0.0 98074 47.6168 -122.045 \n", + "... ... ... ... ... ... ... \n", + "21592 0.0 2009 0.0 98103 47.6993 -122.346 \n", + "21593 0.0 2014 0.0 98146 47.5107 -122.362 \n", + "21594 0.0 2009 0.0 98144 47.5944 -122.299 \n", + "21595 0.0 2004 0.0 98027 47.5345 -122.069 \n", + "21596 0.0 2008 0.0 98144 47.5941 -122.299 \n", + "\n", + " sqft_living15 sqft_lot15 \n", + "0 1340 5650 \n", + "1 1690 7639 \n", + "2 2720 8062 \n", + "3 1360 5000 \n", + "4 1800 7503 \n", + "... ... ... \n", + "21592 1530 1509 \n", + "21593 1830 7200 \n", + "21594 1020 2007 \n", + "21595 1410 1287 \n", + "21596 1020 1357 \n", + "\n", + "[21597 rows x 21 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "load_data('data/kc_house_data.csv') # Assuming 'data' folder is in the same directory\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The dataset contains 21 columns, each representing a distinct feature, and 21,597 rows, with each row corresponding to a specific house sale entry.\n", + "\n", + "The dataset contains a mix of data types, including integers (int64), floating-point numbers (float64), and objects (strings). 
For instance, numerical features such as bedrooms, bathrooms, and sqft_living are represented as integers or floating-point numbers to facilitate mathematical computations, while categorical features like waterfront and view are stored as objects to accommodate text-based categories." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The dataset contains 21597 houses with 21 features\n", + "\n", + "Columns and their data types:\n", + "id: int64\n", + "date: object\n", + "price: float64\n", + "bedrooms: int64\n", + "bathrooms: float64\n", + "sqft_living: int64\n", + "sqft_lot: int64\n", + "floors: float64\n", + "waterfront: object\n", + "view: object\n", + "condition: object\n", + "grade: object\n", + "sqft_above: int64\n", + "sqft_basement: object\n", + "yr_built: int64\n", + "yr_renovated: float64\n", + "zipcode: int64\n", + "lat: float64\n", + "long: float64\n", + "sqft_living15: int64\n", + "sqft_lot15: int64\n", + "\n" + ] + } + ], + "source": [ + "kings_data = load_data('data/kc_house_data.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "#create a function that takes in a column and returns the column statistics as a dictionary\n", + "def descriptive_analytics(column):\n", + " stats_dict = column.describe().to_dict()\n", + " \n", + " print(\"Descriptive Statistics for Column '{}':\".format(column.name))\n", + " print(\"The count of the column is:\", stats_dict['count'])\n", + " print(\"The mean of the column is:\", stats_dict['mean'])\n", + " print(\"The standard deviation of the column is:\", stats_dict['std'])\n", + " print(\"The minimum value of the column is:\", stats_dict['min'])\n", + " print(\"The 25th percentile of the column is:\", stats_dict['25%'])\n", + " print(\"The median of the column is:\", stats_dict['50%'])\n", + " print(\"The 75th percentile of the column is:\", 
stats_dict['75%'])\n", + " print(\"The maximum value of the column is:\", stats_dict['max'])" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Descriptive Statistics for Column 'price':\n", + "The count of the column is: 21597.0\n", + "The mean of the column is: 540296.5735055795\n", + "The standard deviation of the column is: 367368.1401013936\n", + "The minimum value of the column is: 78000.0\n", + "The 25th percentile of the column is: 322000.0\n", + "The median of the column is: 450000.0\n", + "The 75th percentile of the column is: 645000.0\n", + "The maximum value of the column is: 7700000.0\n" + ] + } + ], + "source": [ + "descriptive_analytics(kings_data['price'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see that the maximum price of a house is 7700000 dollars and the minimum price is 78000 dollars\n", + "\n", + "There are 21597 prices regarding to the houses in the dataset\n", + "\n", + "Average price of a house is 540296.57 dollars" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Preperation\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 21597 entries, 0 to 21596\n", + "Data columns (total 21 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 id 21597 non-null int64 \n", + " 1 date 21597 non-null object \n", + " 2 price 21597 non-null float64\n", + " 3 bedrooms 21597 non-null int64 \n", + " 4 bathrooms 21597 non-null float64\n", + " 5 sqft_living 21597 non-null int64 \n", + " 6 sqft_lot 21597 non-null int64 \n", + " 7 floors 21597 non-null float64\n", + " 8 waterfront 19221 non-null object \n", + " 9 view 21534 non-null object \n", + " 10 condition 21597 non-null object \n", + " 11 
grade 21597 non-null object \n", + " 12 sqft_above 21597 non-null int64 \n", + " 13 sqft_basement 21597 non-null object \n", + " 14 yr_built 21597 non-null int64 \n", + " 15 yr_renovated 17755 non-null float64\n", + " 16 zipcode 21597 non-null int64 \n", + " 17 lat 21597 non-null float64\n", + " 18 long 21597 non-null float64\n", + " 19 sqft_living15 21597 non-null int64 \n", + " 20 sqft_lot15 21597 non-null int64 \n", + "dtypes: float64(6), int64(9), object(6)\n", + "memory usage: 3.5+ MB\n" + ] + } + ], + "source": [ + "kings_data.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "def identify_issues(dataset):\n", + " # Identify missing values as a percentage of the whole dataset\n", + " missing_values = (dataset.isnull().sum())/len(dataset) * 100\n", + "\n", + " # Identify duplicates\n", + " duplicates = dataset.duplicated().sum()\n", + " \n", + " #return a dictionary \n", + " return {'duplicates': duplicates,\n", + " 'missing values': missing_values.round(2)} \n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'duplicates': 0,\n", + " 'missing values': id 0.00\n", + " date 0.00\n", + " price 0.00\n", + " bedrooms 0.00\n", + " bathrooms 0.00\n", + " sqft_living 0.00\n", + " sqft_lot 0.00\n", + " floors 0.00\n", + " waterfront 11.00\n", + " view 0.29\n", + " condition 0.00\n", + " grade 0.00\n", + " sqft_above 0.00\n", + " sqft_basement 0.00\n", + " yr_built 0.00\n", + " yr_renovated 17.79\n", + " zipcode 0.00\n", + " lat 0.00\n", + " long 0.00\n", + " sqft_living15 0.00\n", + " sqft_lot15 0.00\n", + " dtype: float64}" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "identify_issues(kings_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The examination indicates that there are no duplicate entries within the dataset, 
ensuring the integrity of the records. However, attention is warranted to address missing values present in certain columns. Specifically, the 'waterfront' feature exhibits 11% of null values, representing a negligible portion of the dataset. Similarly, the 'yr_renovated' feature shows a relatively higher percentage of missing values, accounting for approximately 17.79% of the dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Before making changes make a copy instead of overwriting data" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "house_data_clean = kings_data.copy()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# Changing the date to date time\n", + "house_data_clean['date'] = pd.to_datetime(house_data_clean['date'])\n", + "\n", + "# Extracting only the year from the column Date\n", + "house_data_clean.date = house_data_clean['date'].dt.year\n", + "\n", + "# Changing the dates for the year built \n", + "house_data_clean['yr_built'] = pd.to_datetime(house_data_clean['yr_built'],format='%Y').dt.year\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The above code converts the 'date' column data to only contain the year the house was sold, for the purpose of analysis we will use only the year since the changes month by month will be minor." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Dealing with the missing values" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "def missing_values(dataset):\n", + " # drop the rows from views\n", + " dataset.dropna(subset=['view'],inplace=True)\n", + "\n", + " # Filling the NaN values for waterfront with NO\n", + " dataset.waterfront.fillna('NO',inplace=True)\n", + " \n", + " # Dropping the yr_renovated column \n", + " dataset.drop('yr_renovated',axis=1,inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "missing_values(house_data_clean)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "'yr_renovated' has the highest percentage of NaN values 17%. This will be dropped since it will not be used within our model inline with the business problem.\n", + "\n", + "'Waterfront' feature has 11% null values, this was filled with NO on the assumption that these cells were not filled since they lacked waterfronts\n", + "\n", + "For the 'View' column, the null values were dropped by row since the overall percentage impact would be minute" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'duplicates': 2,\n", + " 'missing values': id 0.0\n", + " date 0.0\n", + " price 0.0\n", + " bedrooms 0.0\n", + " bathrooms 0.0\n", + " sqft_living 0.0\n", + " sqft_lot 0.0\n", + " floors 0.0\n", + " waterfront 0.0\n", + " view 0.0\n", + " condition 0.0\n", + " grade 0.0\n", + " sqft_above 0.0\n", + " sqft_basement 0.0\n", + " yr_built 0.0\n", + " zipcode 0.0\n", + " lat 0.0\n", + " long 0.0\n", + " sqft_living15 0.0\n", + " sqft_lot15 0.0\n", + " dtype: float64}" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "identify_issues(house_data_clean)" + ] + }, + 
{ + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddatepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontviewconditiongradesqft_abovesqft_basementyr_builtzipcodelatlongsqft_living15sqft_lot15
394718250690312014550000.041.75241084472.0NOGOODGood8 Good2060350.019369807447.6499-122.088252014789
2003886489001102014555000.032.50194032112.0NONONEAverage8 Good19400.020099802747.5644-122.09318803078
\n", + "
" + ], + "text/plain": [ + " id date price bedrooms bathrooms sqft_living sqft_lot \\\n", + "3947 1825069031 2014 550000.0 4 1.75 2410 8447 \n", + "20038 8648900110 2014 555000.0 3 2.50 1940 3211 \n", + "\n", + " floors waterfront view condition grade sqft_above sqft_basement \\\n", + "3947 2.0 NO GOOD Good 8 Good 2060 350.0 \n", + "20038 2.0 NO NONE Average 8 Good 1940 0.0 \n", + "\n", + " yr_built zipcode lat long sqft_living15 sqft_lot15 \n", + "3947 1936 98074 47.6499 -122.088 2520 14789 \n", + "20038 2009 98027 47.5644 -122.093 1880 3078 " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "house_data_clean[house_data_clean.duplicated()]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 } From 045ce6159d1527b5878aa4faf6feebcf0c14ebab Mon Sep 17 00:00:00 2001 From: clydeochieng <107258512+clydeochieng@users.noreply.github.com> Date: Wed, 1 May 2024 12:16:48 +0300 Subject: [PATCH 41/42] Update student.ipynb --- student.ipynb | 1152 ++++++++++++++++++++++++------------------------- 1 file changed, 564 insertions(+), 588 deletions(-) diff --git a/student.ipynb b/student.ipynb index 6216aa63..4a29c2f7 100644 --- a/student.ipynb +++ b/student.ipynb @@ -6,7 +6,7 @@ "source": [ "## Final Project Submission\n", "\n", - "* Student name: Solphine Joseph, Grace Rotich, Mathew Kiprotich, Hilary Simiyu, Clyde Ochieng, Derrick Kiptoo. \n", + "* Student name: Solphine Joseph, Grace Rotich, Mathew Kiprotich, Hilary Simiyu, Clyde Ochieng. 
\n", "* Student pace: full time\n", "* Scheduled project review date/time: \n", "* Instructor name: Nikita \n", @@ -91,35 +91,11 @@ "source": [ "## Data Loading\n", "\n" - Clyde - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Import Necessary Libraries" ] }, { "cell_type": "code", - "execution_count": 1, - "metadata": {}, - "outputs": [], - "source": [ - "import matplotlib.pyplot as plt\n", - "%matplotlib inline\n", - "import numpy as np\n", - "import pandas as pd\n", - "import scipy.stats as stats\n", - "import seaborn as sns\n", - "import statsmodels.api as sm" - - ] - }, - { - "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -491,566 +467,566 @@ "\n", "

21597 rows × 21 columns

\n", "" - ], - "text/plain": [ - " id date price bedrooms bathrooms sqft_living \\\n", - "0 7129300520 10/13/2014 221900.0 3 1.00 1180 \n", - "1 6414100192 12/9/2014 538000.0 3 2.25 2570 \n", - "2 5631500400 2/25/2015 180000.0 2 1.00 770 \n", - "3 2487200875 12/9/2014 604000.0 4 3.00 1960 \n", - "4 1954400510 2/18/2015 510000.0 3 2.00 1680 \n", - "... ... ... ... ... ... ... \n", - "21592 263000018 5/21/2014 360000.0 3 2.50 1530 \n", - "21593 6600060120 2/23/2015 400000.0 4 2.50 2310 \n", - "21594 1523300141 6/23/2014 402101.0 2 0.75 1020 \n", - "21595 291310100 1/16/2015 400000.0 3 2.50 1600 \n", - "21596 1523300157 10/15/2014 325000.0 2 0.75 1020 \n", - "\n", - " sqft_lot floors waterfront view ... grade sqft_above \\\n", - "0 5650 1.0 NaN NONE ... 7 Average 1180 \n", - "1 7242 2.0 NO NONE ... 7 Average 2170 \n", - "2 10000 1.0 NO NONE ... 6 Low Average 770 \n", - "3 5000 1.0 NO NONE ... 7 Average 1050 \n", - "4 8080 1.0 NO NONE ... 8 Good 1680 \n", - "... ... ... ... ... ... ... ... \n", - "21592 1131 3.0 NO NONE ... 8 Good 1530 \n", - "21593 5813 2.0 NO NONE ... 8 Good 2310 \n", - "21594 1350 2.0 NO NONE ... 7 Average 1020 \n", - "21595 2388 2.0 NaN NONE ... 8 Good 1600 \n", - "21596 1076 2.0 NO NONE ... 7 Average 1020 \n", - "\n", - " sqft_basement yr_built yr_renovated zipcode lat long \\\n", - "0 0.0 1955 0.0 98178 47.5112 -122.257 \n", - "1 400.0 1951 1991.0 98125 47.7210 -122.319 \n", - "2 0.0 1933 NaN 98028 47.7379 -122.233 \n", - "3 910.0 1965 0.0 98136 47.5208 -122.393 \n", - "4 0.0 1987 0.0 98074 47.6168 -122.045 \n", - "... ... ... ... ... ... ... \n", - "21592 0.0 2009 0.0 98103 47.6993 -122.346 \n", - "21593 0.0 2014 0.0 98146 47.5107 -122.362 \n", - "21594 0.0 2009 0.0 98144 47.5944 -122.299 \n", - "21595 0.0 2004 0.0 98027 47.5345 -122.069 \n", - "21596 0.0 2008 0.0 98144 47.5941 -122.299 \n", - "\n", - " sqft_living15 sqft_lot15 \n", - "0 1340 5650 \n", - "1 1690 7639 \n", - "2 2720 8062 \n", - "3 1360 5000 \n", - "4 1800 7503 \n", - "... ... ... 
\n", - "21592 1530 1509 \n", - "21593 1830 7200 \n", - "21594 1020 2007 \n", - "21595 1410 1287 \n", - "21596 1020 1357 \n", - "\n", - "[21597 rows x 21 columns]" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "load_data('data/kc_house_data.csv') # Assuming 'data' folder is in the same directory\n", - "\n", - "\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The dataset contains 21 columns, each representing a distinct feature, and 21,597 rows, with each row corresponding to a specific house sale entry.\n", - "\n", - "The dataset contains a mix of data types, including integers (int64), floating-point numbers (float64), and objects (strings). For instance, numerical features such as bedrooms, bathrooms, and sqft_living are represented as integers or floating-point numbers to facilitate mathematical computations, while categorical features like waterfront and view are stored as objects to accommodate text-based categories." 
- ] - }, - { - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The dataset contains 21597 houses with 21 features\n", - "\n", - "Columns and their data types:\n", - "id: int64\n", - "date: object\n", - "price: float64\n", - "bedrooms: int64\n", - "bathrooms: float64\n", - "sqft_living: int64\n", - "sqft_lot: int64\n", - "floors: float64\n", - "waterfront: object\n", - "view: object\n", - "condition: object\n", - "grade: object\n", - "sqft_above: int64\n", - "sqft_basement: object\n", - "yr_built: int64\n", - "yr_renovated: float64\n", - "zipcode: int64\n", - "lat: float64\n", - "long: float64\n", - "sqft_living15: int64\n", - "sqft_lot15: int64\n", - "\n" - ] - } - ], - "source": [ - "kings_data = load_data('data/kc_house_data.csv')" - ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "#create a function that takes in a column and returns the column statistics as a dictionary\n", - "def descriptive_analytics(column):\n", - " stats_dict = column.describe().to_dict()\n", - " \n", - " print(\"Descriptive Statistics for Column '{}':\".format(column.name))\n", - " print(\"The count of the column is:\", stats_dict['count'])\n", - " print(\"The mean of the column is:\", stats_dict['mean'])\n", - " print(\"The standard deviation of the column is:\", stats_dict['std'])\n", - " print(\"The minimum value of the column is:\", stats_dict['min'])\n", - " print(\"The 25th percentile of the column is:\", stats_dict['25%'])\n", - " print(\"The median of the column is:\", stats_dict['50%'])\n", - " print(\"The 75th percentile of the column is:\", stats_dict['75%'])\n", - " print(\"The maximum value of the column is:\", stats_dict['max'])" - ] - }, - { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Descriptive Statistics for Column 
'price':\n", - "The count of the column is: 21597.0\n", - "The mean of the column is: 540296.5735055795\n", - "The standard deviation of the column is: 367368.1401013936\n", - "The minimum value of the column is: 78000.0\n", - "The 25th percentile of the column is: 322000.0\n", - "The median of the column is: 450000.0\n", - "The 75th percentile of the column is: 645000.0\n", - "The maximum value of the column is: 7700000.0\n" - ] - } - ], - "source": [ - "descriptive_analytics(kings_data['price'])" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can see that the maximum price of a house is 7700000 dollars and the minimum price is 78000 dollars\n", - "\n", - "There are 21597 prices regarding to the houses in the dataset\n", - "\n", - "Average price of a house is 540296.57 dollars" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Data Preperation\n", - "\n" - ] - }, - { - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "RangeIndex: 21597 entries, 0 to 21596\n", - "Data columns (total 21 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 id 21597 non-null int64 \n", - " 1 date 21597 non-null object \n", - " 2 price 21597 non-null float64\n", - " 3 bedrooms 21597 non-null int64 \n", - " 4 bathrooms 21597 non-null float64\n", - " 5 sqft_living 21597 non-null int64 \n", - " 6 sqft_lot 21597 non-null int64 \n", - " 7 floors 21597 non-null float64\n", - " 8 waterfront 19221 non-null object \n", - " 9 view 21534 non-null object \n", - " 10 condition 21597 non-null object \n", - " 11 grade 21597 non-null object \n", - " 12 sqft_above 21597 non-null int64 \n", - " 13 sqft_basement 21597 non-null object \n", - " 14 yr_built 21597 non-null int64 \n", - " 15 yr_renovated 17755 non-null float64\n", - " 16 zipcode 21597 non-null int64 \n", - " 17 lat 21597 non-null 
float64\n", - " 18 long 21597 non-null float64\n", - " 19 sqft_living15 21597 non-null int64 \n", - " 20 sqft_lot15 21597 non-null int64 \n", - "dtypes: float64(6), int64(9), object(6)\n", - "memory usage: 3.5+ MB\n" - ] - } - ], - "source": [ - "kings_data.info()" - ] - }, - { - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "def identify_issues(dataset):\n", - " # Identify missing values as a percentage of the whole dataset\n", - " missing_values = (dataset.isnull().sum())/len(dataset) * 100\n", - "\n", - " # Identify duplicates\n", - " duplicates = dataset.duplicated().sum()\n", - " \n", - " #return a dictionary \n", - " return {'duplicates': duplicates,\n", - " 'missing values': missing_values.round(2)} \n" - ] - }, - { - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'duplicates': 0,\n", - " 'missing values': id 0.00\n", - " date 0.00\n", - " price 0.00\n", - " bedrooms 0.00\n", - " bathrooms 0.00\n", - " sqft_living 0.00\n", - " sqft_lot 0.00\n", - " floors 0.00\n", - " waterfront 11.00\n", - " view 0.29\n", - " condition 0.00\n", - " grade 0.00\n", - " sqft_above 0.00\n", - " sqft_basement 0.00\n", - " yr_built 0.00\n", - " yr_renovated 17.79\n", - " zipcode 0.00\n", - " lat 0.00\n", - " long 0.00\n", - " sqft_living15 0.00\n", - " sqft_lot15 0.00\n", - " dtype: float64}" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "identify_issues(kings_data)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The examination indicates that there are no duplicate entries within the dataset, ensuring the integrity of the records. However, attention is warranted to address missing values present in certain columns. Specifically, the 'waterfront' feature exhibits 11% of null values, representing a negligible portion of the dataset. 
Similarly, the 'yr_renovated' feature shows a relatively higher percentage of missing values, accounting for approximately 17.79% of the dataset" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Before making changes make a copy instead of overwriting data" - ] - }, - { - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "house_data_clean = kings_data.copy()" - ] - }, - { - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "# Changing the date to date time\n", - "house_data_clean['date'] = pd.to_datetime(house_data_clean['date'])\n", - "\n", - "# Extracting only the year from the column Date\n", - "house_data_clean.date = house_data_clean['date'].dt.year\n", - "\n", - "# Changing the dates for the year built \n", - "house_data_clean['yr_built'] = pd.to_datetime(house_data_clean['yr_built'],format='%Y').dt.year\n" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The above code converts the 'date' column data to only contain the year the house was sold, for the purpose of analysis we will use only the year since the changes month by month will be minor." 
- ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Dealing with the missing values" - ] - }, - { - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "def missing_values(dataset):\n", - " # drop the rows from views\n", - " dataset.dropna(subset=['view'],inplace=True)\n", - "\n", - " # Filling the NaN values for waterfront with NO\n", - " dataset.waterfront.fillna('NO',inplace=True)\n", - " \n", - " # Dropping the yr_renovated column \n", - " dataset.drop('yr_renovated',axis=1,inplace=True)" - ] - }, - { - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "missing_values(house_data_clean)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "'yr_renovated' has the highest percentage of NaN values 17%. This will be dropped since it will not be used within our model inline with the business problem.\n", - "\n", - "'Waterfront' feature has 11% null values, this was filled with NO on the assumption that these cells were not filled since they lacked waterfronts\n", - "\n", - "For the 'View' column, the null values were dropped by row since the overall percentage impact would be minute" - ] - }, - { - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'duplicates': 2,\n", - " 'missing values': id 0.0\n", - " date 0.0\n", - " price 0.0\n", - " bedrooms 0.0\n", - " bathrooms 0.0\n", - " sqft_living 0.0\n", - " sqft_lot 0.0\n", - " floors 0.0\n", - " waterfront 0.0\n", - " view 0.0\n", - " condition 0.0\n", - " grade 0.0\n", - " sqft_above 0.0\n", - " sqft_basement 0.0\n", - " yr_built 0.0\n", - " zipcode 0.0\n", - " lat 0.0\n", - " long 0.0\n", - " sqft_living15 0.0\n", - " sqft_lot15 0.0\n", - " dtype: float64}" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "identify_issues(house_data_clean)" - ] - }, - 
{ - "cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
iddatepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontviewconditiongradesqft_abovesqft_basementyr_builtzipcodelatlongsqft_living15sqft_lot15
394718250690312014550000.041.75241084472.0NOGOODGood8 Good2060350.019369807447.6499-122.088252014789
2003886489001102014555000.032.50194032112.0NONONEAverage8 Good19400.020099802747.5644-122.09318803078
\n", - "
" - ], - "text/plain": [ - " id date price bedrooms bathrooms sqft_living sqft_lot \\\n", - "3947 1825069031 2014 550000.0 4 1.75 2410 8447 \n", - "20038 8648900110 2014 555000.0 3 2.50 1940 3211 \n", - "\n", - " floors waterfront view condition grade sqft_above sqft_basement \\\n", - "3947 2.0 NO GOOD Good 8 Good 2060 350.0 \n", - "20038 2.0 NO NONE Average 8 Good 1940 0.0 \n", - "\n", - " yr_built zipcode lat long sqft_living15 sqft_lot15 \n", - "3947 1936 98074 47.6499 -122.088 2520 14789 \n", - "20038 2009 98027 47.5644 -122.093 1880 3078 " - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "house_data_clean[house_data_clean.duplicated()]" - ] - } - ], - "metadata": { - "kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" - }, - "language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" - } - }, - "nbformat": 4, - "nbformat_minor": 2 + ], + "text/plain": [ + " id date price bedrooms bathrooms sqft_living \\\n", + "0 7129300520 10/13/2014 221900.0 3 1.00 1180 \n", + "1 6414100192 12/9/2014 538000.0 3 2.25 2570 \n", + "2 5631500400 2/25/2015 180000.0 2 1.00 770 \n", + "3 2487200875 12/9/2014 604000.0 4 3.00 1960 \n", + "4 1954400510 2/18/2015 510000.0 3 2.00 1680 \n", + "... ... ... ... ... ... ... \n", + "21592 263000018 5/21/2014 360000.0 3 2.50 1530 \n", + "21593 6600060120 2/23/2015 400000.0 4 2.50 2310 \n", + "21594 1523300141 6/23/2014 402101.0 2 0.75 1020 \n", + "21595 291310100 1/16/2015 400000.0 3 2.50 1600 \n", + "21596 1523300157 10/15/2014 325000.0 2 0.75 1020 \n", + "\n", + " sqft_lot floors waterfront view ... grade sqft_above \\\n", + "0 5650 1.0 NaN NONE ... 7 Average 1180 \n", + "1 7242 2.0 NO NONE ... 7 Average 2170 \n", + "2 10000 1.0 NO NONE ... 
6 Low Average 770 \n", + "3 5000 1.0 NO NONE ... 7 Average 1050 \n", + "4 8080 1.0 NO NONE ... 8 Good 1680 \n", + "... ... ... ... ... ... ... ... \n", + "21592 1131 3.0 NO NONE ... 8 Good 1530 \n", + "21593 5813 2.0 NO NONE ... 8 Good 2310 \n", + "21594 1350 2.0 NO NONE ... 7 Average 1020 \n", + "21595 2388 2.0 NaN NONE ... 8 Good 1600 \n", + "21596 1076 2.0 NO NONE ... 7 Average 1020 \n", + "\n", + " sqft_basement yr_built yr_renovated zipcode lat long \\\n", + "0 0.0 1955 0.0 98178 47.5112 -122.257 \n", + "1 400.0 1951 1991.0 98125 47.7210 -122.319 \n", + "2 0.0 1933 NaN 98028 47.7379 -122.233 \n", + "3 910.0 1965 0.0 98136 47.5208 -122.393 \n", + "4 0.0 1987 0.0 98074 47.6168 -122.045 \n", + "... ... ... ... ... ... ... \n", + "21592 0.0 2009 0.0 98103 47.6993 -122.346 \n", + "21593 0.0 2014 0.0 98146 47.5107 -122.362 \n", + "21594 0.0 2009 0.0 98144 47.5944 -122.299 \n", + "21595 0.0 2004 0.0 98027 47.5345 -122.069 \n", + "21596 0.0 2008 0.0 98144 47.5941 -122.299 \n", + "\n", + " sqft_living15 sqft_lot15 \n", + "0 1340 5650 \n", + "1 1690 7639 \n", + "2 2720 8062 \n", + "3 1360 5000 \n", + "4 1800 7503 \n", + "... ... ... \n", + "21592 1530 1509 \n", + "21593 1830 7200 \n", + "21594 1020 2007 \n", + "21595 1410 1287 \n", + "21596 1020 1357 \n", + "\n", + "[21597 rows x 21 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "load_data('data/kc_house_data.csv') # Assuming 'data' folder is in the same directory\n", + "\n", + "\n" + ] +}, +{ + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The dataset contains 21 columns, each representing a distinct feature, and 21,597 rows, with each row corresponding to a specific house sale entry.\n", + "\n", + "The dataset contains a mix of data types, including integers (int64), floating-point numbers (float64), and objects (strings). 
For instance, numerical features such as bedrooms, bathrooms, and sqft_living are represented as integers or floating-point numbers to facilitate mathematical computations, while categorical features like waterfront and view are stored as objects to accommodate text-based categories." + ] +}, +{ + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The dataset contains 21597 houses with 21 features\n", + "\n", + "Columns and their data types:\n", + "id: int64\n", + "date: object\n", + "price: float64\n", + "bedrooms: int64\n", + "bathrooms: float64\n", + "sqft_living: int64\n", + "sqft_lot: int64\n", + "floors: float64\n", + "waterfront: object\n", + "view: object\n", + "condition: object\n", + "grade: object\n", + "sqft_above: int64\n", + "sqft_basement: object\n", + "yr_built: int64\n", + "yr_renovated: float64\n", + "zipcode: int64\n", + "lat: float64\n", + "long: float64\n", + "sqft_living15: int64\n", + "sqft_lot15: int64\n", + "\n" + ] + } + ], + "source": [ + "kings_data = load_data('data/kc_house_data.csv')" + ] +}, +{ + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "#create a function that takes in a column and returns the column statistics as a dictionary\n", + "def descriptive_analytics(column):\n", + " stats_dict = column.describe().to_dict()\n", + " \n", + " print(\"Descriptive Statistics for Column '{}':\".format(column.name))\n", + " print(\"The count of the column is:\", stats_dict['count'])\n", + " print(\"The mean of the column is:\", stats_dict['mean'])\n", + " print(\"The standard deviation of the column is:\", stats_dict['std'])\n", + " print(\"The minimum value of the column is:\", stats_dict['min'])\n", + " print(\"The 25th percentile of the column is:\", stats_dict['25%'])\n", + " print(\"The median of the column is:\", stats_dict['50%'])\n", + " print(\"The 75th percentile of the column is:\", 
stats_dict['75%'])\n", + " print(\"The maximum value of the column is:\", stats_dict['max'])" + ] +}, +{ + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Descriptive Statistics for Column 'price':\n", + "The count of the column is: 21597.0\n", + "The mean of the column is: 540296.5735055795\n", + "The standard deviation of the column is: 367368.1401013936\n", + "The minimum value of the column is: 78000.0\n", + "The 25th percentile of the column is: 322000.0\n", + "The median of the column is: 450000.0\n", + "The 75th percentile of the column is: 645000.0\n", + "The maximum value of the column is: 7700000.0\n" + ] + } + ], + "source": [ + "descriptive_analytics(kings_data['price'])" + ] +}, +{ + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see that the maximum price of a house is 7700000 dollars and the minimum price is 78000 dollars\n", + "\n", + "There are 21597 prices regarding to the houses in the dataset\n", + "\n", + "Average price of a house is 540296.57 dollars" + ] +}, +{ + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Preperation\n", + "\n" + ] +}, +{ + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 21597 entries, 0 to 21596\n", + "Data columns (total 21 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 id 21597 non-null int64 \n", + " 1 date 21597 non-null object \n", + " 2 price 21597 non-null float64\n", + " 3 bedrooms 21597 non-null int64 \n", + " 4 bathrooms 21597 non-null float64\n", + " 5 sqft_living 21597 non-null int64 \n", + " 6 sqft_lot 21597 non-null int64 \n", + " 7 floors 21597 non-null float64\n", + " 8 waterfront 19221 non-null object \n", + " 9 view 21534 non-null object \n", + " 10 condition 21597 non-null object \n", + " 11 grade 21597 
non-null object \n", + " 12 sqft_above 21597 non-null int64 \n", + " 13 sqft_basement 21597 non-null object \n", + " 14 yr_built 21597 non-null int64 \n", + " 15 yr_renovated 17755 non-null float64\n", + " 16 zipcode 21597 non-null int64 \n", + " 17 lat 21597 non-null float64\n", + " 18 long 21597 non-null float64\n", + " 19 sqft_living15 21597 non-null int64 \n", + " 20 sqft_lot15 21597 non-null int64 \n", + "dtypes: float64(6), int64(9), object(6)\n", + "memory usage: 3.5+ MB\n" + ] + } + ], + "source": [ + "kings_data.info()" + ] +}, +{ + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "def identify_issues(dataset):\n", + " # Identify missing values as a percentage of the whole dataset\n", + " missing_values = (dataset.isnull().sum())/len(dataset) * 100\n", + "\n", + " # Identify duplicates\n", + " duplicates = dataset.duplicated().sum()\n", + " \n", + " #return a dictionary \n", + " return {'duplicates': duplicates,\n", + " 'missing values': missing_values.round(2)} \n" + ] +}, +{ + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'duplicates': 0,\n", + " 'missing values': id 0.00\n", + " date 0.00\n", + " price 0.00\n", + " bedrooms 0.00\n", + " bathrooms 0.00\n", + " sqft_living 0.00\n", + " sqft_lot 0.00\n", + " floors 0.00\n", + " waterfront 11.00\n", + " view 0.29\n", + " condition 0.00\n", + " grade 0.00\n", + " sqft_above 0.00\n", + " sqft_basement 0.00\n", + " yr_built 0.00\n", + " yr_renovated 17.79\n", + " zipcode 0.00\n", + " lat 0.00\n", + " long 0.00\n", + " sqft_living15 0.00\n", + " sqft_lot15 0.00\n", + " dtype: float64}" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "identify_issues(kings_data)" + ] +}, +{ + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The examination indicates that there are no duplicate entries within the dataset, ensuring the 
integrity of the records. However, attention is warranted to address missing values present in certain columns. Specifically, the 'waterfront' feature exhibits 11% of null values, representing a negligible portion of the dataset. Similarly, the 'yr_renovated' feature shows a relatively higher percentage of missing values, accounting for approximately 17.79% of the dataset" + ] +}, +{ + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Before making changes make a copy instead of overwriting data" + ] +}, +{ + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "house_data_clean = kings_data.copy()" + ] +}, +{ + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# Changing the date to date time\n", + "house_data_clean['date'] = pd.to_datetime(house_data_clean['date'])\n", + "\n", + "# Extracting only the year from the column Date\n", + "house_data_clean.date = house_data_clean['date'].dt.year\n", + "\n", + "# Changing the dates for the year built \n", + "house_data_clean['yr_built'] = pd.to_datetime(house_data_clean['yr_built'],format='%Y').dt.year\n" + ] +}, +{ + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The above code converts the 'date' column data to only contain the year the house was sold, for the purpose of analysis we will use only the year since the changes month by month will be minor." 
+ ] +}, +{ + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Dealing with the missing values" + ] +}, +{ + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "def missing_values(dataset):\n", + " # drop the rows from views\n", + " dataset.dropna(subset=['view'],inplace=True)\n", + "\n", + " # Filling the NaN values for waterfront with NO\n", + " dataset.waterfront.fillna('NO',inplace=True)\n", + " \n", + " # Dropping the yr_renovated column \n", + " dataset.drop('yr_renovated',axis=1,inplace=True)" + ] +}, +{ + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "missing_values(house_data_clean)" + ] +}, +{ + "cell_type": "markdown", + "metadata": {}, + "source": [ + "'yr_renovated' has the highest percentage of NaN values 17%. This will be dropped since it will not be used within our model inline with the business problem.\n", + "\n", + "'Waterfront' feature has 11% null values, this was filled with NO on the assumption that these cells were not filled since they lacked waterfronts\n", + "\n", + "For the 'View' column, the null values were dropped by row since the overall percentage impact would be minute" + ] +}, +{ + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'duplicates': 2,\n", + " 'missing values': id 0.0\n", + " date 0.0\n", + " price 0.0\n", + " bedrooms 0.0\n", + " bathrooms 0.0\n", + " sqft_living 0.0\n", + " sqft_lot 0.0\n", + " floors 0.0\n", + " waterfront 0.0\n", + " view 0.0\n", + " condition 0.0\n", + " grade 0.0\n", + " sqft_above 0.0\n", + " sqft_basement 0.0\n", + " yr_built 0.0\n", + " zipcode 0.0\n", + " lat 0.0\n", + " long 0.0\n", + " sqft_living15 0.0\n", + " sqft_lot15 0.0\n", + " dtype: float64}" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "identify_issues(house_data_clean)" + ] +}, +{ + 
"cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddatepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontviewconditiongradesqft_abovesqft_basementyr_builtzipcodelatlongsqft_living15sqft_lot15
394718250690312014550000.041.75241084472.0NOGOODGood8 Good2060350.019369807447.6499-122.088252014789
2003886489001102014555000.032.50194032112.0NONONEAverage8 Good19400.020099802747.5644-122.09318803078
\n", + "
" + ], + "text/plain": [ + " id date price bedrooms bathrooms sqft_living sqft_lot \\\n", + "3947 1825069031 2014 550000.0 4 1.75 2410 8447 \n", + "20038 8648900110 2014 555000.0 3 2.50 1940 3211 \n", + "\n", + " floors waterfront view condition grade sqft_above sqft_basement \\\n", + "3947 2.0 NO GOOD Good 8 Good 2060 350.0 \n", + "20038 2.0 NO NONE Average 8 Good 1940 0.0 \n", + "\n", + " yr_built zipcode lat long sqft_living15 sqft_lot15 \n", + "3947 1936 98074 47.6499 -122.088 2520 14789 \n", + "20038 2009 98027 47.5644 -122.093 1880 3078 " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "house_data_clean[house_data_clean.duplicated()]" + ] +} +], +"metadata": { +"kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" +}, +"language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" +} +}, +"nbformat": 4, +"nbformat_minor": 2 } From 89189c4f5e4bd353c9fe4c3db76a4319e4554327 Mon Sep 17 00:00:00 2001 From: clydeochieng <107258512+clydeochieng@users.noreply.github.com> Date: Wed, 1 May 2024 12:22:29 +0300 Subject: [PATCH 42/42] Update student.ipynb --- student.ipynb | 1152 +++++++++++++++++++++++++------------------------ 1 file changed, 588 insertions(+), 564 deletions(-) diff --git a/student.ipynb b/student.ipynb index 4a29c2f7..37939df3 100644 --- a/student.ipynb +++ b/student.ipynb @@ -6,7 +6,7 @@ "source": [ "## Final Project Submission\n", "\n", - "* Student name: Solphine Joseph, Grace Rotich, Mathew Kiprotich, Hilary Simiyu, Clyde Ochieng. \n", + "* Student name: Solphine Joseph, Grace Rotich, Mathew Kiprotich, Hilary Simiyu, Clyde Ochieng, Derrick Kiptoo. 
\n", "* Student pace: full time\n", "* Scheduled project review date/time: \n", "* Instructor name: Nikita \n", @@ -93,9 +93,33 @@ "\n" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Loading\n", + "\n", + "#### Import Necessary Libraries\n" + ] + }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "import matplotlib.pyplot as plt\n", + "%matplotlib inline\n", + "import numpy as np\n", + "import pandas as pd\n", + "import scipy.stats as stats\n", + "import seaborn as sns\n", + "import statsmodels.api as sm" + ] + }, + { + "cell_type": "code", + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -467,566 +491,566 @@ "\n", "

21597 rows × 21 columns

\n", "" - ], - "text/plain": [ - " id date price bedrooms bathrooms sqft_living \\\n", - "0 7129300520 10/13/2014 221900.0 3 1.00 1180 \n", - "1 6414100192 12/9/2014 538000.0 3 2.25 2570 \n", - "2 5631500400 2/25/2015 180000.0 2 1.00 770 \n", - "3 2487200875 12/9/2014 604000.0 4 3.00 1960 \n", - "4 1954400510 2/18/2015 510000.0 3 2.00 1680 \n", - "... ... ... ... ... ... ... \n", - "21592 263000018 5/21/2014 360000.0 3 2.50 1530 \n", - "21593 6600060120 2/23/2015 400000.0 4 2.50 2310 \n", - "21594 1523300141 6/23/2014 402101.0 2 0.75 1020 \n", - "21595 291310100 1/16/2015 400000.0 3 2.50 1600 \n", - "21596 1523300157 10/15/2014 325000.0 2 0.75 1020 \n", - "\n", - " sqft_lot floors waterfront view ... grade sqft_above \\\n", - "0 5650 1.0 NaN NONE ... 7 Average 1180 \n", - "1 7242 2.0 NO NONE ... 7 Average 2170 \n", - "2 10000 1.0 NO NONE ... 6 Low Average 770 \n", - "3 5000 1.0 NO NONE ... 7 Average 1050 \n", - "4 8080 1.0 NO NONE ... 8 Good 1680 \n", - "... ... ... ... ... ... ... ... \n", - "21592 1131 3.0 NO NONE ... 8 Good 1530 \n", - "21593 5813 2.0 NO NONE ... 8 Good 2310 \n", - "21594 1350 2.0 NO NONE ... 7 Average 1020 \n", - "21595 2388 2.0 NaN NONE ... 8 Good 1600 \n", - "21596 1076 2.0 NO NONE ... 7 Average 1020 \n", - "\n", - " sqft_basement yr_built yr_renovated zipcode lat long \\\n", - "0 0.0 1955 0.0 98178 47.5112 -122.257 \n", - "1 400.0 1951 1991.0 98125 47.7210 -122.319 \n", - "2 0.0 1933 NaN 98028 47.7379 -122.233 \n", - "3 910.0 1965 0.0 98136 47.5208 -122.393 \n", - "4 0.0 1987 0.0 98074 47.6168 -122.045 \n", - "... ... ... ... ... ... ... \n", - "21592 0.0 2009 0.0 98103 47.6993 -122.346 \n", - "21593 0.0 2014 0.0 98146 47.5107 -122.362 \n", - "21594 0.0 2009 0.0 98144 47.5944 -122.299 \n", - "21595 0.0 2004 0.0 98027 47.5345 -122.069 \n", - "21596 0.0 2008 0.0 98144 47.5941 -122.299 \n", - "\n", - " sqft_living15 sqft_lot15 \n", - "0 1340 5650 \n", - "1 1690 7639 \n", - "2 2720 8062 \n", - "3 1360 5000 \n", - "4 1800 7503 \n", - "... ... ... 
\n", - "21592 1530 1509 \n", - "21593 1830 7200 \n", - "21594 1020 2007 \n", - "21595 1410 1287 \n", - "21596 1020 1357 \n", - "\n", - "[21597 rows x 21 columns]" - ] - }, - "execution_count": 3, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "load_data('data/kc_house_data.csv') # Assuming 'data' folder is in the same directory\n", - "\n", - "\n" - ] -}, -{ - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The dataset contains 21 columns, each representing a distinct feature, and 21,597 rows, with each row corresponding to a specific house sale entry.\n", - "\n", - "The dataset contains a mix of data types, including integers (int64), floating-point numbers (float64), and objects (strings). For instance, numerical features such as bedrooms, bathrooms, and sqft_living are represented as integers or floating-point numbers to facilitate mathematical computations, while categorical features like waterfront and view are stored as objects to accommodate text-based categories." 
- ] -}, -{ - "cell_type": "code", - "execution_count": 4, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "The dataset contains 21597 houses with 21 features\n", - "\n", - "Columns and their data types:\n", - "id: int64\n", - "date: object\n", - "price: float64\n", - "bedrooms: int64\n", - "bathrooms: float64\n", - "sqft_living: int64\n", - "sqft_lot: int64\n", - "floors: float64\n", - "waterfront: object\n", - "view: object\n", - "condition: object\n", - "grade: object\n", - "sqft_above: int64\n", - "sqft_basement: object\n", - "yr_built: int64\n", - "yr_renovated: float64\n", - "zipcode: int64\n", - "lat: float64\n", - "long: float64\n", - "sqft_living15: int64\n", - "sqft_lot15: int64\n", - "\n" - ] - } - ], - "source": [ - "kings_data = load_data('data/kc_house_data.csv')" - ] -}, -{ - "cell_type": "code", - "execution_count": 5, - "metadata": {}, - "outputs": [], - "source": [ - "#create a function that takes in a column and returns the column statistics as a dictionary\n", - "def descriptive_analytics(column):\n", - " stats_dict = column.describe().to_dict()\n", - " \n", - " print(\"Descriptive Statistics for Column '{}':\".format(column.name))\n", - " print(\"The count of the column is:\", stats_dict['count'])\n", - " print(\"The mean of the column is:\", stats_dict['mean'])\n", - " print(\"The standard deviation of the column is:\", stats_dict['std'])\n", - " print(\"The minimum value of the column is:\", stats_dict['min'])\n", - " print(\"The 25th percentile of the column is:\", stats_dict['25%'])\n", - " print(\"The median of the column is:\", stats_dict['50%'])\n", - " print(\"The 75th percentile of the column is:\", stats_dict['75%'])\n", - " print(\"The maximum value of the column is:\", stats_dict['max'])" - ] -}, -{ - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Descriptive Statistics for Column 
'price':\n", - "The count of the column is: 21597.0\n", - "The mean of the column is: 540296.5735055795\n", - "The standard deviation of the column is: 367368.1401013936\n", - "The minimum value of the column is: 78000.0\n", - "The 25th percentile of the column is: 322000.0\n", - "The median of the column is: 450000.0\n", - "The 75th percentile of the column is: 645000.0\n", - "The maximum value of the column is: 7700000.0\n" - ] - } - ], - "source": [ - "descriptive_analytics(kings_data['price'])" - ] -}, -{ - "cell_type": "markdown", - "metadata": {}, - "source": [ - "We can see that the maximum price of a house is 7700000 dollars and the minimum price is 78000 dollars\n", - "\n", - "There are 21597 prices regarding to the houses in the dataset\n", - "\n", - "Average price of a house is 540296.57 dollars" - ] -}, -{ - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## Data Preperation\n", - "\n" - ] -}, -{ - "cell_type": "code", - "execution_count": 7, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "RangeIndex: 21597 entries, 0 to 21596\n", - "Data columns (total 21 columns):\n", - " # Column Non-Null Count Dtype \n", - "--- ------ -------------- ----- \n", - " 0 id 21597 non-null int64 \n", - " 1 date 21597 non-null object \n", - " 2 price 21597 non-null float64\n", - " 3 bedrooms 21597 non-null int64 \n", - " 4 bathrooms 21597 non-null float64\n", - " 5 sqft_living 21597 non-null int64 \n", - " 6 sqft_lot 21597 non-null int64 \n", - " 7 floors 21597 non-null float64\n", - " 8 waterfront 19221 non-null object \n", - " 9 view 21534 non-null object \n", - " 10 condition 21597 non-null object \n", - " 11 grade 21597 non-null object \n", - " 12 sqft_above 21597 non-null int64 \n", - " 13 sqft_basement 21597 non-null object \n", - " 14 yr_built 21597 non-null int64 \n", - " 15 yr_renovated 17755 non-null float64\n", - " 16 zipcode 21597 non-null int64 \n", - " 17 lat 21597 non-null float64\n", - " 
18 long 21597 non-null float64\n", - " 19 sqft_living15 21597 non-null int64 \n", - " 20 sqft_lot15 21597 non-null int64 \n", - "dtypes: float64(6), int64(9), object(6)\n", - "memory usage: 3.5+ MB\n" - ] - } - ], - "source": [ - "kings_data.info()" - ] -}, -{ - "cell_type": "code", - "execution_count": 8, - "metadata": {}, - "outputs": [], - "source": [ - "def identify_issues(dataset):\n", - " # Identify missing values as a percentage of the whole dataset\n", - " missing_values = (dataset.isnull().sum())/len(dataset) * 100\n", - "\n", - " # Identify duplicates\n", - " duplicates = dataset.duplicated().sum()\n", - " \n", - " #return a dictionary \n", - " return {'duplicates': duplicates,\n", - " 'missing values': missing_values.round(2)} \n" - ] -}, -{ - "cell_type": "code", - "execution_count": 9, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'duplicates': 0,\n", - " 'missing values': id 0.00\n", - " date 0.00\n", - " price 0.00\n", - " bedrooms 0.00\n", - " bathrooms 0.00\n", - " sqft_living 0.00\n", - " sqft_lot 0.00\n", - " floors 0.00\n", - " waterfront 11.00\n", - " view 0.29\n", - " condition 0.00\n", - " grade 0.00\n", - " sqft_above 0.00\n", - " sqft_basement 0.00\n", - " yr_built 0.00\n", - " yr_renovated 17.79\n", - " zipcode 0.00\n", - " lat 0.00\n", - " long 0.00\n", - " sqft_living15 0.00\n", - " sqft_lot15 0.00\n", - " dtype: float64}" - ] - }, - "execution_count": 9, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "identify_issues(kings_data)" - ] -}, -{ - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The examination indicates that there are no duplicate entries within the dataset, ensuring the integrity of the records. However, attention is warranted to address missing values present in certain columns. Specifically, the 'waterfront' feature exhibits 11% of null values, representing a negligible portion of the dataset. 
Similarly, the 'yr_renovated' feature shows a relatively higher percentage of missing values, accounting for approximately 17.79% of the dataset" - ] -}, -{ - "cell_type": "markdown", - "metadata": {}, - "source": [ - "##### Before making changes make a copy instead of overwriting data" - ] -}, -{ - "cell_type": "code", - "execution_count": 10, - "metadata": {}, - "outputs": [], - "source": [ - "house_data_clean = kings_data.copy()" - ] -}, -{ - "cell_type": "code", - "execution_count": 11, - "metadata": {}, - "outputs": [], - "source": [ - "# Changing the date to date time\n", - "house_data_clean['date'] = pd.to_datetime(house_data_clean['date'])\n", - "\n", - "# Extracting only the year from the column Date\n", - "house_data_clean.date = house_data_clean['date'].dt.year\n", - "\n", - "# Changing the dates for the year built \n", - "house_data_clean['yr_built'] = pd.to_datetime(house_data_clean['yr_built'],format='%Y').dt.year\n" - ] -}, -{ - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The above code converts the 'date' column data to only contain the year the house was sold, for the purpose of analysis we will use only the year since the changes month by month will be minor." 
- ] -}, -{ - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### Dealing with the missing values" - ] -}, -{ - "cell_type": "code", - "execution_count": 12, - "metadata": {}, - "outputs": [], - "source": [ - "def missing_values(dataset):\n", - " # drop the rows from views\n", - " dataset.dropna(subset=['view'],inplace=True)\n", - "\n", - " # Filling the NaN values for waterfront with NO\n", - " dataset.waterfront.fillna('NO',inplace=True)\n", - " \n", - " # Dropping the yr_renovated column \n", - " dataset.drop('yr_renovated',axis=1,inplace=True)" - ] -}, -{ - "cell_type": "code", - "execution_count": 13, - "metadata": {}, - "outputs": [], - "source": [ - "missing_values(house_data_clean)" - ] -}, -{ - "cell_type": "markdown", - "metadata": {}, - "source": [ - "'yr_renovated' has the highest percentage of NaN values 17%. This will be dropped since it will not be used within our model inline with the business problem.\n", - "\n", - "'Waterfront' feature has 11% null values, this was filled with NO on the assumption that these cells were not filled since they lacked waterfronts\n", - "\n", - "For the 'View' column, the null values were dropped by row since the overall percentage impact would be minute" - ] -}, -{ - "cell_type": "code", - "execution_count": 14, - "metadata": {}, - "outputs": [ - { - "data": { - "text/plain": [ - "{'duplicates': 2,\n", - " 'missing values': id 0.0\n", - " date 0.0\n", - " price 0.0\n", - " bedrooms 0.0\n", - " bathrooms 0.0\n", - " sqft_living 0.0\n", - " sqft_lot 0.0\n", - " floors 0.0\n", - " waterfront 0.0\n", - " view 0.0\n", - " condition 0.0\n", - " grade 0.0\n", - " sqft_above 0.0\n", - " sqft_basement 0.0\n", - " yr_built 0.0\n", - " zipcode 0.0\n", - " lat 0.0\n", - " long 0.0\n", - " sqft_living15 0.0\n", - " sqft_lot15 0.0\n", - " dtype: float64}" - ] - }, - "execution_count": 14, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "identify_issues(house_data_clean)" - ] -}, -{ - 
"cell_type": "code", - "execution_count": 15, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
\n", - "\n", - "\n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - " \n", - "
iddatepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontviewconditiongradesqft_abovesqft_basementyr_builtzipcodelatlongsqft_living15sqft_lot15
394718250690312014550000.041.75241084472.0NOGOODGood8 Good2060350.019369807447.6499-122.088252014789
2003886489001102014555000.032.50194032112.0NONONEAverage8 Good19400.020099802747.5644-122.09318803078
\n", - "
" - ], - "text/plain": [ - " id date price bedrooms bathrooms sqft_living sqft_lot \\\n", - "3947 1825069031 2014 550000.0 4 1.75 2410 8447 \n", - "20038 8648900110 2014 555000.0 3 2.50 1940 3211 \n", - "\n", - " floors waterfront view condition grade sqft_above sqft_basement \\\n", - "3947 2.0 NO GOOD Good 8 Good 2060 350.0 \n", - "20038 2.0 NO NONE Average 8 Good 1940 0.0 \n", - "\n", - " yr_built zipcode lat long sqft_living15 sqft_lot15 \n", - "3947 1936 98074 47.6499 -122.088 2520 14789 \n", - "20038 2009 98027 47.5644 -122.093 1880 3078 " - ] - }, - "execution_count": 15, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "house_data_clean[house_data_clean.duplicated()]" - ] -} -], -"metadata": { -"kernelspec": { - "display_name": "Python 3", - "language": "python", - "name": "python3" -}, -"language_info": { - "codemirror_mode": { - "name": "ipython", - "version": 3 - }, - "file_extension": ".py", - "mimetype": "text/x-python", - "name": "python", - "nbconvert_exporter": "python", - "pygments_lexer": "ipython3", - "version": "3.10.13" -} -}, -"nbformat": 4, -"nbformat_minor": 2 + ], + "text/plain": [ + " id date price bedrooms bathrooms sqft_living \\\n", + "0 7129300520 10/13/2014 221900.0 3 1.00 1180 \n", + "1 6414100192 12/9/2014 538000.0 3 2.25 2570 \n", + "2 5631500400 2/25/2015 180000.0 2 1.00 770 \n", + "3 2487200875 12/9/2014 604000.0 4 3.00 1960 \n", + "4 1954400510 2/18/2015 510000.0 3 2.00 1680 \n", + "... ... ... ... ... ... ... \n", + "21592 263000018 5/21/2014 360000.0 3 2.50 1530 \n", + "21593 6600060120 2/23/2015 400000.0 4 2.50 2310 \n", + "21594 1523300141 6/23/2014 402101.0 2 0.75 1020 \n", + "21595 291310100 1/16/2015 400000.0 3 2.50 1600 \n", + "21596 1523300157 10/15/2014 325000.0 2 0.75 1020 \n", + "\n", + " sqft_lot floors waterfront view ... grade sqft_above \\\n", + "0 5650 1.0 NaN NONE ... 7 Average 1180 \n", + "1 7242 2.0 NO NONE ... 7 Average 2170 \n", + "2 10000 1.0 NO NONE ... 
6 Low Average 770 \n", + "3 5000 1.0 NO NONE ... 7 Average 1050 \n", + "4 8080 1.0 NO NONE ... 8 Good 1680 \n", + "... ... ... ... ... ... ... ... \n", + "21592 1131 3.0 NO NONE ... 8 Good 1530 \n", + "21593 5813 2.0 NO NONE ... 8 Good 2310 \n", + "21594 1350 2.0 NO NONE ... 7 Average 1020 \n", + "21595 2388 2.0 NaN NONE ... 8 Good 1600 \n", + "21596 1076 2.0 NO NONE ... 7 Average 1020 \n", + "\n", + " sqft_basement yr_built yr_renovated zipcode lat long \\\n", + "0 0.0 1955 0.0 98178 47.5112 -122.257 \n", + "1 400.0 1951 1991.0 98125 47.7210 -122.319 \n", + "2 0.0 1933 NaN 98028 47.7379 -122.233 \n", + "3 910.0 1965 0.0 98136 47.5208 -122.393 \n", + "4 0.0 1987 0.0 98074 47.6168 -122.045 \n", + "... ... ... ... ... ... ... \n", + "21592 0.0 2009 0.0 98103 47.6993 -122.346 \n", + "21593 0.0 2014 0.0 98146 47.5107 -122.362 \n", + "21594 0.0 2009 0.0 98144 47.5944 -122.299 \n", + "21595 0.0 2004 0.0 98027 47.5345 -122.069 \n", + "21596 0.0 2008 0.0 98144 47.5941 -122.299 \n", + "\n", + " sqft_living15 sqft_lot15 \n", + "0 1340 5650 \n", + "1 1690 7639 \n", + "2 2720 8062 \n", + "3 1360 5000 \n", + "4 1800 7503 \n", + "... ... ... \n", + "21592 1530 1509 \n", + "21593 1830 7200 \n", + "21594 1020 2007 \n", + "21595 1410 1287 \n", + "21596 1020 1357 \n", + "\n", + "[21597 rows x 21 columns]" + ] + }, + "execution_count": 3, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "load_data('data/kc_house_data.csv') # Assuming 'data' folder is in the same directory\n", + "\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The dataset contains 21 columns, each representing a distinct feature, and 21,597 rows, with each row corresponding to a specific house sale entry.\n", + "\n", + "The dataset contains a mix of data types, including integers (int64), floating-point numbers (float64), and objects (strings). 
For instance, numerical features such as bedrooms, bathrooms, and sqft_living are represented as integers or floating-point numbers to facilitate mathematical computations, while categorical features like waterfront and view are stored as objects to accommodate text-based categories." + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "The dataset contains 21597 houses with 21 features\n", + "\n", + "Columns and their data types:\n", + "id: int64\n", + "date: object\n", + "price: float64\n", + "bedrooms: int64\n", + "bathrooms: float64\n", + "sqft_living: int64\n", + "sqft_lot: int64\n", + "floors: float64\n", + "waterfront: object\n", + "view: object\n", + "condition: object\n", + "grade: object\n", + "sqft_above: int64\n", + "sqft_basement: object\n", + "yr_built: int64\n", + "yr_renovated: float64\n", + "zipcode: int64\n", + "lat: float64\n", + "long: float64\n", + "sqft_living15: int64\n", + "sqft_lot15: int64\n", + "\n" + ] + } + ], + "source": [ + "kings_data = load_data('data/kc_house_data.csv')" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": {}, + "outputs": [], + "source": [ + "#create a function that takes in a column and returns the column statistics as a dictionary\n", + "def descriptive_analytics(column):\n", + " stats_dict = column.describe().to_dict()\n", + " \n", + " print(\"Descriptive Statistics for Column '{}':\".format(column.name))\n", + " print(\"The count of the column is:\", stats_dict['count'])\n", + " print(\"The mean of the column is:\", stats_dict['mean'])\n", + " print(\"The standard deviation of the column is:\", stats_dict['std'])\n", + " print(\"The minimum value of the column is:\", stats_dict['min'])\n", + " print(\"The 25th percentile of the column is:\", stats_dict['25%'])\n", + " print(\"The median of the column is:\", stats_dict['50%'])\n", + " print(\"The 75th percentile of the column is:\", 
stats_dict['75%'])\n", + " print(\"The maximum value of the column is:\", stats_dict['max'])" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Descriptive Statistics for Column 'price':\n", + "The count of the column is: 21597.0\n", + "The mean of the column is: 540296.5735055795\n", + "The standard deviation of the column is: 367368.1401013936\n", + "The minimum value of the column is: 78000.0\n", + "The 25th percentile of the column is: 322000.0\n", + "The median of the column is: 450000.0\n", + "The 75th percentile of the column is: 645000.0\n", + "The maximum value of the column is: 7700000.0\n" + ] + } + ], + "source": [ + "descriptive_analytics(kings_data['price'])" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "We can see that the maximum price of a house is 7700000 dollars and the minimum price is 78000 dollars\n", + "\n", + "There are 21597 prices regarding to the houses in the dataset\n", + "\n", + "Average price of a house is 540296.57 dollars" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Data Preperation\n", + "\n" + ] + }, + { + "cell_type": "code", + "execution_count": 7, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 21597 entries, 0 to 21596\n", + "Data columns (total 21 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 id 21597 non-null int64 \n", + " 1 date 21597 non-null object \n", + " 2 price 21597 non-null float64\n", + " 3 bedrooms 21597 non-null int64 \n", + " 4 bathrooms 21597 non-null float64\n", + " 5 sqft_living 21597 non-null int64 \n", + " 6 sqft_lot 21597 non-null int64 \n", + " 7 floors 21597 non-null float64\n", + " 8 waterfront 19221 non-null object \n", + " 9 view 21534 non-null object \n", + " 10 condition 21597 non-null object \n", + " 11 
grade 21597 non-null object \n", + " 12 sqft_above 21597 non-null int64 \n", + " 13 sqft_basement 21597 non-null object \n", + " 14 yr_built 21597 non-null int64 \n", + " 15 yr_renovated 17755 non-null float64\n", + " 16 zipcode 21597 non-null int64 \n", + " 17 lat 21597 non-null float64\n", + " 18 long 21597 non-null float64\n", + " 19 sqft_living15 21597 non-null int64 \n", + " 20 sqft_lot15 21597 non-null int64 \n", + "dtypes: float64(6), int64(9), object(6)\n", + "memory usage: 3.5+ MB\n" + ] + } + ], + "source": [ + "kings_data.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": {}, + "outputs": [], + "source": [ + "def identify_issues(dataset):\n", + " # Identify missing values as a percentage of the whole dataset\n", + " missing_values = (dataset.isnull().sum())/len(dataset) * 100\n", + "\n", + " # Identify duplicates\n", + " duplicates = dataset.duplicated().sum()\n", + " \n", + " #return a dictionary \n", + " return {'duplicates': duplicates,\n", + " 'missing values': missing_values.round(2)} \n" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'duplicates': 0,\n", + " 'missing values': id 0.00\n", + " date 0.00\n", + " price 0.00\n", + " bedrooms 0.00\n", + " bathrooms 0.00\n", + " sqft_living 0.00\n", + " sqft_lot 0.00\n", + " floors 0.00\n", + " waterfront 11.00\n", + " view 0.29\n", + " condition 0.00\n", + " grade 0.00\n", + " sqft_above 0.00\n", + " sqft_basement 0.00\n", + " yr_built 0.00\n", + " yr_renovated 17.79\n", + " zipcode 0.00\n", + " lat 0.00\n", + " long 0.00\n", + " sqft_living15 0.00\n", + " sqft_lot15 0.00\n", + " dtype: float64}" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "identify_issues(kings_data)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The examination indicates that there are no duplicate entries within the dataset, 
ensuring the integrity of the records. However, attention is warranted to address missing values present in certain columns. Specifically, the 'waterfront' feature exhibits 11% of null values, representing a negligible portion of the dataset. Similarly, the 'yr_renovated' feature shows a relatively higher percentage of missing values, accounting for approximately 17.79% of the dataset" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "##### Before making changes make a copy instead of overwriting data" + ] + }, + { + "cell_type": "code", + "execution_count": 10, + "metadata": {}, + "outputs": [], + "source": [ + "house_data_clean = kings_data.copy()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [], + "source": [ + "# Changing the date to date time\n", + "house_data_clean['date'] = pd.to_datetime(house_data_clean['date'])\n", + "\n", + "# Extracting only the year from the column Date\n", + "house_data_clean.date = house_data_clean['date'].dt.year\n", + "\n", + "# Changing the dates for the year built \n", + "house_data_clean['yr_built'] = pd.to_datetime(house_data_clean['yr_built'],format='%Y').dt.year\n" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "The above code converts the 'date' column data to only contain the year the house was sold, for the purpose of analysis we will use only the year since the changes month by month will be minor." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Dealing with the missing values" + ] + }, + { + "cell_type": "code", + "execution_count": 12, + "metadata": {}, + "outputs": [], + "source": [ + "def missing_values(dataset):\n", + " # drop the rows from views\n", + " dataset.dropna(subset=['view'],inplace=True)\n", + "\n", + " # Filling the NaN values for waterfront with NO\n", + " dataset.waterfront.fillna('NO',inplace=True)\n", + " \n", + " # Dropping the yr_renovated column \n", + " dataset.drop('yr_renovated',axis=1,inplace=True)" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": {}, + "outputs": [], + "source": [ + "missing_values(house_data_clean)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "'yr_renovated' has the highest percentage of NaN values 17%. This will be dropped since it will not be used within our model inline with the business problem.\n", + "\n", + "'Waterfront' feature has 11% null values, this was filled with NO on the assumption that these cells were not filled since they lacked waterfronts\n", + "\n", + "For the 'View' column, the null values were dropped by row since the overall percentage impact would be minute" + ] + }, + { + "cell_type": "code", + "execution_count": 14, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{'duplicates': 2,\n", + " 'missing values': id 0.0\n", + " date 0.0\n", + " price 0.0\n", + " bedrooms 0.0\n", + " bathrooms 0.0\n", + " sqft_living 0.0\n", + " sqft_lot 0.0\n", + " floors 0.0\n", + " waterfront 0.0\n", + " view 0.0\n", + " condition 0.0\n", + " grade 0.0\n", + " sqft_above 0.0\n", + " sqft_basement 0.0\n", + " yr_built 0.0\n", + " zipcode 0.0\n", + " lat 0.0\n", + " long 0.0\n", + " sqft_living15 0.0\n", + " sqft_lot15 0.0\n", + " dtype: float64}" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "identify_issues(house_data_clean)" + ] + }, + 
{ + "cell_type": "code", + "execution_count": 15, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddatepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontviewconditiongradesqft_abovesqft_basementyr_builtzipcodelatlongsqft_living15sqft_lot15
394718250690312014550000.041.75241084472.0NOGOODGood8 Good2060350.019369807447.6499-122.088252014789
2003886489001102014555000.032.50194032112.0NONONEAverage8 Good19400.020099802747.5644-122.09318803078
\n", + "
" + ], + "text/plain": [ + " id date price bedrooms bathrooms sqft_living sqft_lot \\\n", + "3947 1825069031 2014 550000.0 4 1.75 2410 8447 \n", + "20038 8648900110 2014 555000.0 3 2.50 1940 3211 \n", + "\n", + " floors waterfront view condition grade sqft_above sqft_basement \\\n", + "3947 2.0 NO GOOD Good 8 Good 2060 350.0 \n", + "20038 2.0 NO NONE Average 8 Good 1940 0.0 \n", + "\n", + " yr_built zipcode lat long sqft_living15 sqft_lot15 \n", + "3947 1936 98074 47.6499 -122.088 2520 14789 \n", + "20038 2009 98027 47.5644 -122.093 1880 3078 " + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "house_data_clean[house_data_clean.duplicated()]" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.10.13" + } + }, + "nbformat": 4, + "nbformat_minor": 2 }