From e56af21b412a96d1869786e1a2321319173d8009 Mon Sep 17 00:00:00 2001 From: Jakkkc Date: Sun, 28 Apr 2024 14:10:08 +0300 Subject: [PATCH 1/3] project init --- student.ipynb | 254 +++++++++++++++++++++++++++++++++++++++++++++++++- 1 file changed, 251 insertions(+), 3 deletions(-) diff --git a/student.ipynb b/student.ipynb index d3bb34af..bd046b77 100644 --- a/student.ipynb +++ b/student.ipynb @@ -16,12 +16,260 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "# Your code here - remember to use markdown cells for comments as well!\n", + "import pandas as pd" + ] + }, + { + "cell_type": "code", + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ - "# Your code here - remember to use markdown cells for comments as well!" + "def load_data(filepath):\n", + "# read csv file\n", + " data = pd.read_csv(filepath)\n", + " \n", + " \n", + " \n", + " return data" ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": {}, + "outputs": [], + "source": [ + "df = load_data(\"data/kc_house_data.csv\")" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
iddatepricebedroomsbathroomssqft_livingsqft_lotfloorswaterfrontview...gradesqft_abovesqft_basementyr_builtyr_renovatedzipcodelatlongsqft_living15sqft_lot15
0712930052010/13/2014221900.031.00118056501.0NaNNONE...7 Average11800.019550.09817847.5112-122.25713405650
1641410019212/9/2014538000.032.25257072422.0NONONE...7 Average2170400.019511991.09812547.7210-122.31916907639
256315004002/25/2015180000.021.00770100001.0NONONE...6 Low Average7700.01933NaN9802847.7379-122.23327208062
3248720087512/9/2014604000.043.00196050001.0NONONE...7 Average1050910.019650.09813647.5208-122.39313605000
419544005102/18/2015510000.032.00168080801.0NONONE...8 Good16800.019870.09807447.6168-122.04518007503
\n", + "

5 rows × 21 columns

\n", + "
" + ], + "text/plain": [ + " id date price bedrooms bathrooms sqft_living \\\n", + "0 7129300520 10/13/2014 221900.0 3 1.00 1180 \n", + "1 6414100192 12/9/2014 538000.0 3 2.25 2570 \n", + "2 5631500400 2/25/2015 180000.0 2 1.00 770 \n", + "3 2487200875 12/9/2014 604000.0 4 3.00 1960 \n", + "4 1954400510 2/18/2015 510000.0 3 2.00 1680 \n", + "\n", + " sqft_lot floors waterfront view ... grade sqft_above \\\n", + "0 5650 1.0 NaN NONE ... 7 Average 1180 \n", + "1 7242 2.0 NO NONE ... 7 Average 2170 \n", + "2 10000 1.0 NO NONE ... 6 Low Average 770 \n", + "3 5000 1.0 NO NONE ... 7 Average 1050 \n", + "4 8080 1.0 NO NONE ... 8 Good 1680 \n", + "\n", + " sqft_basement yr_built yr_renovated zipcode lat long \\\n", + "0 0.0 1955 0.0 98178 47.5112 -122.257 \n", + "1 400.0 1951 1991.0 98125 47.7210 -122.319 \n", + "2 0.0 1933 NaN 98028 47.7379 -122.233 \n", + "3 910.0 1965 0.0 98136 47.5208 -122.393 \n", + "4 0.0 1987 0.0 98074 47.6168 -122.045 \n", + "\n", + " sqft_living15 sqft_lot15 \n", + "0 1340 5650 \n", + "1 1690 7639 \n", + "2 2720 8062 \n", + "3 1360 5000 \n", + "4 1800 7503 \n", + "\n", + "[5 rows x 21 columns]" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df.head()" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] } ], "metadata": { @@ -40,7 +288,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.6.4" + "version": "3.8.5" } }, "nbformat": 4, From 80089f85141adb8508bd310df0e8507419736e31 Mon Sep 17 00:00:00 2001 From: Jakkkc Date: Sun, 28 Apr 2024 14:56:19 +0300 Subject: [PATCH 2/3] look for inconsistencies --- student.ipynb | 214 ++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 209 insertions(+), 5 deletions(-) diff --git a/student.ipynb b/student.ipynb index bd046b77..864cd41a 100644 --- a/student.ipynb +++ b/student.ipynb @@ -16,7 +16,7 @@ }, { "cell_type": "code", - "execution_count": 1, + "execution_count": 2, "metadata": {}, "outputs": [], "source": [ @@ -26,7 +26,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": 3, "metadata": {}, "outputs": [], "source": [ @@ -41,7 +41,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": 4, "metadata": {}, "outputs": [], "source": [ @@ -50,7 +50,7 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -255,7 +255,7 @@ "[5 rows x 21 columns]" ] }, - "execution_count": 4, + "execution_count": 5, "metadata": {}, "output_type": "execute_result" } @@ -264,6 +264,210 @@ "df.head()" ] }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": {}, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "\n", + "RangeIndex: 21597 entries, 0 to 21596\n", + "Data columns (total 21 columns):\n", + " # Column Non-Null Count Dtype \n", + "--- ------ -------------- ----- \n", + " 0 id 21597 non-null int64 \n", + " 1 date 21597 non-null object \n", + " 2 price 21597 non-null float64\n", + " 3 bedrooms 21597 non-null int64 \n", + " 4 bathrooms 21597 non-null float64\n", + " 5 sqft_living 21597 non-null int64 \n", + " 6 sqft_lot 21597 non-null int64 \n", + " 7 floors 21597 non-null float64\n", + " 8 waterfront 19221 non-null object \n", + " 9 view 21534 non-null object \n", + " 10 condition 21597 non-null object \n", + " 11 grade 21597 non-null object \n", + " 12 sqft_above 21597 non-null int64 \n", + " 13 sqft_basement 21597 non-null object \n", + " 14 yr_built 21597 non-null int64 \n", + " 15 yr_renovated 17755 non-null float64\n", + " 16 zipcode 21597 non-null int64 \n", + " 17 lat 21597 non-null float64\n", + " 18 long 21597 non-null float64\n", + " 19 sqft_living15 21597 non-null int64 \n", + " 20 sqft_lot15 21597 non-null int64 \n", + "dtypes: float64(6), int64(9), object(6)\n", + "memory usage: 3.5+ MB\n" + ] + } + ], + "source": [ + "df.info()" + ] + }, + { + "cell_type": "code", + "execution_count": 11, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
bedroomsbathroomssqft_livingsqft_lotfloorssqft_abovesqft_living15sqft_lot15
count21597.00000021597.00000021597.0000002.159700e+0421597.00000021597.00000021597.00000021597.000000
mean3.3732002.1158262080.3218501.509941e+041.4940961788.5968421986.62031812758.283512
std0.9262990.768984918.1061254.141264e+040.539683827.759761685.23047227274.441950
min1.0000000.500000370.0000005.200000e+021.000000370.000000399.000000651.000000
25%3.0000001.7500001430.0000005.040000e+031.0000001190.0000001490.0000005100.000000
50%3.0000002.2500001910.0000007.618000e+031.5000001560.0000001840.0000007620.000000
75%4.0000002.5000002550.0000001.068500e+042.0000002210.0000002360.00000010083.000000
max33.0000008.00000013540.0000001.651359e+063.5000009410.0000006210.000000871200.000000
\n", + "
" + ], + "text/plain": [ + " bedrooms bathrooms sqft_living sqft_lot floors \\\n", + "count 21597.000000 21597.000000 21597.000000 2.159700e+04 21597.000000 \n", + "mean 3.373200 2.115826 2080.321850 1.509941e+04 1.494096 \n", + "std 0.926299 0.768984 918.106125 4.141264e+04 0.539683 \n", + "min 1.000000 0.500000 370.000000 5.200000e+02 1.000000 \n", + "25% 3.000000 1.750000 1430.000000 5.040000e+03 1.000000 \n", + "50% 3.000000 2.250000 1910.000000 7.618000e+03 1.500000 \n", + "75% 4.000000 2.500000 2550.000000 1.068500e+04 2.000000 \n", + "max 33.000000 8.000000 13540.000000 1.651359e+06 3.500000 \n", + "\n", + " sqft_above sqft_living15 sqft_lot15 \n", + "count 21597.000000 21597.000000 21597.000000 \n", + "mean 1788.596842 1986.620318 12758.283512 \n", + "std 827.759761 685.230472 27274.441950 \n", + "min 370.000000 399.000000 651.000000 \n", + "25% 1190.000000 1490.000000 5100.000000 \n", + "50% 1560.000000 1840.000000 7620.000000 \n", + "75% 2210.000000 2360.000000 10083.000000 \n", + "max 9410.000000 6210.000000 871200.000000 " + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "df[[\"bedrooms\",\"bathrooms\",\"sqft_living\",\"sqft_lot\",\"floors\",\"sqft_above\",\"sqft_basement\",\"sqft_living15\",\"sqft_lot15\"]].describe()" + ] + }, { "cell_type": "code", "execution_count": null, From 9b2ba2dba44e554bf429a7c237dbc98ec6127793 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?=E2=80=9CIan?= Date: Sun, 28 Apr 2024 15:55:15 +0300 Subject: [PATCH 3/3] Check for null values --- student.ipynb | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/student.ipynb b/student.ipynb index 864cd41a..8854a316 100644 --- a/student.ipynb +++ b/student.ipynb @@ -478,7 +478,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3", + "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, @@ -492,9 +492,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.5" + "version": "3.11.7" } }, "nbformat": 4, - "nbformat_minor": 2 + "nbformat_minor": 4 }