From 7a2339748d8abb62ca2117bfaeb23f089aede23f Mon Sep 17 00:00:00 2001 From: Trevor Lyon Date: Wed, 24 Jun 2020 10:56:18 -0600 Subject: [PATCH 1/4] FEAT: Add Utah scraper --- .../datasets/official/UT/__init__.py | 1 + src/cmdc_tools/datasets/official/UT/data.py | 33 +++++++++++++++++++ 2 files changed, 34 insertions(+) create mode 100644 src/cmdc_tools/datasets/official/UT/__init__.py create mode 100644 src/cmdc_tools/datasets/official/UT/data.py diff --git a/src/cmdc_tools/datasets/official/UT/__init__.py b/src/cmdc_tools/datasets/official/UT/__init__.py new file mode 100644 index 0000000..14bcd55 --- /dev/null +++ b/src/cmdc_tools/datasets/official/UT/__init__.py @@ -0,0 +1 @@ +from .data import Utah diff --git a/src/cmdc_tools/datasets/official/UT/data.py b/src/cmdc_tools/datasets/official/UT/data.py new file mode 100644 index 0000000..0c9266c --- /dev/null +++ b/src/cmdc_tools/datasets/official/UT/data.py @@ -0,0 +1,33 @@ +import pandas as pd + +from ...base import DatasetBaseNoDate +from ..base import ArcGIS + + +class Utah(ArcGIS, DatasetBaseNoDate): + ARCGIS_ID = "KaHXE9OkiB9e63uE" + + def get(self): + df = self.get_all_sheet_to_df( + "Utah_COVID19_Cases_by_Local_Health_Department_over_time", sheet=0, srvid=6 + ) + + renamed = df.rename( + columns={ + "COVID_Cases_Total": "cases_total", + "Day": "dt", + "Hospitalizations": "cumulative_hospitalized", + "DISTNAME": "region", + } + ) + + renamed["dt"] = renamed["dt"].map(lambda x: pd.datetime.fromtimestamp(x / 1000)) + + # Keep only region rows + # counties = renamed.loc[renamed.region.str.contains("County")] + + return ( + renamed[["dt", "region", "cases_total", "cumulative_hospitalized"]] + .sort_values(["dt", "region"]) + .melt(id_vars=["dt", "region"], var_name="variable_name") + ) From cc084e3148fc66335860fcc93aed72ba122f5869 Mon Sep 17 00:00:00 2001 From: Trevor Lyon Date: Thu, 30 Jul 2020 16:08:14 -0600 Subject: [PATCH 2/4] FEAT: UT scraper --- src/cmdc_tools/datasets/__init__.py | 2 + .../datasets/official/UT/__init__.py | 2 +- src/cmdc_tools/datasets/official/UT/data.py | 124 ++++++++++++++++-- src/cmdc_tools/datasets/official/__init__.py | 1 + 4 files changed, 118 insertions(+), 11 deletions(-) diff --git a/src/cmdc_tools/datasets/__init__.py b/src/cmdc_tools/datasets/__init__.py index e246e8b..b5d9463 100644 --- a/src/cmdc_tools/datasets/__init__.py +++ b/src/cmdc_tools/datasets/__init__.py @@ -49,6 +49,8 @@ SanDiego, Tennessee, TennesseeCounties, + Utah, + UtahFips, Vermont, Wisconsin, WIDane, diff --git a/src/cmdc_tools/datasets/official/UT/__init__.py b/src/cmdc_tools/datasets/official/UT/__init__.py index 14bcd55..e17f42b 100644 --- a/src/cmdc_tools/datasets/official/UT/__init__.py +++ b/src/cmdc_tools/datasets/official/UT/__init__.py @@ -1 +1 @@ -from .data import Utah +from .data import Utah, UtahFips diff --git a/src/cmdc_tools/datasets/official/UT/data.py b/src/cmdc_tools/datasets/official/UT/data.py index 0c9266c..51b2031 100644 --- a/src/cmdc_tools/datasets/official/UT/data.py +++ b/src/cmdc_tools/datasets/official/UT/data.py @@ -1,33 +1,137 @@ +import asyncio +import json +from functools import reduce +from pprint import pprint + import pandas as pd +import us from ...base import DatasetBaseNoDate +from ...puppet import with_page from ..base import ArcGIS class Utah(ArcGIS, DatasetBaseNoDate): ARCGIS_ID = "KaHXE9OkiB9e63uE" + has_fips = False + state_fips = int(us.states.lookup("Utah").fips) + source = "https://coronavirus-dashboard.utah.gov/#overview" def get(self): + return self._get_overview() + + def _get_overview(self): df = self.get_all_sheet_to_df( - "Utah_COVID19_Cases_by_Local_Health_Department_over_time", sheet=0, srvid=6 + "Utah_COVID19_Case_Counts_by_LHD_by_Day_View", sheet=0, srvid=6 ) - renamed = df.rename( columns={ + "DISTNAME": "district", "COVID_Cases_Total": "cases_total", "Day": "dt", "Hospitalizations": "cumulative_hospitalized", - "DISTNAME": "region", } ) - renamed["dt"] = renamed["dt"].map(lambda x: pd.datetime.fromtimestamp(x / 1000)) - # Keep only region rows - # counties = renamed.loc[renamed.region.str.contains("County")] - return ( - renamed[["dt", "region", "cases_total", "cumulative_hospitalized"]] - .sort_values(["dt", "region"]) - .melt(id_vars=["dt", "region"], var_name="variable_name") + renamed[["dt", "district", "cases_total", "cumulative_hospitalized"]] + .melt(id_vars=["dt", "district"], var_name="variable_name") + .assign(vintage=pd.Timestamp.utcnow()) + .sort_values(["dt", "district"]) ) + + +class UtahFips(DatasetBaseNoDate): + has_fips = True + state_fips = int(us.states.lookup("Utah").fips) + source = "https://coronavirus-dashboard.utah.gov/#overview" + + def get(self): + hosp = self._get_hosp_sync() + tests = self._get_tests_sync() + + return pd.concat([hosp, tests], sort=False) + + async def _get_hosp(self): + url = "https://coronavirus-dashboard.utah.gov/#hospitalizations-mortality" + async with with_page() as page: + await page.goto(url) + await page.waitForXPath("//div[@class='plot-container plotly']") + plots = await page.Jx( + "//div[@id='daily-hospital-survey-previous-8-weeks']//div[@class='plot-container plotly']/.." + ) + text = await page.evaluate("(elem) => [elem.data, elem.layout]", plots[0]) + data = text[0] + layout = text[1] + reduced = self._extract_plotly_data(data, layout) + + renamed = reduced.rename( + columns={ + "ICU": "icu_beds_in_use_any", + "Non-ICU": "hospital_beds_in_use_any", + } + ) + + return ( + renamed[["dt", "icu_beds_in_use_any", "hospital_beds_in_use_any"]] + .melt(id_vars=["dt"], var_name="variable_name") + .assign(vintage=pd.Timestamp.utcnow(), fips=self.state_fips) + ) + + async def _get_tests(self): + url = "https://coronavirus-dashboard.utah.gov/#overview" + + async with with_page() as page: + await page.goto(url) + await page.waitForXPath("//div[@class='plot-container plotly']") + plots = await page.Jx( + "//div[@id='total-tests-by-date']//div[@class='plot-container plotly']/.." + ) + text = await page.evaluate("(elem) => [elem.data, elem.layout]", plots[0]) + data = text[0] + layout = text[1] + # return text + + df = self._extract_plotly_data(data, layout) + + renamed = df.fillna(0) + + renamed["positive_tests_total"] = ( + renamed["Positive PCR"] + renamed["Positive Antigen"] + ).astype(int) + renamed["negative_tests_total"] = ( + renamed["Negative PCR"] + renamed["Negative Antigen"] + ).astype(int) + sorts = renamed.set_index("dt").sort_index().cumsum().reset_index() + return ( + sorts[["dt", "negative_tests_total", "positive_tests_total"]] + .melt(id_vars=["dt"], var_name="variable_name") + .assign(vintage=pd.Timestamp.utcnow(), fips=self.state_fips) + ) + + def _get_hosp_sync(self): + return asyncio.run(self._get_hosp()) + + def _get_tests_sync(self): + return asyncio.run(self._get_tests()) + + def _extract_plotly_data(self, data, layout): + dfs = [] + for trace in data: + trace_name = trace.get("name", "") + if trace_name == "": + continue + x = trace["x"] + y = trace["y"] + df = pd.DataFrame(data={"x": x, f"{trace_name}": y}) + df["dt"] = df.x.map( + lambda x: ( + pd.Timestamp(layout["xaxis"]["ticktext"][0] + " 2020") + + pd.Timedelta(days=(x - layout["xaxis"]["tickvals"][0])) + ) + ) + dfs.append(df.set_index("dt")) + return reduce( + lambda left, right: pd.merge(left, right, on=["dt"], how="outer"), dfs, + ).reset_index() diff --git a/src/cmdc_tools/datasets/official/__init__.py b/src/cmdc_tools/datasets/official/__init__.py index 25b6257..fc36c94 100644 --- a/src/cmdc_tools/datasets/official/__init__.py +++ b/src/cmdc_tools/datasets/official/__init__.py @@ -26,5 +26,6 @@ from .PA import Pennsylvania from .RI import RhodeIsland from .TN import Tennessee, TennesseeCounties +from .UT import Utah, UtahFips from .VT import Vermont from .WI import Wisconsin, WIDane From 17c5a0250d6931b3445f4ad2b871174231035e22 Mon Sep 17 00:00:00 2001 From: Trevor Lyon Date: Tue, 4 Aug 2020 12:58:26 -0600 Subject: [PATCH 3/4] deaths. agg district data to state data --- src/cmdc_tools/datasets/official/UT/data.py | 52 ++++++++++++++++++--- 1 file changed, 45 insertions(+), 7 deletions(-) diff --git a/src/cmdc_tools/datasets/official/UT/data.py b/src/cmdc_tools/datasets/official/UT/data.py index 51b2031..da04101 100644 --- a/src/cmdc_tools/datasets/official/UT/data.py +++ b/src/cmdc_tools/datasets/official/UT/data.py @@ -13,7 +13,7 @@ class Utah(ArcGIS, DatasetBaseNoDate): ARCGIS_ID = "KaHXE9OkiB9e63uE" - has_fips = False + has_fips = True state_fips = int(us.states.lookup("Utah").fips) source = "https://coronavirus-dashboard.utah.gov/#overview" @@ -32,13 +32,17 @@ def _get_overview(self): "Hospitalizations": "cumulative_hospitalized", } ) - renamed["dt"] = renamed["dt"].map(lambda x: pd.datetime.fromtimestamp(x / 1000)) - + renamed["dt"] = ( + renamed["dt"].map(lambda x: pd.datetime.fromtimestamp(x / 1000)).dt.date + ) return ( renamed[["dt", "district", "cases_total", "cumulative_hospitalized"]] - .melt(id_vars=["dt", "district"], var_name="variable_name") - .assign(vintage=pd.Timestamp.utcnow()) - .sort_values(["dt", "district"]) + .groupby(["dt"]) + .agg("sum") + .reset_index()[["dt", "cases_total", "cumulative_hospitalized"]] + .melt(id_vars=["dt"], var_name="variable_name") + .assign(vintage=pd.Timestamp.utcnow(), fips=self.state_fips) + .sort_values(["dt", "variable_name"]) ) @@ -50,8 +54,9 @@ class UtahFips(DatasetBaseNoDate): def get(self): hosp = self._get_hosp_sync() tests = self._get_tests_sync() + deaths = self._get_deaths_sync() - return pd.concat([hosp, tests], sort=False) + return pd.concat([hosp, tests, deaths], sort=False) async def _get_hosp(self): url = "https://coronavirus-dashboard.utah.gov/#hospitalizations-mortality" @@ -110,12 +115,45 @@ async def _get_tests(self): .assign(vintage=pd.Timestamp.utcnow(), fips=self.state_fips) ) + async def _get_deaths(self): + url = "https://coronavirus-dashboard.utah.gov/#hospitalizations-mortality" + async with with_page() as page: + await page.goto(url) + await page.waitForXPath("//div[@class='plot-container plotly']") + plots = await page.Jx( + "//div[@id='covid-19-deaths-by-date-of-death-data-will-backfill-n314']//div[@class='plot-container plotly']/.." + ) + text = await page.evaluate("(elem) => [elem.data, elem.layout]", plots[0]) + data = text[0] + layout = text[1] + # return text + + df = self._extract_plotly_data(data, layout) + + renamed = df.fillna(0) + + renamed = renamed.rename(columns={"Deaths": "deaths_total"}) + agged = ( + renamed[["dt", "deaths_total"]] + .set_index("dt") + .sort_index() + .cumsum() + .reset_index() + ) + + return agged.melt(id_vars=["dt"], var_name="variable_name").assign( + vintage=pd.Timestamp.utcnow(), fips=self.state_fips + ) + def _get_hosp_sync(self): return asyncio.run(self._get_hosp()) def _get_tests_sync(self): return asyncio.run(self._get_tests()) + def _get_deaths_sync(self): + return asyncio.run(self._get_deaths()) + def _extract_plotly_data(self, data, layout): dfs = [] for trace in data: From b1f647b1c0c40cde0f21691e206345bfc3ab8b3c Mon Sep 17 00:00:00 2001 From: Trevor Lyon Date: Tue, 4 Aug 2020 14:16:45 -0600 Subject: [PATCH 4/4] fix id selection --- src/cmdc_tools/datasets/official/UT/data.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/cmdc_tools/datasets/official/UT/data.py b/src/cmdc_tools/datasets/official/UT/data.py index da04101..88e84f6 100644 --- a/src/cmdc_tools/datasets/official/UT/data.py +++ b/src/cmdc_tools/datasets/official/UT/data.py @@ -121,7 +121,7 @@ async def _get_deaths(self): await page.goto(url) await page.waitForXPath("//div[@class='plot-container plotly']") plots = await page.Jx( - "//div[@id='covid-19-deaths-by-date-of-death-data-will-backfill-n314']//div[@class='plot-container plotly']/.." + "//div[contains(@id, 'covid-19-deaths-by-date-of')]//div[@class='plot-container plotly']/.." ) text = await page.evaluate("(elem) => [elem.data, elem.layout]", plots[0]) data = text[0]