-
Notifications
You must be signed in to change notification settings - Fork 9
/
controller.py
119 lines (106 loc) · 3.16 KB
/
controller.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
"""
This script pulls the Apple and Google data asynchronously and returns two filtered dataframes
"""
import time
# Wall-clock timestamp captured before the remaining imports run.
# NOTE(review): nothing in the visible part of this file reads `x`;
# presumably it is used elsewhere to time the script - confirm.
x = time.time()
import sys
import asyncio
import datetime as dt
import pandas as pd
from pyspark.sql import SparkSession
import databricks.koalas as ks
from databricks.koalas import option_context
# Module-level result slots. googles() declares `final` global and rebinds it;
# apples() does the same for `data`. Both start out as None.
final = None
data = None
async def googles():
    """Load the Google Play CSV and publish the filtered rows in the global ``final``.

    Keeps the rows whose ``Category`` is one of the game genres listed below,
    plus the "Music & Audio" and "Health & Fitness" rows, and tags each group
    with a ``SuperCategory`` of "Games", "Music" or "Health".  The frame is
    then reduced to six columns, renamed to App_Name, Released, Size_Bytes,
    Average_User_Rating, Reviews and Category, and a Released_Year column is
    added.

    The body contains no ``await``, so once started it runs to completion
    without yielding to the event loop.

    Side effects:
        Rebinds the module-level ``final`` on success.  If the filtering step
        raises, the error is printed and ``final`` is left unchanged.
    """
    global final

    ## Loading google data and cleaning
    # NOTE(review): the option context is assumed to wrap only the CSV read;
    # confirm against the original indentation.
    with option_context(
        "compute.ops_on_diff_frames", True, "compute.default_index_type", "distributed"
    ):
        data = ks.read_csv("Google-Playstore.csv")

    gameList = [
        "Action",
        "Adventure",
        "Arcade",
        "Board",
        "Card",
        "Casino",
        "Casual",
        "Educational",
        "Music",
        "Puzzle",
        "Racing",
        "Role Playing",
        "Simulation",
        "Sports",
        "Strategy",
        "Trivia",
        "Word",
    ]

    # Build the result in a local so that a failure part-way through never
    # leaves a half-processed frame in the global.
    try:
        games = data[data["Category"].isin(gameList)]
        music = data[data["Category"] == "Music & Audio"]
        health = data[data["Category"] == "Health & Fitness"]
        games["SuperCategory"] = "Games"
        music["SuperCategory"] = "Music"
        health["SuperCategory"] = "Health"
        frame = ks.concat([games, music, health])
        frame = frame[
            ["App Name", "Released", "Size", "Rating", "Rating Count", "SuperCategory"]
        ]
    except Exception as e:
        print("Failed to load google data: ", e)
        # Stop here: continuing would operate on a missing or partial frame
        # and raise a second, misleading error.
        return

    ## Reducing google data and casting types
    frame.columns = [
        "App_Name",
        "Released",
        "Size_Bytes",
        "Average_User_Rating",
        "Reviews",
        "Category",
    ]
    # Derive the year first: the next statement overwrites "Released" with a
    # "%Y-%m" string.
    frame["Released_Year"] = ks.to_datetime(
        frame["Released"], errors="coerce"
    ).dt.strftime("%Y")
    frame["Released"] = ks.to_datetime(frame["Released"], errors="coerce").dt.strftime(
        "%Y-%m"
    )
    # NOTE(review): "M" is replaced textually by six zeros, so "2.5M" becomes
    # "2.5000000" rather than 2500000, and other suffixes or non-numeric
    # values are not handled - confirm against the data.
    frame["Size_Bytes"] = frame["Size_Bytes"].str.replace("M", "000000").astype(int)

    final = frame
async def apples():
    """Load the Apple App Store CSV and publish the filtered rows in the global ``data``.

    Keeps the rows whose ``Primary_Genre`` is "Games", "Music" or "Health",
    copies that genre into a ``Category`` column, reduces the frame to
    App_Name, Released, Size_Bytes, Average_User_Rating, Reviews and Category,
    adds a Released_Year column, rewrites Released as "%Y-%m" and casts
    Size_Bytes to int.

    The body contains no ``await``, so once started it runs to completion
    without yielding to the event loop.

    Side effects:
        Rebinds the module-level ``data`` on success.  If any processing step
        raises, the error is printed and ``data`` is left unchanged.
    """
    global data

    ## Loading apple data and cleaning
    # NOTE(review): the option context is assumed to wrap only the CSV read;
    # confirm against the original indentation.
    with option_context(
        "compute.ops_on_diff_frames", True, "compute.default_index_type", "distributed"
    ):
        frame = ks.read_csv("appleAppData.csv")

    ## Reducing data and casting types
    # Work on a local so that a failure never leaves the raw or partly
    # processed frame in the global for later code to pick up.
    try:
        frame = frame[frame["Primary_Genre"].isin(["Games", "Music", "Health"])]
        frame["Category"] = frame["Primary_Genre"]
        frame = frame[
            [
                "App_Name",
                "Released",
                "Size_Bytes",
                "Average_User_Rating",
                "Reviews",
                "Category",
            ]
        ]
        # Derive the year first: the next statement overwrites "Released"
        # with a "%Y-%m" string.
        frame["Released_Year"] = ks.to_datetime(
            frame["Released"], errors="coerce"
        ).dt.strftime("%Y")
        frame["Released"] = ks.to_datetime(
            frame["Released"], errors="coerce"
        ).dt.strftime("%Y-%m")
        frame["Size_Bytes"] = frame["Size_Bytes"].astype(int)
    except Exception as e:
        print("Failed to load apple data:", e)
        return

    data = frame
# Drive both loaders to completion on the event loop. Neither coroutine
# awaits anything, so gather() runs them one after the other rather than
# overlapping them.
loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.gather(googles(), apples()))

# Combine the Google and Apple frames into `final`, then drop the `data` name.
try:
    final = ks.concat([final, data])
    del data
except Exception as e:
    # Catch Exception rather than everything, so Ctrl-C and SystemExit still
    # propagate, and report the cause instead of hiding it.
    print("Failed to concat data:", e)