-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdashboard_demo.py
442 lines (382 loc) · 21.8 KB
/
dashboard_demo.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
# Builds the dashboard using Streamlit
import streamlit as st
import pandas as pd
import numpy as np
from wordcloud import WordCloud
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import matplotlib.pyplot as plt
import plotly.express as px
import statistics
def display_title():
    """Render the dashboard headline and the subheader beneath it."""
    headline = 'Events in Stuttgart 📅 🎉'
    tagline = 'A dashboard to explore great places in Stuttgart on Wednesdays!'
    st.title(headline)
    st.subheader(tagline)
# The user should have multiple options to choose from:
# - The season (spring, summer, autumn, winter)
# - The type of event (concert, party, singer in a bar, ...)
# - The location (the larger districts of Stuttgart, offered for selection by ZIP code)
def get_user_preferences(supercategories: list):
    """Render the sidebar filter widgets and return what the user selected.

    :param supercategories: supercategories found in the dataset; currently unused
        because the event-type widget offers a fixed list of options
    :return: tuple ``(event_type, location_sidebar, season, mood)`` holding the
        values returned by the four sidebar multiselect widgets
    """
    # Each district is listed exactly once; a repeated entry would show up twice
    # in the widget and in its default selection.
    districts = [
        "Europaviertel", "Relenberg", "Karlshöhe", "Am Rosensteinpark", "Kräherwald",
        "Botnang-West", "Vogelsang", "Südheim", "Bad Cannstatt", "Sternhäule",
        "Pfaffenwald", "Freiberg", "Rosenberg", "Uhlandshöhe", "Im Geiger",
        "Zuffenhausen-Elbelen", "Stöckach", "Weinsteige", "Heusteigviertel",
        "Neckarvorstadt", "Mönchfeld", "Waldau", "Möhringen-Süd", "Feuerbach-Ost",
        "Other",
    ]
    seasons = ['spring', 'summer', 'autumn', 'winter']
    moods = ['Energetisch', 'Gesellig', 'Körperbewusst', 'Künstlerisch', 'Unterhaltsam']
    # To offer every supercategory of the dataset instead, use:
    # event_type_options = [supercategory for supercategory in supercategories]
    event_type_options = [
        "restaurant", "kultur", "party", "performance & event venue", "public figure",
        "dance & night club", "bar", "anderes", "church", "university",
        "performance art theatre", "local business", "club", "arts", "library",
        "museum", "education", "business services", "government organization",
        "arts & entertainment", "non-profit organization", "community organization",
        "bookstore", "non-governmental organization (ngo)", "company", "sports",
    ]

    st.sidebar.title("Dashboard Controls")
    # Location, season and mood start with every option selected.
    location_sidebar = st.sidebar.multiselect(
        'Which part of Stuttgart do you prefer?',
        districts,
        list(districts),
    )
    season = st.sidebar.multiselect(
        'In what season are you looking for an event?',
        seasons,
        list(seasons),
    )
    mood = st.sidebar.multiselect(
        'What should be the flair of the event?',
        moods,
        list(moods),
    )
    # The event type starts empty, so the user has to pick at least one type.
    event_type = st.sidebar.multiselect(
        'Which type of event do you prefer?',
        options=event_type_options,
        default=[],
    )
    return event_type, location_sidebar, season, mood
def display_subcategories(event_types: list, df: pd.DataFrame, default_value='No Subcategory'):
    """Offer the subcategories of the selected event types in a sidebar multiselect.

    :param event_types: supercategories the user selected
    :param df: dataframe with the columns ``supercategory`` and ``subcategory``
    :param default_value: single option offered when no subcategory was found
    :return: the value returned by the genre multiselect widget
    """
    # A dict is used as an insertion-ordered set of subcategories.
    collected = {}
    for selected_type in event_types:
        of_this_type = df.loc[df['supercategory'] == selected_type, 'subcategory']
        # Missing values are not valid options.
        for name in of_this_type.dropna().unique():
            collected.setdefault(name, None)

    # NOTE(review): the fallback is applied once after the loop; the source this was
    # restored from had lost its indentation — confirm it was not meant per event type.
    subcategories = list(collected)
    if not subcategories:
        subcategories = [default_value]
    print(subcategories)

    event_subtype = st.sidebar.multiselect(
        'Which genre of event do you prefer?',
        subcategories,
        []
    )
    return event_subtype
def create_link_to_GoogleMaps(row):
    """Build an HTML anchor that opens a Google Maps search for a location.

    :param row: mapping-like object (dict or dataframe row) with the keys
        ``Location`` and ``Address``
    :return: string with an ``<a>`` element that opens the search in a new tab
    """
    from html import escape
    from urllib.parse import quote_plus

    # Leave out missing values so the literal "nan" never ends up in the search.
    parts = [str(row[key]) for key in ('Location', 'Address') if pd.notna(row[key])]
    # Percent-encode the query: characters such as "&", "#" or quotes in a venue
    # name would otherwise cut the query short or break the href attribute.
    query = quote_plus(', '.join([*parts, 'Stuttgart']))
    google_maps_address = f"https://www.google.com/maps/search/?api=1&query={query}"
    # The table is rendered without escaping, so the visible label is escaped here.
    label = escape(str(row['Location']))
    return f'<a href="{google_maps_address}" target="_blank">Find {label} on Maps</a>'
def prepare_sub_df_for_output(df: pd.DataFrame, top5: bool, event_type: list, location_sidebar: list, season: list, event_subtype: list, mood: list):
    """Filter the events by the user preferences and summarise them per location.

    :param df: the dataframe with all the events
    :param top5: boolean to indicate if only the top 5 locations should be returned
    :param event_type: list of event types
    :param location_sidebar: list of district names that the user selected
    :param season: list of seasons that the user selected
    :param event_subtype: list of event subtypes that the user selected
    :param mood: list of moods that the user selected
    :return: dataframe with one row per location, ordered by number of matching
        events (most first), with the columns Location, Address, Type, Category,
        Flair, Popularity, Number of Events and a Google Maps link
    """
    output_columns = ['Location', 'Address', 'Type', 'Category', 'Flair', 'Popularity', 'Number of Events']

    subcategory = df['subcategory']
    # Events without a subcategory must pass the genre filter. pandas loads blank
    # CSV cells as NaN, so comparing against "" alone would never match them.
    without_subcategory = subcategory.isna() | (subcategory == "")
    selected = (
        df['season'].isin(season)
        & df['district'].isin(location_sidebar)
        & df['supercategory'].isin(event_type)
        & (subcategory.isin(event_subtype) | without_subcategory)
        & df['stimmung'].isin(mood)
    )
    # Only keep the relevant columns and give them display names.
    sub_df = df.loc[
        selected,
        ['name', 'description', 'location.name', 'location.location.address.street', 'supercategory', 'subcategory', 'stimmung'],
    ].copy()
    sub_df.columns = ['Event', 'Description', 'Location', 'Address', 'Type', 'Category', 'Flair']

    # Per location: the address of the first matching event plus every type,
    # category and flair seen there.
    per_location = {}
    for location, address, kind, category, flair in zip(
        sub_df['Location'], sub_df['Address'], sub_df['Type'], sub_df['Category'], sub_df['Flair']
    ):
        if pd.isna(location):
            # groupby below drops missing locations, so they are never displayed.
            continue
        entry = per_location.setdefault(
            location, {'address': address, 'types': [], 'categories': [], 'flairs': []}
        )
        entry['types'].append(kind)
        entry['categories'].append(category)
        entry['flairs'].append(flair)

    # Rank the locations by their number of events, most events first.
    events_per_location = sub_df.groupby(['Location']).count().sort_values(by=['Event'], ascending=False)
    ranked_locations = events_per_location.index.tolist()
    if top5:
        ranked_locations = ranked_locations[:5]

    # Build all rows first and create the dataframe once instead of growing it
    # with pd.concat inside the loop.
    rows = []
    for location in ranked_locations:
        entry = per_location[location]
        num_of_events = len(entry['types'])
        rows.append([
            location,
            entry['address'],
            # The most common value of each attribute is displayed.
            statistics.mode(entry['types']),
            statistics.mode(entry['categories']),
            statistics.mode(entry['flairs']),
            # The popularity is capped at five stars.
            min(num_of_events, 5) * '⭐',
            num_of_events,
        ])
    locations_df = pd.DataFrame(rows, columns=output_columns)
    locations_df['Google Maps Link 📍🗺️'] = [
        create_link_to_GoogleMaps(row) for _, row in locations_df.iterrows()
    ]
    return locations_df
def display_locations(df: pd.DataFrame, selected_tab: str, event_type, location_sidebar, season, event_subtype, mood):
    """Render the table of matching locations for the selected view.

    Does nothing when ``selected_tab`` is neither "Top 5 Locations" nor
    "All Locations".
    """
    # selected view -> (subheader text, restrict to the top 5 locations?)
    views = {
        "Top 5 Locations": ('Top 5 Locations for your preferences🚀', True),
        "All Locations": ('All locations that correspond to your preferences', False),
    }
    if selected_tab not in views:
        return
    heading, only_top5 = views[selected_tab]

    st.subheader(heading)
    output_df = prepare_sub_df_for_output(
        df,
        top5=only_top5,
        event_type=event_type,
        location_sidebar=location_sidebar,
        season=season,
        event_subtype=event_subtype,
        mood=mood,
    )
    # The table is written as raw HTML so the Google Maps anchors stay clickable.
    table_html = output_df.to_html(escape=False, index=False, justify='center')
    st.write(table_html, unsafe_allow_html=True)
def show_no_of_events_used(df: pd.DataFrame):
    """Show the number of rows in ``df`` as a large counter, followed by a divider."""
    st.markdown(' ')
    event_total = len(df)
    # Counter in the middle, flanked by two rotated labels.
    counter_html = f"""
<div style="display: flex; justify-content: space-between; border-top: 3px solid #ddd; align-items: flex-end;">
<div style="text-align: left; font-size: 36px; color: #4CAF50; transform: rotate(90deg); margin-bottom: 70px;">
#Wednesdays
</div>
<div style="text-align: center; font-size: 120px; color: #ff6347; padding-top: 10px;">
{event_total}
<div style="font-size: 24px; color: #808080; margin-top: 10px;">Events Used for Training</div>
</div>
<div style="text-align: right; font-size: 36px; color: #4CAF50; transform: rotate(-90deg); margin-bottom: 70px;">
Stuttgart
</div>
</div>
"""
    st.markdown(counter_html, unsafe_allow_html=True)
    # Horizontal rule that visually separates this section from the next one.
    divider_html = '<hr style="border: 2px solid #ddd;">'
    st.markdown(divider_html, unsafe_allow_html=True)
def show_google_maps_stuttgart():
    """Embed a Google Maps iframe of Stuttgart below a short hint for visitors."""
    st.markdown(' ')
    hint = 'When you are from outside of Stuttgart, you can find the best way to get to Stuttgart here:'
    st.write(hint)
    # Embed code of the map, passed to Streamlit as raw HTML.
    embed_html = """
<iframe
width="600"
height="450"
frameborder="0"
style="border:0"
src="https://www.google.com/maps/embed?pb=!1m18!1m12!1m3!1d194810.4263677699!2d9.073201596210784!3d48.7758451766426!2m3!1f0!2f0!3f0!3m2!1i1024!2i768!4f13.1!3m3!1m2!1s0x4799c00fb81e49cf%3A0x66ba215d16b9970c!2sStuttgart!5e0!3m2!1sen!2sus!4v1633772401089!5m2!1sen!2sus"
allowfullscreen
></iframe>
"""
    st.markdown(embed_html, unsafe_allow_html=True)
def display_colnames(df: pd.DataFrame):
    """List all column names of ``df`` and the columns created by feature engineering."""
    st.markdown(' ')
    engineered_columns = ['dayofweek', 'year', 'month', 'season', 'district', 'supercategory', 'subcategory', 'starting_hour', 'time_of_day', 'stimmung']
    st.title('Columns used for creating dashboard:')

    # (opening frame markup, caption, names to list) for each of the two sections;
    # the second frame uses a different border colour.
    sections = (
        (
            '<div style="border: 2px solid #ddd; padding: 10px; border-radius: 10px;">',
            'All Columns:',
            df.columns,
        ),
        (
            '<div style="border: 2px solid #4CAF50; padding: 10px; border-radius: 10px; margin-top: 20px;">',
            'Feature Engineering Columns:',
            engineered_columns,
        ),
    )
    # NOTE(review): the opening and closing tags are emitted in separate markdown
    # calls — confirm the frame really encloses the elements written in between.
    for frame_open, caption, names in sections:
        st.markdown(frame_open, unsafe_allow_html=True)
        st.text(caption)
        st.write(', '.join(names))
        st.markdown('</div>', unsafe_allow_html=True)
def dislpay_frequent_words_from_description(df: pd.DataFrame):
    """Render a word cloud built from the ``description`` column of ``df``.

    German stopwords and a few leftover markup tokens are removed before the
    cloud is generated. If no usable word remains, an info message is shown
    instead of the cloud.

    :param df: dataframe with a ``description`` column
    """
    # Fetch the NLTK resources needed for the stopword list and the tokenizer.
    import nltk
    nltk.download('stopwords')
    nltk.download('punkt')
    st.markdown(' ')
    st.title('Wordcloud of event descriptions')

    # Concatenate all available descriptions into one text and tokenize it.
    text_data = ' '.join(df['description'].dropna().astype(str))
    tokens = word_tokenize(text_data)

    # Tokens are compared in lower case, so the upper-case "S" entry is redundant.
    custom_stopwords = {"br", "href", "s", "u", "S", "https", "de", "nofollow", "rel"}
    ignored_words = set(stopwords.words('german')).union(custom_stopwords)
    lowered_tokens = (token.lower() for token in tokens)
    filtered_text = ' '.join(word for word in lowered_tokens if word not in ignored_words)

    try:
        wordcloud = WordCloud(width=800, height=400, background_color='white').generate(filtered_text)
    except ValueError:
        # WordCloud raises ValueError when there is no word left to plot.
        st.info('There are no event descriptions to build a wordcloud from.')
        return

    fig, ax = plt.subplots(figsize=(10, 5))
    ax.imshow(wordcloud, interpolation='bilinear')
    ax.axis('off')
    st.pyplot(fig)
    # Streamlit reruns the script on every interaction; close the figure so
    # pyplot does not keep one more figure alive per rerun.
    plt.close(fig)
    st.markdown('NOTE: The above wordcloud was created using natural language processing (NLP) techniques. The wordcloud is based on the event descriptions of the events in the dataset.', help='You need help understanding the wordcloud? Ask the developers!')
# Bar chart of events per category
def generate_activity_type_chart(df: pd.DataFrame):
    """Plot a bar chart of the ten most frequent supercategories."""
    st.markdown(" ")
    st.title("Events by Category 💃🏼")
    # value_counts sorts by frequency, so the first ten entries are the top ten.
    top_categories = df['supercategory'].value_counts().head(10)
    axis_labels = {'x': 'Activity Type', 'y': 'Number of Events'}
    chart = px.bar(
        x=top_categories.index,
        y=top_categories.values,
        labels=axis_labels,
        title='Distribution of Activity Types',
    )
    st.plotly_chart(chart)
# Pie chart of events per category
def generate_activity_type_pie_chart(df: pd.DataFrame):
    """Plot a pie chart with the share of events per supercategory."""
    st.markdown(" ")
    st.title("Percentage of Events by Category")
    # Number of events per supercategory.
    category_counts = df['supercategory'].value_counts()
    chart = px.pie(
        category_counts,
        values=category_counts,
        names=category_counts.index,
        title='Distribution of Activity Types',
    )
    st.plotly_chart(chart)
# Bar chart of events per month
def generate_activity_time_chart(df: pd.DataFrame):
    """Plot a bar chart with the number of events per value of the ``month`` column."""
    st.markdown(" ")
    st.title("Events by Month 📅")
    # Number of events per month.
    month_counts = df['month'].value_counts()
    axis_labels = {'x': 'month', 'y': 'Number of Events'}
    chart = px.bar(
        month_counts,
        x=month_counts.index,
        y=month_counts.values,
        labels=axis_labels,
        title='Distribution of Activity Times',
    )
    st.plotly_chart(chart)
def generate_latitude_longitude_chart(df: pd.DataFrame):
    """Show the events that have complete coordinates on a map."""
    st.markdown(" ")
    st.title("Coordinate Plot of all Events 🌍")
    # source column -> column name handed to st.map
    renamed_columns = {
        "location.name": "Location",
        "location.location.coordinate.lat": "latitude",
        "location.location.coordinate.lon": "longitude",
    }
    # Rows with a missing name or coordinate are dropped before plotting.
    with_coordinates = df[list(renamed_columns)].dropna()
    coordinate_df = with_coordinates.rename(columns=renamed_columns)
    st.map(coordinate_df, color="#FAED27")
def visualize_time_of_day(df: pd.DataFrame):
    """Plot a pie chart with the share of events per value of ``time_of_day``."""
    st.markdown(" ")
    st.title("Events by Time of Day 🕒")
    daytime_counts = df['time_of_day'].value_counts()
    chart = px.pie(
        daytime_counts,
        values=daytime_counts,
        names=daytime_counts.index,
        title='Distribution of Time of Day',
    )
    st.plotly_chart(chart)
    # The tooltip documents how the starting hours were grouped.
    st.markdown(
        "The starting hours were grouped into the above 4 times of the day.",
        help="We used the following classification scheme: Morning: 6am - 12pm, Afternoon: 12pm - 6pm, Evening: 6pm - 12am, Night: 12am - 6am",
    )
def visualize_starting_hour_of_events(df: pd.DataFrame):
    """Plot a bar chart with the number of events per value of ``starting_hour``."""
    st.markdown(" ")
    st.title("Events by Starting Hour 🕰️")
    # Number of events per starting hour.
    hour_counts = df['starting_hour'].value_counts()
    axis_labels = {'x': 'Starting Hour', 'y': 'Number of Events'}
    chart = px.bar(
        x=hour_counts.index,
        y=hour_counts.values,
        title='Distribution of Starting Hour',
        labels=axis_labels,
    )
    st.plotly_chart(chart)
def visualize_subcategory_by_supercategory(df: pd.DataFrame):
    """Plot one bar chart of subcategory counts for each supercategory.

    The supercategories 'familie-kinder' and 'anderes' are skipped, as is any
    supercategory for which plotly raises a ValueError.
    """
    skipped = ('familie-kinder', 'anderes')
    st.markdown(" ")
    st.title("Events by Event Type and Subcategory 🧨🎈")
    supercategories = df['supercategory'].unique()
    # Opening tag of the frame around the plots.
    st.markdown('<div style="display: flex; flex-wrap: wrap;">', unsafe_allow_html=True)
    for supercategory in supercategories:
        if supercategory in skipped:
            continue
        # Subcategory frequencies within the current supercategory.
        in_category = df['supercategory'] == supercategory
        counts = df.loc[in_category, 'subcategory'].value_counts()
        try:
            chart = px.bar(
                x=counts.index,
                y=counts.values,
                title=f'Distribution of Subcategories for {supercategory}',
                labels={'x': 'Subcategory', 'y': 'Number of Events'},
                color_discrete_sequence=['green'],
            )
        except ValueError:
            continue
        # The plot title is replaced by the bare supercategory name.
        chart.update_layout(title_text=supercategory)
        st.plotly_chart(chart)
    # Closing tag of the frame around the plots.
    st.markdown('</div>', unsafe_allow_html=True)
def main():
    """Load the event sample, read the sidebar filters and render the selected view."""
    # NOTE(review): the nesting below (what sits inside each expander, and the footer
    # being rendered for every view) was restored from a copy of this file that had
    # lost its indentation — confirm it against the intended layout.
    df = pd.read_csv('data/2000_events_sample.csv')
    unique_supercategories = df["supercategory"].unique()
    print(df.shape)

    display_title()
    event_type, location_sidebar, season, mood = get_user_preferences(unique_supercategories)
    # Also offer the subcategories of every selected supercategory.
    event_subtype = display_subcategories(event_type, df)
    st.write('We will analyze your preferences and show you our recommendations for matching locations in Stuttgart.')

    view_names = ["Top 5 Locations", "All Locations", "Information about subset of dataset", 'Information about whole dataset']
    selected_tab = st.selectbox("Choose top location or all locations", view_names)

    if selected_tab == "Information about subset of dataset":
        dislpay_frequent_words_from_description(df)
        with st.expander("Click to see more information about the dataset"):
            generate_activity_type_chart(df)
            generate_activity_type_pie_chart(df)
            # We explicitly chose not to show percentages for each subcategory because
            # most of the supercategories only have one subcategory, so
            # visualize_subcategory_by_supercategory(df) is not called here.
            visualize_starting_hour_of_events(df)
            visualize_time_of_day(df)
            generate_activity_time_chart(df)
            show_no_of_events_used(df)
            # generate_latitude_longitude_chart(df) and show_google_maps_stuttgart()
            # are intentionally not called.
            display_colnames(df)
    elif selected_tab == "Information about whole dataset":
        # This view only shows pre-rendered images.
        st.markdown(" ")
        st.title("Information about whole dataset")
        st.markdown("The wordcloud of the event descriptions shows the most frequent words in the event descriptions of all events in the dataset.")
        st.image("img/word_cloud_all_events.png", use_column_width=True)
        # Images and explanatory texts in the order in which they are displayed.
        details = (
            ('image', 'img/distribution_of_activity_type_all_events.png'),
            ('text', 'The yellow specifies the rows per column which contain missing values.'),
            ('image', 'img/Missing_values_all_events.png'),
            ('text', 'After cleaning the dataset, e.g. removing columns with more than 80% missing values, the heatmap of missing values looks like this:'),
            ('image', 'img/missing_values_after_cleaning_all_events.png'),
            ('image', 'img/starting_hour_all_events.png'),
            ('image', 'img/time_of_day_all_events.png'),
        )
        with st.expander("Click to see more information about the whole dataset"):
            for kind, content in details:
                if kind == 'image':
                    st.image(content, use_column_width=True)
                else:
                    st.markdown(content)
    else:
        display_locations(df, selected_tab, event_type, location_sidebar, season, event_subtype, mood)
        show_no_of_events_used(df)
        # generate_latitude_longitude_chart(df) is intentionally not called.

    # Footer
    st.markdown(' ')
    st.markdown('<div style="text-align:center;">Copyright © 2024 Julius Döbelt and Haoran Huang. All rights reserved.</div>', unsafe_allow_html=True)


if __name__ == "__main__":
    main()