# main.py
#
# Builds a sample IoT data set (timestamp, temperature, humidity, occupancy),
# writes it to a Parquet dataset partitioned by date, reads it back, and times
# a single-day filter on both the original and the re-loaded DataFrame.
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
from datetime import datetime, timedelta
# Generate sample IoT data: one reading per minute, starting at start_time.
num_records = 10000
start_time = datetime(2023, 1, 1, 0, 0, 0)
data = {
    # 'min' is the minute frequency alias (the older 'T' spelling is
    # deprecated in recent pandas). Using `periods` produces exactly
    # num_records timestamps, so no trailing element has to be sliced off.
    'timestamp': pd.date_range(start=start_time, periods=num_records, freq='min'),
    # Deterministic saw-tooth patterns so the data set is reproducible.
    'temperature': [round(20 + i % 10 + 5 * (i % 100) / 100, 2) for i in range(num_records)],
    'humidity': [round(40 + i % 20 + 8 * (i % 50) / 100, 2) for i in range(num_records)],
    'occupancy': [i % 2 for i in range(num_records)],
}

# Sanity check: every column must have the same length before building the
# DataFrame (printed in the same order as before).
for column_name in ('humidity', 'temperature', 'occupancy', 'timestamp'):
    print(len(data[column_name]))

df = pd.DataFrame(data)
# Calendar day of each reading; used as the partition key below.
df['date'] = df['timestamp'].dt.date

table = pa.Table.from_pandas(df)
parquet_file_path = 'smart_building_data.parquet'
# NOTE(review): `use_legacy_dataset` is reported as deprecated by newer
# pyarrow releases, and repeated runs may add more files to an existing
# dataset directory - confirm against the installed pyarrow version.
pq.write_to_dataset(
    table,
    root_path=parquet_file_path,
    partition_cols=['date'],
    compression='snappy',
    flavor='spark',
    use_legacy_dataset=True,
)

read_table = pq.read_table(parquet_file_path)
read_df = read_table.to_pandas()
# The partition column is reconstructed from the directory names on read, so
# it is not guaranteed to come back as datetime.date objects. Normalise it so
# that equality comparisons against a datetime.date behave the same way as on
# the original DataFrame.
read_df['date'] = pd.to_datetime(read_df['date'].astype(str)).dt.date

print("Original IoT Data:")
print(df.head())
print("\nRead IoT Data from Parquet:")
print(read_df.head())
import timeit
def query_data_for_date(df: pd.DataFrame, query_date) -> pd.DataFrame:
    """Return the rows of *df* whose 'date' column equals *query_date*.

    Args:
        df: DataFrame that contains a 'date' column.
        query_date: Value compared for equality against each entry of the
            'date' column.

    Returns:
        A DataFrame holding only the matching rows (empty when nothing
        matches).

    Raises:
        KeyError: If *df* has no 'date' column.
    """
    matches = df['date'] == query_date
    return df[matches]
# Benchmark the same single-day filter on both in-memory DataFrames: the one
# built directly and the one loaded back from the partitioned Parquet dataset.
query_date = datetime(year=2023, month=1, day=1).date()
repetitions = 1000

original_timer = timeit.Timer(lambda: query_data_for_date(df, query_date))
parquet_timer = timeit.Timer(lambda: query_data_for_date(read_df, query_date))

# Each value is the total wall time for all repetitions, not a per-call mean.
original_query_time = original_timer.timeit(number=repetitions)
parquet_query_time = parquet_timer.timeit(number=repetitions)

# Display the query performance results
print(f"Querying data for {query_date}:\n")
print(f"Time taken on the original DataFrame: {original_query_time:.6f} seconds")
print(f"Time taken on the Parquet file with partitioning: {parquet_query_time:.6f} seconds")