-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathpreprocessing.py
127 lines (111 loc) · 5.11 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
import pandas as pd
import unicodedata
from xml.dom import minidom
import string
import re
from datetime import datetime, timedelta
import hashlib
import os
# Module-level accumulator: read_xml() appends one dict per accepted programme,
# and the script at the bottom of this file turns the list into a DataFrame.
data_list = []
def nomarlizeText(element):
    """Reduce a text to a lowercase ASCII key without punctuation or whitespace.

    Steps, in order: lowercase, NFKD-decompose and drop every non-ASCII code
    point, delete all ``string.punctuation`` characters, delete all whitespace.

    Args:
        element: The text to normalize (a ``str``).

    Returns:
        str: The normalized key; empty if nothing survives the filtering.
    """
    lowered = element.lower()
    # Fold to ASCII *before* stripping punctuation: compatibility characters
    # such as U+2026 (horizontal ellipsis) decompose into ASCII punctuation
    # ("...") and would otherwise slip through into the result.
    ascii_text = (
        unicodedata.normalize('NFKD', lowered)
        .encode('ascii', 'ignore')
        .decode('ascii')
    )
    # Punctuation removal: !"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
    without_punctuation = ascii_text.translate(
        str.maketrans('', '', string.punctuation)
    )
    # Remove every whitespace character (leading, trailing and internal,
    # including single tabs and newlines), not just literal spaces.
    return ''.join(without_punctuation.split())
# ------------------------------------------------------------------
# A timestamp is valid only if its date matches the date on which the XML file was collected.
def check_timestamp_valid(filename_str, date_valid):
    """Tell whether the date prefix of a file name equals *date_valid*.

    The date prefix is the text before the first underscore with every hyphen
    removed, e.g. ``"2022-01-01_100000.xml"`` yields ``"20220101"``.

    Args:
        filename_str: File name whose first ``_``-separated field is the date.
        date_valid: Date string to compare against, written without hyphens.

    Returns:
        bool: True when the two strings are equal, False otherwise.
    """
    date_prefix = filename_str.split('_')[0].replace('-', '')
    return date_valid == date_prefix
# ------------------------------------------------------------------
def convert_xlm_file_to_date(filename_str,
                             start_offset=timedelta(hours=6, minutes=10),
                             end_offset=timedelta(days=1, hours=1, minutes=50)):
    """Derive a (start, end) datetime window from the date in a file name.

    The date is the text before the first underscore with hyphens removed,
    parsed as ``YYYYMMDD``. Both bounds are that date at midnight plus an
    offset; with the defaults the window runs from 06:10 on the file's date
    to 01:50 on the following day.

    Args:
        filename_str: File name such as ``"2022-01-01_100000.xml"``.
        start_offset: ``timedelta`` added to midnight to get the window start.
        end_offset: ``timedelta`` added to midnight to get the window end.

    Returns:
        tuple[datetime, datetime]: ``(window_start, window_end)``, both naive.

    Raises:
        ValueError: If the date prefix is not a valid ``YYYYMMDD`` date.
    """
    date_prefix = filename_str.split('_')[0].replace('-', '')
    midnight = datetime.strptime(date_prefix, '%Y%m%d')
    return midnight + start_offset, midnight + end_offset
# ------------------------------------------------------------------
def _first_child_text(node, tag_name):
    """Return the nodeValue of the first child of the first *tag_name* element
    below *node*, or None when the element is missing or has no children."""
    matches = node.getElementsByTagName(tag_name)
    if not matches or matches[0].firstChild is None:
        return None
    return matches[0].firstChild.nodeValue


def _parse_programme_time(stamp):
    """Parse the leading ``YYYYMMDDHHMM`` digits of *stamp* into a naive
    datetime; anything after the minutes is ignored, so seconds are zero."""
    return datetime.strptime(stamp[:12], '%Y%m%d%H%M')


def read_xml(base_path, input_file, channel_c):
    """Collect the programmes of one channel from an XML file into ``data_list``.

    Every ``<programme>`` element of the file is examined. A programme is
    appended to the module-level ``data_list`` when all of these hold:

    * its channel code (``channel`` attribute up to the first ``.``,
      lowercased) equals ``channel_c``;
    * it has a non-empty ``<title>`` and ``<length>``;
    * it starts before it stops, and both times lie strictly inside the
      window returned by ``convert_xlm_file_to_date(input_file)``;
    * its length, read as an integer, is at least 5.

    Args:
        base_path: Directory that contains ``input_file``.
        input_file: XML file name; its date prefix defines the accepted window.
        channel_c: Lowercase channel code to keep, e.g. ``"c192"``.

    Returns:
        None. Results are appended to ``data_list`` as dicts with the keys
        ``Channel_Code``, ``Hashcode``, ``Title``, ``Start`` and ``Length``.

    Raises:
        KeyError: If a matching programme lacks ``start``/``stop``, or any
            programme lacks ``channel``.
        ValueError: If a time stamp or the length cannot be parsed.
    """
    # os.path.join instead of a hard-coded "\\" so the path also resolves on
    # platforms whose separator is not a backslash.
    file_path = os.path.join(base_path, input_file)
    xmldoc = minidom.parse(file_path)

    # The window depends only on the file name, so compute it once per file.
    window_start, window_end = convert_xlm_file_to_date(input_file)

    for programme in xmldoc.getElementsByTagName('programme'):
        attrs = programme.attributes
        channel_code = attrs['channel'].value.strip().split('.')[0].strip().lower()
        # Cheapest test first: skip other channels before parsing anything else.
        if channel_code != channel_c:
            continue

        title = _first_child_text(programme, 'title')
        length_p = _first_child_text(programme, 'length')
        # A programme without a usable title or length cannot be hashed or
        # filtered; skip it instead of aborting the whole file.
        if title is None or length_p is None:
            continue

        # Keep only the part before any "+" suffix of the attribute value.
        start_time = attrs['start'].value.strip().split('+')[0].strip()
        stop_time = attrs['stop'].value.strip().split('+')[0].strip()
        s_j = _parse_programme_time(start_time)
        e_j = _parse_programme_time(stop_time)
        duration_program = int(length_p)

        if not (window_start < s_j < e_j < window_end):
            continue
        if duration_program < 5:
            continue

        # Same channel + same normalized title => same hash.
        value_hash = str(channel_code) + str(nomarlizeText(title).strip())
        hash_id = hashlib.sha1(value_hash.encode()).hexdigest()
        data_list.append({
            "Channel_Code": channel_code,
            "Hashcode": hash_id,
            "Title": title,
            "Start": start_time,
            "Length": length_p,
        })
    return
# ------------------------------------------------------------------
# base_path = r"D:\Programing\XML_Analysis\data"
# input_file = "2022-01-01_100000.xml"
# channel_c = "c192"
# read_xml(base_path, input_file, channel_c)
# Root directory that is searched (recursively) for ".xml" files.
# NOTE(review): as a raw string this literal contains two backslashes between
# "data" and "01"; that looks unintended and ties the script to Windows-style
# paths. Confirm before changing it.
base_path = r"data\\01"
# Channel codes to extract; read_xml() is run once per channel and file.
channel_list = ["c192", "c4", "c80", "c111", "c234", "c481", "c226", "c2111" ]
for channel_name in channel_list:
    print("Processing: ", channel_name)
    for dir_path, _, file_names in os.walk(base_path):
        for file_name in file_names:
            if file_name.endswith(".xml"):
                # Pass the directory os.walk is currently visiting, not
                # base_path: files found in subdirectories would otherwise be
                # opened from the wrong location.
                read_xml(dir_path, file_name, channel_name)
# Write everything collected in data_list to a single CSV file.
df = pd.DataFrame(data_list)
df.to_csv("meta_data.csv", index = False, header=True, encoding="utf-8-sig")