-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathexplore.py
105 lines (93 loc) · 3.59 KB
/
explore.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
import pandas as pd
import time
from utils_logging import text_emphasis, elapsed_time, log_uploaded_file_stats
import glob
import os
import requests
# Metadata column that lists a dataset's component files. Components are
# separated by '|'; within a component the fields are separated by '$' and the
# URL is the third field (index 2), as the column name itself spells out.
_COMPONENT_FILES_COLUMN = 'COMPONENT_FILES-location$topic$url$data_level$private$doi$metadata_url'

# Substrings which, when present in the last URL segment of a file that was not
# found locally, make the script report it as a likely external link.
_EXTERNAL_LINK_MARKERS = ('.htm', '.jsp', '.ml', '.php', '.shtm', '.nc', '.aspx')


def _collect_component_urls(czo_data):
    """Return one ``(czo, url)`` tuple per component file listed in the metadata."""
    records = []
    for czo, components in zip(czo_data['CZOS'], czo_data[_COMPONENT_FILES_COLUMN]):
        records.extend((czo, component.split('$')[2]) for component in components.split('|'))
    return records


def _find_downloads(search_root, file_name):
    """Return every path below ``search_root`` (at any depth) named ``file_name``."""
    # Both parts are escaped so that glob metacharacters coming from the root
    # path or from a URL ('?', '*', '[') are matched literally.
    pattern = os.path.join(glob.escape(search_root), '**', glob.escape(file_name))
    # Without recursive=True, '**' behaves like '*' and matches one level only.
    return glob.glob(pattern, recursive=True)


def _is_likely_file(file_name):
    """Tell whether a missing URL segment looks like a file rather than a link."""
    if '.' not in file_name:
        return False
    return not any(marker in file_name for marker in _EXTERNAL_LINK_MARKERS)


def _write_listing(path, urls):
    """Write ``urls`` one per line to ``path``, followed by their count."""
    with open(path, 'w') as out:
        for url in urls:
            out.write(url + '\n')
        out.write(str(len(urls)))


def main(metadata_csv: str = './data/CZO-datasets-metadata-2019-10-29.csv',
         search_root: str = '/home/mobrien/czo2hs/tmp2',
         summary_csv: str = 'czodata.csv',
         output_dir: str = './outputs') -> None:
    """Cross-check the component-file URLs in the CZO metadata against local files.

    Every distinct component URL from ``metadata_csv`` is reduced to its last
    path segment, and that name is searched for below ``search_root``. The
    outcome per URL is written to ``summary_csv`` (columns ``czo``, ``conf``,
    ``sizes`` and ``type``, indexed by URL). URLs without a local match are
    listed in ``notfound.txt`` (name looks like a file) or ``extlinks.txt``
    (name looks like a link) inside ``output_dir``; each listing ends with the
    number of entries.

    Args:
        metadata_csv: CSV file providing the ``CZOS`` and component-files columns.
        search_root: Directory searched recursively for downloaded files.
        summary_csv: Destination of the per-URL summary table.
        output_dir: Directory that receives the two listings; created if missing.

    Raises:
        AssertionError: If a URL has no last path segment once trailing slashes
            are removed, or if a matched path's name differs from the name
            searched for.
    """
    czo_data = pd.read_csv(metadata_csv)
    df = pd.DataFrame.from_records(_collect_component_urls(czo_data), columns=['czo', 'files'])
    # One row per distinct URL; 'czo' keeps the first observatory that lists it.
    dg = df.groupby('files').first()

    conf_name, sizes, ftype = [], [], []
    nf1, nf2 = [], []  # missing "likely files" / missing "likely external links"
    for url in dg.index:
        url = url.rstrip('/')
        file_candidate = url.split('/')[-1]
        assert file_candidate, url

        # Defaults describe "nothing usable found" and are overwritten on success.
        name, size_kb, extension = '', 0, ''
        found = _find_downloads(search_root, file_candidate)
        # Judge by the entry's own name: a dot in a parent directory must not
        # make an extension-less entry count as a file.
        chaff = [path for path in found if '.' not in os.path.basename(path)]
        ffiles = [path for path in found if '.' in os.path.basename(path)]

        if not found:
            if _is_likely_file(file_candidate):
                nf1.append(url)
                print('{} - Not found {}'.format(len(nf1), url))
            else:
                nf2.append(url)
                print('{} - Likely external link {}'.format(len(nf2), url))
        elif not ffiles:
            print('Found only chaff for {} at {} - qty {}'.format(file_candidate, chaff[:5], len(chaff)))
        else:
            matched_name = os.path.basename(ffiles[0])
            assert file_candidate == matched_name, matched_name
            name = file_candidate
            # Whole kilobytes (1000 bytes); anything under 1000 bytes reports 0.
            size_kb = os.stat(ffiles[0]).st_size // 1000
            extension = file_candidate.split('.')[-1].lower()
            if size_kb == 0:
                print('Zero size {}'.format(file_candidate))

        conf_name.append(name)
        sizes.append(size_kb)
        ftype.append(extension)

    dg['conf'] = conf_name
    dg['sizes'] = sizes
    dg['type'] = ftype
    dg.to_csv(summary_csv)

    os.makedirs(output_dir, exist_ok=True)
    _write_listing(os.path.join(output_dir, 'notfound.txt'), nf1)
    _write_listing(os.path.join(output_dir, 'extlinks.txt'), nf2)
if __name__ == "__main__":
    # Disabled follow-up check, kept for reference: request every URL from the
    # not-found listing and print those that do not answer with HTTP 200.
    # NOTE(review): main() writes its listing to ./outputs/notfound.txt and ends
    # it with a count line, so the path and the last line need handling before
    # this is re-enabled.
    # with open('./notfound.txt', 'r') as f:
    #     e = f.read().splitlines()
    #
    # for item in e:
    #     res = requests.get(item)
    #     if not res.status_code == 200:
    #         print(item, res)
    start = time.time()
    try:
        main()
    except KeyboardInterrupt:
        # Ctrl-C ends the run without a traceback.
        print("\nExit ok")
    finally:
        # Reported on every exit path, including interruption and errors.
        print("Total Migration {}".format(elapsed_time(start, time.time())))