-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathcreate_sql_db_user_dirs.py
executable file
·190 lines (154 loc) · 5.85 KB
/
create_sql_db_user_dirs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
#!/usr/bin/env python
"""Create SQLite database about all CRAB directories in user's Tier2 area.
Holds info about directory path, size, creation time, creator.
"""
from __future__ import print_function, division
import os
import argparse
import datetime
import subprocess
import create_sql_db_xml as creator
# Add 10 to this process's niceness (i.e. lower its scheduling priority) at
# import time -- presumably so the filesystem scan below does not hog a shared
# machine; confirm with the maintainers before removing.
os.nice(10)
def get_user_dir_sizes(username):
    """Get Tier2 directories and their respective size

    Runs ``find`` under /pnfs/desy.de/cms/tier2/store/user/<username> for
    directories whose final path component consists of 5 or more digits or
    underscores (CRAB job directories), e.g.
    /pnfs/desy.de/cms/tier2/store/user/akaravdi/SingleElectron/crab_pickEvents/170626_204933
    ignoring any trailing 0000 etc, and runs ``du -sk`` on each match.
    Results are yielded as soon as the subprocess prints them.

    # TODO check for actual ntuples. How to tell apart from non-UHH2 dir?

    Parameters
    ----------
    username : str
        Username to look for dirs. Can use more complex string to only search
        in certain directories, e.g. aggleton/RunII

    Yields
    ------
    str, float
        Directory, Size (kB)

    Raises
    ------
    ValueError
        If a non-blank output line does not have the form "<size> <path>",
        or if <size> is not a number.
    """
    # NOTE(security): `username` is pasted into a shell command unquoted
    # (presumably so that shell glob patterns in it are expanded). Never pass
    # untrusted input here.
    # Only the last fragment is a raw string: it must contain a literal
    # backslash (``\;`` terminates find's -exec), which is an invalid escape
    # sequence in a normal string literal.
    cmd = ("nice -n 10 find /pnfs/desy.de/cms/tier2/store/user/" + username +
           " -type d -regextype posix-egrep -regex \".*/[0-9_]{5,}\" ! -path '*/log' "
           r"-exec du -sk {} \;")
    # this is python2 & 3-friendly, and ensures that each line is piped out immediately
    # this avoids a lower-level bug: https://www.turnkeylinux.org/blog/unix-buffering
    # https://stackoverflow.com/questions/2715847/read-streaming-input-from-subprocess-communicate/17698359#17698359
    p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, bufsize=1)
    with p.stdout:
        for raw_line in iter(p.stdout.readline, b''):
            text = raw_line.decode().strip()
            if not text:
                # tolerate blank lines instead of failing on the unpack below
                continue
            # du prints "<size><whitespace><path>"; split only once so that
            # paths containing whitespace stay intact
            size, dirname = text.split(None, 1)
            yield dirname, float(size)
    p.wait()  # wait for the subprocess to exit
def get_dir_user(ntuple_dir):
    """Extract the "username" component from a file path.

    The username is taken to be the path component directly following a
    ``user`` component or, failing that, a ``group`` component, e.g.:

    get_dir_user("/nfs/dust/cms/user/robin/UHH2/Ntuple_2016v2.root")
    >> robin
    get_dir_user("/pnfs/desy.de/cms/tier2//store/user/abenecke/RunII_102X_v1/PUPPIStudies/")
    >> abenecke
    get_dir_user("/pnfs/desy.de/cms/tier2//store/group/uhh/uhh2ntuples/RunII_102X_v2/")
    >> uhh

    Parameters
    ----------
    ntuple_dir : str
        Path to inspect

    Returns
    -------
    str
        Username, or None if not found
    """
    markers = ("user", "group")

    # Quick rejection on the path exactly as given (before normalisation)
    if not any(("/%s/" % marker) in ntuple_dir for marker in markers):
        return None

    # Normalising collapses repeated slashes and drops any trailing slash
    normalised = os.path.normpath(ntuple_dir)
    components = normalised.split("/")

    # "user" takes precedence over "group"; the first marker present decides
    for marker in markers:
        if ("/%s/" % marker) not in normalised:
            continue
        following = components.index(marker) + 1
        return components[following] if following < len(components) else None

    # Marker was only present as a trailing "/user/" or "/group/"
    return None
def get_creation_time(path):
    """Return the ``st_ctime`` of `path` as a local-time string.

    The result has the form "YYYY-MM-DD HH:MM:SS[.ffffff]" (date and time
    separated by a space), a layout SQLite can store as a date/time TEXT value.

    The path is examined with ``os.lstat``, so a symbolic link is reported on
    itself rather than on its target.

    NOTE(review): on Unix ``st_ctime`` is the time of the last metadata
    change, not a true creation time -- confirm this is acceptable.

    Parameters
    ----------
    path : str
        Filesystem path to inspect

    Returns
    -------
    str
        Timestamp string
    """
    ctime_seconds = os.lstat(path).st_ctime
    moment = datetime.datetime.fromtimestamp(ctime_seconds)
    return moment.isoformat(' ')
def get_user_dir_data(username):
    """Yield one record per directory reported by `get_user_dir_sizes`.

    Each directory path is also printed to stdout as it is processed.

    Every record holds:

    - dirname: directory name
    - size: directory size, as a float (kB, as reported by
      `get_user_dir_sizes`)
    - user: owner derived from the path via `get_dir_user`, or "" if none
      could be determined
    - creation_time: timestamp string from `get_creation_time`

    Parameters
    ----------
    username : str
        Username, passed straight to `get_user_dir_sizes`
        (can also add subdirectory patterns, e.g. "aggleton/RunII*")

    Yields
    ------
    dict
        Data in a dict
    """
    for path, size_kb in get_user_dir_sizes(username):
        # Fall back to an empty string when no owner can be derived
        owner = get_dir_user(path) or ""
        print(path)
        yield {
            "dirname": path,
            "size": float(size_kb),
            "user": owner,
            "creation_time": get_creation_time(path),
        }
def create_user_dir_table(username, output_filename, table_name='user_dir', append=True):
    """Main function to fill an SQL table with a user's directories

    Opens a connection to `output_filename` through `creator.SQLTable`,
    creates the table first when `append` is False, then fills it with the
    records produced by `get_user_dir_data`.

    Parameters
    ----------
    username : str
        Tier2 username to look for directories
    output_filename : str
        SQL output filename
    table_name : str, optional
        Name of table in SQL file
    append : bool, optional
        If True, skip table creation and add entries to whatever table
        `table_name` already exists.
        Otherwise, call ``SQLTable.create_table`` before adding entries
        (presumably this replaces any existing table -- see
        create_sql_db_xml to confirm)
    """
    table = creator.SQLTable(table_name)
    table.create_connection(path=output_filename)

    if not append:
        # Column definitions, one per key of the get_user_dir_data() records
        table.create_table(table_fields=[
            "dirname TEXT NOT NULL",
            "size FLOAT",
            "user TEXT",
            "creation_time TEXT",
        ])

    print("Filling user dir table...")
    # Records are handed over as a lazy generator
    table.fill_table(get_user_dir_data(username))
if __name__ == "__main__":
    # Command-line entry point: parse the options, then build/fill the table.
    arg_parser = argparse.ArgumentParser(
        description=__doc__,
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    arg_parser.add_argument("user", help="CERN username to test")
    arg_parser.add_argument("--output", default="xml_table.sqlite",
                            help="Output SQL filename")
    arg_parser.add_argument(
        "--append",
        action='store_true',
        help=("If True, append data to existing table in --output, if one exists. "
              "Otherwise, overwrites tables contents"),
    )
    cli_args = arg_parser.parse_args()

    # Appending only makes sense when the output file is already there;
    # otherwise fall back to creating the table.
    do_append = cli_args.append
    if do_append and not os.path.isfile(cli_args.output):
        print("Output does not exist, setting --append False")
        do_append = False

    create_user_dir_table(username=cli_args.user,
                          output_filename=cli_args.output,
                          append=do_append)