This repository has been archived by the owner on Sep 12, 2024. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 51
/
analyzeDuplicates.py
295 lines (265 loc) · 8.87 KB
/
analyzeDuplicates.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
"""
Analyzes duplicate dump files and calculates the minimum file
set for invalidating
Usage: python analyzeDuplicates.py FILE
FILE: A text file with the output of lumis from the duplicateEvents.py
script
Should be in this format:
dataset : /DATASET_NAME
runs
lumi x is in these files
file 1
file 2
lumi y is in these files
file 3
file 4
....
"""
import sys
import dbs3Client as dbs
import random
from optparse import OptionParser
import reqMgrClient
def buildGraphs(lines):
"""
Builds a dictionary that contains an undirected graph for each
dataset
dataset -> graph.
Each graph contains the representation of files with duplicate
lumis
Vertices: Files
Edges: (f1,f2, #l) where f1 and f2 are two files that have
duplicated lumis, and #l is the ammount of lumis they have
in common
"""
graphs = {}
graph = None
i = 0
dataset = None
while i < len(lines):
#dataset name
if "dataset" in lines[i]:
dataset = lines[i].split(":")[-1].strip()
graph = {}
graphs[dataset] = graph
i += 1
continue
#ignore o
if( "Lumi" not in lines[i]
and not lines[i].startswith("/") ):
i += 1
continue
#text line 'Lumi ### is in the following files:'
lumi = lines[i].split()[1]
#text lines with file names
f1 = lines[i+1]
f2 = lines[i+2]
#create edge (f1, f2)
if f1 not in graph:
graph[f1] = {}
if f2 not in graph[f1]:
graph[f1][f2] = 0
graph[f1][f2] += 1
#create edge (f2, f1)
if f2 not in graph:
graph[f2] = {}
if f1 not in graph[f2]:
graph[f2][f1] = 0
graph[f2][f1] += 1
i += 3
return graphs
def buildGraph(lumis):
graph = {}
for lumi in lumis:
files = lumis[lumi]
#text lines with file names
f1 = files[0]
f2 = files[1]
#create edge (f1, f2)
if f1 not in graph:
graph[f1] = {}
if f2 not in graph[f1]:
graph[f1][f2] = 0
graph[f1][f2] += 1
#create edge (f2, f1)
if f2 not in graph:
graph[f2] = {}
if f1 not in graph[f2]:
graph[f2][f1] = 0
graph[f2][f1] += 1
return graph
def getSumDegree(graph, v):
"""
Returns the sum of all edges adjacent to the vertex.
"""
return sum( e for v2, e in graph[v].items())
def hasEdges(graph):
"""
True if at least one edge is between to vertices,
that is, there is at least one lumi present in two different
files
"""
for v in graph.values():
if v:
return True
return False
def deleteMaxDegreeFirst(graph, events):
"""
Removes duplication by deleting files in a greedy fashion.
That is, removing the files with the highest degree (duplicates)
first, and keep doing so until there is no edge on the graph (no lumi
in two different files)
"""
files = []
print "Initial files:", len(graph)
#quadratic first
while hasEdges(graph):
maxv = None
maxd = 0
#get vertex with highest degree
for v in graph:
d = getSumDegree(graph,v)
if d > maxd:
maxd = d
maxv = v
#remove maxv from all its adjacent vertices
for v in graph[maxv]:
del graph[v][maxv]
#remove maxv entry
del graph[maxv]
files.append(maxv)
#print "End Files:",len(graph), "Invalidated:",len(graph)
return files
def deleteSmallestVertexFirst(graph, events):
"""
Removes duplication by deleting files in a greedy fashion.
That is, removing the files smallest files
first, and keep doing so until there is no edge on the graph (no lumi
in two different files)
"""
files = []
print "Initial files:", len(graph)
#sort by number of events
ls = sorted(graph.keys(), key=lambda x: events[x])
#quadratic first
while hasEdges(graph):
#get smallest vertex
minv = ls.pop()
#remove minv from all its adjacent vertices
for v in graph[minv]:
del graph[v][minv]
#remove maxv entry
del graph[minv]
files.append(minv)
#print "End Files:",len(graph), "Invalidated:",len(graph)
return files
def colorBipartiteGraph(graph, events):
"""
Removes duplication by identifying a bipartite graph and removing
the smaller side
"""
red = set()
green = set()
for f1, f2d in graph.items():
f1red = f1 in red
f1green = f1 in green
for f2 in f2d.keys():
f2red = f2 in red
f2green = f2 in green
#both have no color
if not(f1red or f1green or f2red or f1green):
red.add(f1)
green.add(f2)
#some has two colors:
elif (f1red and f1green) or (f2red and f2green):
print "NOT BIPARTITE GRAPH"
raise Exception("Not a bipartite graph, cannot use this algorithm for removing")
#have same color
elif (f1red and f2red) or (f1green and f2green):
print "NOT BIPARTITE GRAPH"
raise Exception("Not a bipartite graph, cannot use this algorithm for removing")
#both are colored but different
elif f1red != f2red and f1green != f2green:
continue
#color opposite
elif f1red:
green.add(f2)
elif f1green:
red.add(f2)
elif f2red:
green.add(f1)
elif f2green:
green.add(f1)
#validate against the # of events of the files
eventsRed = sum(events[f] for f in red)
eventsGreen = sum(events[f] for f in green)
if eventsRed < eventsGreen:
return list(red)
else:
return list(green)
def getFileEvents(dataset, files):
"""
Builds a dict files-> num events
"""
eventCount = {}
for f in files:
evs = dbs.getEventCountDataSetFileList(dataset, [f])
#evs = random.randrange(10000)
eventCount[f] = evs
return eventCount
url = 'cmsweb.cern.ch'
def main():
usage = "python %prog [OPTIONS]"
parser = OptionParser(usage)
parser.add_option("-a", "--doall",dest="doall", action="store_true" , default=False,
help="It will analyze all datasets of the workflow from the beginning. If this option is true,"\
" you should provide a workflow name or a list of them in the --file option.")
parser.add_option("-f", "--file",dest="file",
help="Input file with the contents of duplicateEvents.py (a list of lumis and files)."\
" If you are using the --doall option, it should contain a list of workflows instead")
options, args = parser.parse_args()
workflows = None
#if we not doing all, input should be treated as list of lumis an files
if not options.doall and options.file:
lines = [l.strip() for l in open(options.file)]
graphs = buildGraphs(lines)
# if do all and input file
elif options.doall and options.file:
workflows = [l.strip() for l in open(options.file)]
elif options.doall and not options.file:
workflows = args
else:
parser.error("You should provide an input file with the output of duplicateEvents")
# get the output datasets of the workflos and create the graph
if workflows:
datasets = []
for wf in workflows:
datasets += reqMgrClient.outputdatasetsWorkflow(url, wf);
graphs = {}
#analyze each dataset
for dataset in datasets:
dup, lumis = dbs.duplicateRunLumi(dataset, verbose="dict", skipInvalid=True)
#print lumis
graphs[dataset] = buildGraph(lumis)
for dataset, graph in graphs.items():
#look for datasetname
print "Getting events per file"
events = getFileEvents(dataset, graph.keys())
try:
#first algorithm that assumes bipartition
files = colorBipartiteGraph(graph, events)
except Exception as e:
#second, algorithm
#files = deleteMaxDegreeFirst(graph, events)
files = deleteSmallestVertexFirst(graph, events)
total = dbs.getEventCountDataSet(dataset)
invalid = dbs.getEventCountDataSetFileList(dataset, files)
print 'total events %s'%total
print 'invalidated files %s'%len(files)
print 'invalidated events %s'%invalid
if total:
print '%s%%'%(float(total-invalid)/total*100.0)
for f in sorted(files):
print f
if __name__ == '__main__':
main()