-
Notifications
You must be signed in to change notification settings - Fork 2
/
Copy pathvalidatePins.py
440 lines (385 loc) · 15.2 KB
/
validatePins.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
#!/usr/bin/env python
"""
Copyright (C) 2020 David Vallee Delisle <[email protected]>
This program is free software: you can redistribute it and/or modify
it under the terms of the GNU General Public License as published by
the Free Software Foundation, either version 3 of the License, or
(at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
You should have received a copy of the GNU General Public License
along with this program. If not, see <http://www.gnu.org/licenses/>.
### Description:
Compares the numa topology from the nova database with the
live process list of the computes.
This was written based on Red Hat OpenStack Platform 10 (Newton)
### Usage:
stack@undercloud $ . stackrc
stack@undercloud $ ./validatePins.py > overcloud-pinset-validation.csv
"""
import subprocess
import json
import os
import sys
import re
import traceback
import xml.etree.ElementTree as ET
from collections import defaultdict
from keystoneauth1.identity import v3
from keystoneauth1 import session
from keystoneclient.v3 import client
from novaclient import client
from nova.virt.hardware import parse_cpu_spec
# Regular expressions used for parsing. They are raw strings so that regex
# escapes such as \s are not treated as (invalid) Python string escapes.
# controller_rex is how controllers are detected from the server name; you
# might have to change it to match your environment.
controller_rex = re.compile(r'.*(control|ocld|ctrl).*')
# Captures the value following "-uuid " in a qemu command line.
uuid_rex = re.compile(r'.*-uuid ([^\s]+) ')
# Captures the "instance-xxxx" name following "guest=" in a qemu command line.
instance_id_rex = re.compile(r'.*guest=(instance-[a-z0-9]+).*')
# Captures everything following "disk size: ".
disk_size_rex = re.compile(r'disk size: (.*)')
# We need to wipe out logger config because of nova: the logging module is
# shut down and reloaded before our own logger is configured.
import logging
logging.shutdown()
try:
    reload  # builtin on Python 2
except NameError:
    # reload() is not a builtin on Python 3, it lives in importlib
    from importlib import reload
reload(logging)
# Script-wide logger writing timestamped messages to a stream handler
log = logging.getLogger('validate_pin')
log.setLevel(logging.INFO)
ch = logging.StreamHandler()
ch.setFormatter(logging.Formatter('%(asctime)s - %(levelname)s - %(message)s'))
log.addHandler(ch)
# Refuse to run unless OS_CLOUDNAME is set to "undercloud", i.e. stackrc
# has been sourced in the calling shell.
if os.environ.get("OS_CLOUDNAME") != "undercloud":
    log.error("Source stackrc before running this")
    sys.exit(1)
# Module-level state shared by the classes and the main script below
# XML namespace map used when looking up the nova:* metadata elements
# of a libvirt domain XML
nova_ns = {'nova': 'http://openstack.org/xmlns/libvirt/nova/1.0' }
# Hypervisor objects indexed by server name (no default factory is given,
# so a missing key raises KeyError like a regular dict)
hypervisors = defaultdict()
# First Hypervisor whose name matches controller_rex; used to query the DB
controller = None
# Counters
# Number of problems detected during the run (incremented via "global errors")
errors = 0
# Number of active instances with a parsable numa topology found in the DB
instance_count = 0
class BaseObject():
    """
    Common ancestor giving every class the same __repr__ layout:
    ClassName(attr=value, ...) with attributes sorted by name
    """
    def __repr__(self):
        # Attributes whose name starts with an underscore are left out
        public_names = [name for name in sorted(self.__dict__.keys())
                        if not name.startswith('_')]
        rendered = ["%s=%r" % (name, getattr(self, name))
                    for name in public_names]
        return "%s(%s)" % (self.__class__.__name__, ', '.join(rendered))
class Hypervisor(BaseObject):
    """
    Hypervisor object

    Holds, for one node, the pinset read from nova.conf, the pCPU usage
    recorded in the Nova DB, the pCPU usage seen in the process list and
    the instances attached to the node.
    """
    def __init__(self, **kwargs):
        self.name = None
        self.ip = None
        self.role = None
        # Parsed vcpu_pin_set and the raw line it was parsed from
        self.pinset_list = list()
        self.pinset_line = None
        # pCPU -> number of KVM threads seen on it in the process list
        self.ps_kvm_cpu = defaultdict(int)
        # pCPU -> usage count by pinned instances (process list / Nova DB)
        self.ps_pinned_cpu = defaultdict(int)
        self.db_pinned_cpu = defaultdict(int)
        # pCPUs of the pinset that are not used (Nova DB / process list)
        self.db_unused_pin = list()
        self.ps_unused_pin = list()
        # pid -> pCPU -> number of threads
        self.ps_pid_cpu = defaultdict(lambda: defaultdict(int))
        # Instance objects indexed by uuid
        self.instances = defaultdict()
        self.errors = list()
        self.__dict__.update(kwargs)

    def ssh(self, cmd):
        """
        ssh wrapper: runs cmd on the host as heat-admin.

        Returns a (output, broken) tuple. output is the text printed by the
        command (stderr included) or None on failure; broken is True when
        the command could not be run successfully.
        """
        global errors
        broken = False
        returned = None
        try:
            # universal_newlines makes the output text on Python 3 too
            returned = subprocess.check_output(
                "ssh -q heat-admin@%s \"%s\" 2>&1" % (self.ip, cmd),
                shell=True, universal_newlines=True)
        except Exception as e:
            log.error("[%s] Error sshing into host: %s" % (self, e))
            broken = True
            errors += 1
        return returned, broken

    def calc_unused(self, src):
        """
        Function to calculate unused pins

        src is 'db' or 'ps'. The matching <src>_unused_pin list is rebuilt
        from scratch on every call so that repeated calls neither duplicate
        entries nor keep pCPUs that became used in the meantime. Nothing is
        done while no pinset is known.
        """
        if not self.pinset_list:
            return
        pinned = getattr(self, src + "_pinned_cpu")
        unused = sorted(p for p in self.pinset_list if p not in pinned)
        setattr(self, src + "_unused_pin", unused)

    def get_pinset(self):
        """
        Parse the vcpu_pin_set line from nova.conf
        using nova's parse_cpu_spec() function

        Returns the "broken" flag of the ssh call (truthy when the host
        could not be reached).
        """
        global errors
        line = None
        oc_pin_set, broken = self.ssh("sudo crudini --get \\$(sudo ls -1t /var/lib/config-data/puppet-generated/nova_libvirt/etc/nova/nova.conf /etc/nova/nova.conf 2>/dev/null | head -1) DEFAULT vcpu_pin_set | cat 2>&1")
        if broken or oc_pin_set is None:
            # ssh() already logged and counted the failure
            return broken
        try:
            line = "".join(oc_pin_set).rstrip()
            self.pinset_list = parse_cpu_spec(line)
            self.pinset_line = line
        except Exception:
            log.error("[%s] Unable to parse vcpu_pin_set: %s" % (self, line))
            errors += 1
        return broken

    def db_pin_cpu(self, uuid, cpu):
        """
        Wrapper called when mapping the DB's topology
        to this object
        """
        self.db_pinned_cpu[cpu] += 1
        self.instances[uuid].db_pcpus[cpu] += 1
        self.calc_unused('db')

    def ps_pin_cpu(self, pid, uuid, cpu, instance_id):
        """
        Wrapper called when mapping the output of ps
        to this object
        """
        global errors
        instance = self.instances.get(uuid)
        if instance is None:
            # A KVM process runs here for an instance the DB query did not
            # return for this host: report it instead of raising KeyError
            log.error("[%s] KVM process (pid %s) found for instance %s which is not in Nova DB for this host" % (self.name, pid, uuid))
            errors += 1
            return
        instance.instance_id = instance_id
        instance.pid = pid
        instance.ps_pcpus[cpu] += 1
        self.ps_kvm_cpu[cpu] += 1
        self.ps_pid_cpu[pid][cpu] += 1
        self.calc_unused('ps')
        if cpu not in self.pinset_list:
            errors += 1
            instance.outside_pcpu.append(str(cpu))
            instance.add_error("pCPU outside of pinset")

    def check_ps_cpus(self):
        """
        Count, per pCPU seen in the process list, the instances using it
        that have no vcpu cpuset, and flag those instances when more than
        one KVM thread was seen on that pCPU.
        """
        for cpu in self.ps_kvm_cpu:
            log.debug("[%s] pCPU %s is having %s kvm" % (self, cpu, self.ps_kvm_cpu[cpu]))
            for i in self.instances:
                instance = self.instances[i]
                log.debug("[%s] instance vcpu_pinset %s pcpus %s checking for cpu %s" % (instance.uuid, instance.vcpu_pinset, instance.ps_pcpus, cpu))
                if cpu in instance.ps_pcpus and not instance.vcpu_pinset:
                    self.ps_pinned_cpu[cpu] += 1
                    if self.ps_kvm_cpu[cpu] > 1:
                        instance.add_error("Some pCPUs are shared")
                        instance.shared_pcpu.append(str(cpu))

    def get_ps(self):
        """
        Function that retrieves the process list of an hypervisor
        and records every KVM thread through ps_pin_cpu()
        """
        oc_processes = self.ssh("ps -o cpuid,pid,comm,command -eL | grep '/KVM' | grep -v grep | cat")[0]
        if not oc_processes:
            # ssh failed (already logged) or no KVM thread on this host
            return
        # Parsing the process list
        for line in oc_processes.splitlines():
            fields = line.split()
            try:
                cpu = int(fields[0])
                pid = fields[1]
            except (IndexError, ValueError):
                log.error("[%s] Unable to parse process list line: %s" % (self.name, fields))
                continue
            # Fields 2 and 3 are skipped; the qemu arguments are searched
            # from the fifth field onwards
            command = " ".join(fields[4:])
            uuid_match = uuid_rex.search(command)
            instance_id_match = instance_id_rex.search(command)
            if not (uuid_match and instance_id_match):
                log.error("[%s] qemu-process didn't have a UUID in its arguments: %s" % (self.name, fields))
                continue
            self.ps_pin_cpu(pid, uuid_match.group(1), cpu, instance_id_match.group(1))

    def get_instances_by_cpu(self, cpu):
        """
        Return the instances having at least one thread seen on cpu
        """
        return [self.instances[i] for i in self.instances
                if cpu in self.instances[i].ps_pcpus]

    def validate_pin(self):
        """
        Compare the pCPUs pinned according to the Nova DB with the ones
        seen in the process list. Returns True when a discrepancy is found.

        Note that db_pinned_cpu and ps_pinned_cpu are turned into sorted
        lists of pCPUs by this method.
        """
        global errors
        failed = False
        # Both pin maps are complete at this point: refresh the unused lists
        self.calc_unused('db')
        self.calc_unused('ps')
        self.db_pinned_cpu = sorted(self.db_pinned_cpu)
        self.ps_pinned_cpu = sorted(self.ps_pinned_cpu)
        if self.ps_pinned_cpu and not self.db_pinned_cpu:
            log.error("[%s] Found KVM process on host but not in Nova DB" % (self.name))
            errors += 1
            failed = True
            for cpu in self.ps_pinned_cpu:
                for instance in self.get_instances_by_cpu(cpu):
                    instance.outside_pcpu.append(str(cpu))
                    instance.add_error("Instance pinned on host but not in Nova DB")
        if not self.ps_pinned_cpu and self.db_pinned_cpu:
            log.error("[%s] Found pins in Nova DB (%s), but not used by KVM on host (%s)" % (self.name, self.db_pinned_cpu, self.ps_pinned_cpu))
            errors += 1
            failed = True
        if not failed:
            # The lists should be the same on each host.
            if self.db_pinned_cpu != self.ps_pinned_cpu:
                log.error("[%s] Mismatch between Nova DB and processes on host: NovaDB: %s Host: %s" % (self.name, self.db_pinned_cpu, self.ps_pinned_cpu))
                errors += 1
                failed = True
                for cpu in self.ps_pinned_cpu:
                    if cpu not in self.db_pinned_cpu:
                        for instance in self.get_instances_by_cpu(cpu):
                            instance.outside_pcpu.append(str(cpu))
                            instance.add_error("Instance using CPU outside of topology")
        return failed
class Instance(BaseObject):
    """
    Instance object

    Holds what is known about one instance: its pCPU usage according to
    the Nova DB, the libvirt XML and the process list, and the errors
    detected for it.
    """
    def __init__(self, **kwargs):
        self.name = None
        self.uuid = None
        self.state = None
        # Name of the hypervisor (key of the module-level hypervisors dict)
        self.hypervisor = None
        # libvirt domain name and pid, filled in from the process list
        self.instance_id = None
        self.pid = None
        self.db_pcpus = defaultdict(int)
        self.xml_pcpus = defaultdict(int)
        self.ps_pcpus = defaultdict(int)
        self.outside_pcpu = list()
        self.shared_pcpu = list()
        self.disk_size = 0
        self.vcpu_pinset = None
        self.vcpu_pinset_list = list()
        self.dumpxml = None
        self.errors = list()
        self.__dict__.update(kwargs)

    def add_error(self, msg):
        """
        Record msg in this instance's error list, once
        """
        if msg not in self.errors:
            self.errors.append(msg)

    def get_host(self):
        """
        Return the Hypervisor object this instance is attached to
        """
        return hypervisors[self.hypervisor]

    def get_disksize(self):
        """
        Store the size reported by stat for the instance's disk file.
        disk_size is left untouched when the ssh call fails.
        """
        output = self.get_host().ssh("sudo stat -c %%s /var/lib/nova/instances/%s/disk" % self.uuid)[0]
        if output is not None:
            self.disk_size = output.rstrip()

    def get_xml(self):
        """
        Store the output of virsh dumpxml for this instance
        """
        if not self.instance_id:
            # No KVM process was matched to this instance, so the libvirt
            # domain name is unknown and there is nothing to dump
            log.debug("[%s] No libvirt domain known for instance, skipping dumpxml" % self.uuid)
            return
        self.dumpxml = self.get_host().ssh("sudo virsh dumpxml %s" % self.instance_id)[0]

    def get_name(self):
        """
        Read the instance name from the nova metadata of the domain XML
        """
        if not self.dumpxml:
            return
        # Getting instance metadata
        try:
            self.name = ET.fromstring(self.dumpxml)\
                .find('metadata')\
                .find('nova:instance', nova_ns)\
                .find('nova:name', nova_ns).text
        except Exception as error:  # pylint: disable=broad-except
            log.error("[%s] Unable to find instance name: %s" % (self, error))
            log.error("%s" % traceback.format_exc())

    def get_xml_pins(self):
        """
        Read the CPU placement from the domain XML.

        When <vcpu> carries a cpuset attribute it is stored in vcpu_pinset
        and its parsed CPUs are added to vcpu_pinset_list. Otherwise
        vcpu_pinset is None and the cpuset of every <cputune>/<vcpupin>
        element is counted in xml_pcpus.
        """
        if not self.dumpxml:
            return
        try:
            root = ET.fromstring(self.dumpxml)
        except ET.ParseError as error:
            log.error("[%s] Unable to parse domain XML: %s" % (self.uuid, error))
            return
        vcpu = root.find('vcpu')
        cpuset = vcpu.attrib.get('cpuset') if vcpu is not None else None
        if cpuset is None:
            self.vcpu_pinset = None
            cputune = root.find('cputune')
            if cputune is not None:
                for item in cputune.iter('vcpupin'):
                    log.debug("[%s] instance pinned on %s" % (self.uuid, item.attrib))
                    self.xml_pcpus[item.attrib['cpuset']] += 1
            return
        self.vcpu_pinset = cpuset
        self.vcpu_pinset_list.extend(parse_cpu_spec(self.vcpu_pinset))
# Undercloud credentials, taken from the environment
VERSION = 2
AUTH_URL = os.environ['OS_AUTH_URL']
USERNAME = os.environ['OS_USERNAME']
PASSWORD = os.environ['OS_PASSWORD']
USER_DOMAIN_NAME = None
PROJECT_DOMAIN_NAME = None
# Extra keyword arguments for the nova client: the tenant name when
# OS_TENANT_NAME is set, the project and domain names otherwise
if 'OS_TENANT_NAME' not in os.environ:
    project_dict = dict(
        project_name=os.environ['OS_PROJECT_NAME'],
        project_domain_name=os.environ['OS_PROJECT_DOMAIN_NAME'],
        user_domain_name=os.environ['OS_USER_DOMAIN_NAME'])
else:
    project_dict = dict(project_id=os.environ['OS_TENANT_NAME'])
if __name__ == '__main__':
    # Flow: list the overcloud nodes from the undercloud, read the numa
    # topology of every instance from the overcloud DB through a controller,
    # then compare it on each compute with nova.conf and the process list.
    # NOTE(review): the loop variables `p`, `i` and `instance` below are
    # module-level names; keep them as-is unless db_pin_cpu(), get_disksize()
    # and add_error() are confirmed not to read them as globals.

    # Preparing the openstack environment
    log.debug("Poking the undercloud to get list of hypervisors")
    nova = client.Client(VERSION, USERNAME, PASSWORD, auth_url=AUTH_URL, connection_pool=True, **project_dict)
    servers = nova.servers.list(detailed=True)
    # We're getting a list of all the hypervisors and their IPs to ssh in later
    for server in servers:
        try:
            ctlplane_ip = server.networks['ctlplane'][0]
        except (KeyError, IndexError):
            # Without a ctlplane address there is no way to ssh into the node
            log.error("[%s] No ctlplane address found, skipping" % server.name)
            continue
        hypervisor = Hypervisor(name=server.name, ip=ctlplane_ip)
        if re.search(controller_rex, server.name):
            hypervisor.role = "Controller"
            # The first controller found is the one used to query the DB
            if not controller:
                controller = hypervisor
        else:
            hypervisor.role = "Compute"
        hypervisors[server.name] = hypervisor
    log.debug("%i hypervisors (including controllers)" % len(hypervisors))
    if not hypervisors:
        log.error("No hypervisor found in the undercloud?")
        sys.exit(1)
    if not controller:
        # controller.ssh() below would otherwise fail on None
        log.error("No controller found in the undercloud, controller_rex might need to be adjusted")
        sys.exit(1)
    log.debug("[%s] Querying the overcloud DB to get numa topologogy" % controller)
    # Getting the numa topology from the overcloud
    # We have to ssh into the controllers because normally, the mysql process isn't accessible from outside
    oc_db_data, broken = controller.ssh("sudo mysql -N -s -D nova -u root --password=\\$(sudo hiera -c /etc/puppet/hiera.yaml mysql::server::root_password) -e 'select node,instance_uuid,vm_state,numa_topology from instance_extra a left join instances b on a.instance_uuid = b.uuid where b.deleted = 0;'")
    if broken:
        log.error("Unable to ssh in the controller")
        sys.exit(1)
    # parsing the mysql output here
    for line in oc_db_data.splitlines():
        l = line.split()
        if len(l) < 3:
            log.debug("Skipping unparsable DB row: %s" % line)
            continue
        # We have to strip the domain here
        instance = Instance(hypervisor=l[0].split('.')[0], uuid=l[1], state=l[2])
        host = hypervisors.get(instance.hypervisor)
        if host is None:
            # The DB row points to a node the undercloud did not list
            log.warning("[%s] Instance %s is on a host unknown to the undercloud, skipping" % (instance.hypervisor, instance.uuid))
            continue
        host.instances[instance.uuid] = instance
        log.debug("Creating instance on Host %s Instance %s" % (host.name, instance.uuid))
        try:
            data = json.loads(" ".join(l[3:]))
        except ValueError:
            # Instance has no topology defined, we just skip it
            log.debug("Instance %s has no topology defined" % instance)
            continue
        if instance.state == 'active':
            d = data['nova_object.data']['cells']
            instance_count += 1
            for cell in d:
                if not isinstance(cell['nova_object.data'], list):
                    if cell['nova_object.data']['cpu_pinning_raw']:
                        for v, p in cell['nova_object.data']['cpu_pinning_raw'].items():
                            host.db_pin_cpu(instance.uuid, p)
                    else:
                        log.debug("[%s] Instance has no pins defined in the extra_spec numa_topology object" % (instance))
    log.debug("%i instances found" % instance_count)
    if instance_count == 0:
        log.error("No valid (active+pinned) instance found, quitting")
        sys.exit(0)
    # Only the computes are inspected from here on
    compute_names = [name for name in hypervisors
                     if hypervisors[name].role == 'Compute']
    # Looping through all the hypervisors
    for hostname in compute_names:
        host = hypervisors[hostname]
        # Getting the pinset configuration in nova.conf
        ssh_failed = host.get_pinset()
        if ssh_failed:
            log.error("[%s] host not responding to ssh" % (host))
            continue
        if not host.pinset_list:
            log.debug("[%s] No pinset defined" % host)
            continue
        # Getting process list for host
        host.get_ps()
        # Getting instances' metadata
        for i in host.instances:
            log.debug("Instance %s on host %s" % (i, host.name))
            instance = host.instances[i]
            instance.get_disksize()
            instance.get_xml()
            instance.get_name()
            instance.get_xml_pins()
        # Let's count the cpus
        host.check_ps_cpus()
        # Let's make sure we have data on both sides
        host.validate_pin()
        if host.db_pinned_cpu:
            log.debug("[%s] (DB) Unused pins: %s" % (host.name, host.db_unused_pin))
        if host.ps_pinned_cpu:
            log.debug("[%s] (PS) Unused pins: %s" % (host.name, host.ps_unused_pin))
    print("Run completed, %i errors" % errors)
    # Outputing some CSV
    if errors:
        print('"%s","%s","%s","%s","%s","%s","%s"' % ("Instance UUID", "Host Name", "Instance Name", "Ephemeral Disk Size", "Shared pCPUs", "Outside pCPUs", "Errors"))
        for hostname in compute_names:
            host = hypervisors[hostname]
            log.debug("Hypervisor %s Role: %s # of instances: %s" % (host.name, host.role, len(host.instances)))
            for i in host.instances:
                instance = host.instances[i]
                # Only instances with at least one recorded error are listed
                if instance.errors:
                    print('"%s","%s","%s","%s","%s","%s","%s"' % (i, instance.hypervisor, instance.name, instance.disk_size, ",".join(instance.shared_pcpu), ",".join(instance.outside_pcpu), ",".join(instance.errors)))