Skip to content

Commit

Permalink
QEMU: Fix for deadlock on VM delete.
Browse files Browse the repository at this point in the history
It seems that libvirt can sometimes deadlock if two threads
simultaneously try to close the console log stream and delete
(undefine) the VM. I am not sure if this is intended behavior or not.

This change modifies the node shutdown flow so that the console log is
fully closed before attempting to undefine the VM. Hopefully this will
prevent the deadlock.

The watchdog timer has been kept in case there are other ways this
deadlock can occur.
  • Loading branch information
cwize1 authored and squirrelsc committed Apr 18, 2022
1 parent 5d6deb7 commit 56c4437
Show file tree
Hide file tree
Showing 4 changed files with 56 additions and 84 deletions.
22 changes: 5 additions & 17 deletions lisa/sut_orchestrator/libvirt/ch_platform.py
Original file line number Diff line number Diff line change
Expand Up @@ -153,32 +153,20 @@ def _create_node_domain_xml(
xml = ET.tostring(domain, "unicode")
return xml

def _stop_and_delete_vm(
self,
environment: Environment,
log: Logger,
lv_conn: libvirt.virConnect,
node: Node,
) -> None:
super()._stop_and_delete_vm_flags(
environment,
log,
lv_conn,
node,
0, # ch driver currently doesn't support any flags
)
def _get_domain_undefine_flags(self) -> int:
    """Return the libvirt flags to pass to virDomain.undefineFlags().

    The cloud-hypervisor (ch) driver currently doesn't support any
    undefine flags, so this override always returns 0.
    """
    return 0

def _create_domain_and_attach_logger(
self,
libvirt_conn: libvirt.virConnect,
domain: libvirt.virDomain,
node_context: NodeContext,
) -> None:
domain.createWithFlags(0)
assert node_context.domain
node_context.domain.createWithFlags(0)

assert node_context.console_logger
node_context.console_logger.attach(
libvirt_conn, domain, node_context.console_log_file_path
libvirt_conn, node_context.domain, node_context.console_log_file_path
)

# Create the OS disk.
Expand Down
9 changes: 6 additions & 3 deletions lisa/sut_orchestrator/libvirt/console_logger.py
Original file line number Diff line number Diff line change
Expand Up @@ -41,11 +41,14 @@ def attach(
self._console_stream_callback_started = True

# Close the logger.
def close(self) -> None:
def close(self, abort: bool = True) -> None:
# Check if attach() run successfully.
if self._console_stream_callback_started:
# Close the stream on libvirt callbacks thread.
libvirt_events_thread.run_callback(self._close_stream, True)
if abort:
# Close the stream on libvirt callbacks thread.
libvirt_events_thread.run_callback(self._close_stream, True)

# Wait for stream to close.
self._stream_completed.wait()

else:
Expand Down
6 changes: 5 additions & 1 deletion lisa/sut_orchestrator/libvirt/context.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
from dataclasses import dataclass, field
from typing import Any, Dict, List, Optional

import libvirt # type: ignore

from lisa.environment import Environment
from lisa.node import Node

Expand Down Expand Up @@ -34,11 +36,13 @@ class NodeContext:
os_disk_file_path: str = ""
console_log_file_path: str = ""
extra_cloud_init_user_data: List[Dict[str, Any]] = field(default_factory=list)
console_logger: Optional[QemuConsoleLogger] = None
use_bios_firmware: bool = False
data_disks: List[DataDiskContext] = field(default_factory=list)
next_disk_index: int = 0

console_logger: Optional[QemuConsoleLogger] = None
domain: Optional[libvirt.virDomain] = None


def get_environment_context(environment: Environment) -> EnvironmentContext:
return environment.get_context(EnvironmentContext)
Expand Down
103 changes: 40 additions & 63 deletions lisa/sut_orchestrator/libvirt/platform.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,8 +134,7 @@ def _deploy_environment(self, environment: Environment, log: Logger) -> None:
self._deploy_nodes(environment, log)

def _delete_environment(self, environment: Environment, log: Logger) -> None:
with libvirt.open(self.libvirt_conn_str) as lv_conn:
self._delete_nodes(environment, log, lv_conn)
self._delete_nodes(environment, log)

def _configure_environment(self, environment: Environment, log: Logger) -> None:
environment_context = get_environment_context(environment)
Expand Down Expand Up @@ -287,7 +286,7 @@ def _deploy_nodes(self, environment: Environment, log: Logger) -> None:
environment.platform.runbook.keep_environment
== constants.ENVIRONMENT_KEEP_NO
):
self._delete_nodes(environment, log, lv_conn)
self._delete_nodes(environment, log)

raise ex

Expand Down Expand Up @@ -376,7 +375,6 @@ def _configure_node(
node_context.console_log_file_path = str(
node.local_log_path / "qemu-console.log"
)
node_context.console_logger = QemuConsoleLogger()

# Read extra cloud-init data.
extra_user_data = (
Expand Down Expand Up @@ -417,22 +415,22 @@ def _configure_node(
def _create_domain_and_attach_logger(
self,
libvirt_conn: libvirt.virConnect,
domain: libvirt.virDomain,
node_context: NodeContext,
) -> None:
# Start the VM in the paused state.
# This gives the console logger a chance to connect before the VM starts
# for real.
domain.createWithFlags(libvirt.VIR_DOMAIN_START_PAUSED)
assert node_context.domain
node_context.domain.createWithFlags(libvirt.VIR_DOMAIN_START_PAUSED)

# Attach the console logger
assert node_context.console_logger
node_context.console_logger = QemuConsoleLogger()
node_context.console_logger.attach(
libvirt_conn, domain, node_context.console_log_file_path
libvirt_conn, node_context.domain, node_context.console_log_file_path
)

# Start the VM.
domain.resume()
node_context.domain.resume()

# Create all the VMs.
def _create_nodes(
Expand Down Expand Up @@ -480,92 +478,71 @@ def _create_node(

# Create libvirt domain (i.e. VM).
xml = self._create_node_domain_xml(environment, log, node)
domain = lv_conn.defineXML(xml)
node_context.domain = lv_conn.defineXML(xml)

self._create_domain_and_attach_logger(
lv_conn,
domain,
node_context,
)

# Delete all the VMs.
def _delete_nodes(
self, environment: Environment, log: Logger, lv_conn: libvirt.virConnect
) -> None:
def _delete_nodes(self, environment: Environment, log: Logger) -> None:
# Delete nodes.
for node in environment.nodes.list():
node_context = get_node_context(node)

# Shutdown and delete the VM.
log.debug(f"Delete VM: {node_context.vm_name}")
self._stop_and_delete_vm(environment, log, lv_conn, node)

log.debug(f"Close VM console log: {node_context.vm_name}")
assert node_context.console_logger
node_context.console_logger.close()
self._delete_node(node, log)

# Delete VM disks directory.
try:
self.host_node.shell.remove(Path(self.vm_disks_dir), True)
except Exception as ex:
log.warning(f"Failed to delete VM files directory: {ex}")

def _watchdog_callback(self) -> None:
def _delete_node_watchdog_callback(self) -> None:
print("VM delete watchdog timer fired.\n", file=sys.__stderr__)
faulthandler.dump_traceback(file=sys.__stderr__, all_threads=True)
os._exit(1)

# Delete a VM.
def _stop_and_delete_vm_flags(
self,
environment: Environment,
log: Logger,
lv_conn: libvirt.virConnect,
node: Node,
lv_undefine_flags: int,
) -> None:
def _delete_node(self, node: Node, log: Logger) -> None:
node_context = get_node_context(node)

watchdog = Timer(60.0, self._watchdog_callback)
watchdog = Timer(60.0, self._delete_node_watchdog_callback)
watchdog.start()

# Find the VM.
try:
domain = lv_conn.lookupByName(node_context.vm_name)
except libvirt.libvirtError as ex:
log.warning(f"VM delete failed. Can't find domain. {ex}")
return

# Stop the VM.
try:
# In the libvirt API, "destroy" means "stop".
domain.destroy()
except libvirt.libvirtError as ex:
log.warning(f"VM stop failed. {ex}")
if node_context.domain:
log.debug(f"Stop VM: {node_context.vm_name}")
try:
# In the libvirt API, "destroy" means "stop".
node_context.domain.destroy()
except libvirt.libvirtError as ex:
log.warning(f"VM stop failed. {ex}")

# Wait for console log to close.
# Note: libvirt can deadlock if you try to undefine the VM while the stream
# is trying to close.
if node_context.console_logger:
log.debug(f"Close VM console log: {node_context.vm_name}")
node_context.console_logger.close()
node_context.console_logger = None

# Undefine the VM.
try:
domain.undefineFlags(lv_undefine_flags)
except libvirt.libvirtError as ex:
log.warning(f"VM delete failed. {ex}")
if node_context.domain:
log.debug(f"Delete VM: {node_context.vm_name}")
try:
node_context.domain.undefineFlags(self._get_domain_undefine_flags())
except libvirt.libvirtError as ex:
log.warning(f"VM delete failed. {ex}")

node_context.domain = None

watchdog.cancel()

def _stop_and_delete_vm(
self,
environment: Environment,
log: Logger,
lv_conn: libvirt.virConnect,
node: Node,
) -> None:
self._stop_and_delete_vm_flags(
environment,
log,
lv_conn,
node,
def _get_domain_undefine_flags(self) -> int:
    """Return the libvirt flags to pass to virDomain.undefineFlags().

    Requests cleanup of all auxiliary VM state alongside the domain
    definition: managed save images, snapshot metadata, NVRAM files,
    and checkpoint metadata.
    """
    # NOTE: the source span contained two near-duplicate copies of the
    # CHECKPOINTS_METADATA line (a diff artifact); only one is kept.
    return int(
        libvirt.VIR_DOMAIN_UNDEFINE_MANAGED_SAVE
        | libvirt.VIR_DOMAIN_UNDEFINE_SNAPSHOTS_METADATA
        | libvirt.VIR_DOMAIN_UNDEFINE_NVRAM
        | libvirt.VIR_DOMAIN_UNDEFINE_CHECKPOINTS_METADATA
    )

# Retrieve the VMs' dynamic properties (e.g. IP address).
Expand Down

0 comments on commit 56c4437

Please sign in to comment.