diff --git a/.github/workflows/test-using-pytest.yml b/.github/workflows/test-using-pytest.yml index 732a646ea..99d799036 100644 --- a/.github/workflows/test-using-pytest.yml +++ b/.github/workflows/test-using-pytest.yml @@ -18,7 +18,7 @@ jobs: run: | sudo apt-get update sudo apt-get -y upgrade - sudo apt-get install -y python3 python3-pip python3-aiohttp python3-msgpack python3-aiodns python3-alembic python3-sqlalchemy python3-setproctitle redis python3-aioredis python3-psutil sudo acl curl systemd-container squashfs-tools debootstrap python3-packaging python3-cpuinfo python3-nftables python3-jsonschema nftables + sudo apt-get install -y python3 python3-pip python3-aiohttp python3-msgpack python3-aiodns python3-alembic python3-sqlalchemy python3-setproctitle redis python3-aioredis python3-psutil sudo acl curl systemd-container squashfs-tools debootstrap python3-packaging python3-nftables python3-jsonschema nftables lshw python3-jwcrypto pip install --upgrade typing-extensions types-PyYAML - name: Install required Python packages diff --git a/docker/vm_supervisor-dev.dockerfile b/docker/vm_supervisor-dev.dockerfile index da730aca8..3214e5494 100644 --- a/docker/vm_supervisor-dev.dockerfile +++ b/docker/vm_supervisor-dev.dockerfile @@ -5,7 +5,7 @@ FROM debian:bullseye RUN apt-get update && apt-get -y upgrade && apt-get install -y \ sudo acl curl squashfs-tools git \ python3 python3-aiohttp python3-alembic python3-msgpack python3-pip python3-aiodns python3-aioredis\ - python3-nftables python3-psutil python3-setproctitle python3-sqlalchemy python3-packaging python3-cpuinfo ndppd nftables \ + python3-nftables python3-psutil python3-setproctitle python3-sqlalchemy python3-packaging ndppd nftables \ && rm -rf /var/lib/apt/lists/* RUN useradd jailman diff --git a/packaging/aleph-vm/DEBIAN/control b/packaging/aleph-vm/DEBIAN/control index 6b42eea41..50a45d810 100644 --- a/packaging/aleph-vm/DEBIAN/control +++ b/packaging/aleph-vm/DEBIAN/control @@ -3,6 +3,6 @@ Version: 0.1.8 Architecture: all Maintainer: Aleph.im Description: Aleph.im VM execution engine -Depends: python3,python3-pip,python3-aiohttp,python3-msgpack,python3-aiodns,python3-alembic,python3-sqlalchemy,python3-setproctitle,redis,python3-aioredis,python3-psutil,sudo,acl,curl,systemd-container,squashfs-tools,debootstrap,python3-packaging,python3-cpuinfo,python3-nftables,python3-jsonschema,cloud-image-utils,ndppd,python3-yaml,python3-dotenv,python3-schedule,qemu-system-x86,qemu-utils,python3-systemd,python3-dbus,btrfs-progs,nftables,python3-jwcrypto +Depends: python3,python3-pip,python3-aiohttp,python3-msgpack,python3-aiodns,python3-alembic,python3-sqlalchemy,python3-setproctitle,redis,python3-aioredis,python3-psutil,sudo,acl,curl,systemd-container,squashfs-tools,debootstrap,python3-packaging,python3-nftables,python3-jsonschema,cloud-image-utils,ndppd,python3-yaml,python3-dotenv,python3-schedule,qemu-system-x86,qemu-utils,python3-systemd,python3-dbus,btrfs-progs,nftables,lshw,python3-jwcrypto Section: aleph-im Priority: Extra diff --git a/pyproject.toml b/pyproject.toml index 95ed874a5..1a2c2443e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -34,7 +34,6 @@ dependencies = [ "sentry-sdk==1.31.0", "aioredis==1.3.1", "psutil==5.9.5", - "py-cpuinfo==9.0.0", "schedule==1.2.1", "nftables @ git+https://salsa.debian.org/pkg-netfilter-team/pkg-nftables#egg=nftables&subdirectory=py", "msgpack==1.0.7", diff --git a/src/aleph/vm/orchestrator/machine.py b/src/aleph/vm/orchestrator/machine.py new file mode 100644 index 000000000..55f1072ee --- /dev/null +++ b/src/aleph/vm/orchestrator/machine.py @@ -0,0 +1,74 @@ +import asyncio +import json +import re + +import psutil + + +async def get_hardware_info(): + lshw = await asyncio.create_subprocess_shell( + "lshw -sanitize -json", stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE + ) + + output, _ = await lshw.communicate() + data = json.loads(output) + + hw_info = {"cpu": None, "memory": None} + + for hw in data["children"][0]["children"]: + if hw["id"] == "cpu": + hw_info["cpu"] = hw + elif hw["class"] == "memory" and hw["id"] == "memory": + hw_info["memory"] = hw + + return hw_info + + +def get_cpu_info(hw): + cpu_info = hw["cpu"] + architecture = cpu_info["width"] + + if "x86_64" in cpu_info["capabilities"] or "x86-64" in cpu_info["capabilities"]: + architecture = "x86_64" + elif "arm64" in cpu_info["capabilities"] or "arm-64" in cpu_info["capabilities"]: + architecture = "arm64" + + vendor = cpu_info["vendor"] + # lshw vendor implementation => https://github.com/lyonel/lshw/blob/15e4ca64647ad119b69be63274e5de2696d3934f/src/core/cpuinfo.cc#L308 + + if "Intel Corp" in vendor: + vendor = "GenuineIntel" + elif "Advanced Micro Devices [AMD]" in vendor: + vendor = "AuthenticAMD" + + return { + "architecture": architecture, + "vendor": vendor, + "model": cpu_info["product"], + "frequency": cpu_info["capacity"], + "count": psutil.cpu_count(), + } + + +def get_memory_info(hw): + mem_info = hw["memory"] + + memory_type = "" + memory_clock = "" + for bank in mem_info["children"]: + memory_clock = bank.get("clock") + if "description" in bank: + matched = re.search("(DDR[2-6])", bank["description"]) + if matched: + memory_type = matched.group(0) + break + else: + pass + + return { + "size": mem_info["size"], + "units": mem_info["units"], + "type": memory_type, + "clock": memory_clock, + "clock_units": "Hz" if memory_clock is not None else "", + } diff --git a/src/aleph/vm/orchestrator/resources.py b/src/aleph/vm/orchestrator/resources.py index 448a822c5..009bb6579 100644 --- a/src/aleph/vm/orchestrator/resources.py +++ b/src/aleph/vm/orchestrator/resources.py @@ -1,9 +1,7 @@ import math from datetime import datetime, timezone -from functools import lru_cache from typing import Optional -import cpuinfo import psutil from aiohttp import web from aleph_message.models import ItemHash @@ -11,7 +9,12 @@ from pydantic import BaseModel, Field from aleph.vm.conf import settings -from aleph.vm.utils import cors_allow_all +from aleph.vm.orchestrator.machine import ( + get_cpu_info, + get_hardware_info, + get_memory_info, +) +from aleph.vm.utils import async_cache, cors_allow_all class Period(BaseModel): @@ -77,16 +80,69 @@ class MachineUsage(BaseModel): active: bool = True -@lru_cache -def get_machine_properties() -> MachineProperties: +class ExtendedCpuProperties(CpuProperties): + """CPU properties.""" + + model: Optional[str] = Field(default=None, description="CPU model") + frequency: Optional[str] = Field(default=None, description="CPU frequency") + count: Optional[str] = Field(default=None, description="CPU count") + + +class MemoryProperties(BaseModel): + """MEMORY properties.""" + + size: Optional[str] = Field(default=None, description="Memory size") + units: Optional[str] = Field(default=None, description="Memory size units") + type: Optional[str] = Field(default=None, description="Memory type") + clock: Optional[str] = Field(default=None, description="Memory clock") + clock_units: Optional[str] = Field(default=None, description="Memory clock units") + + +class MachineCapability(BaseModel): + cpu: ExtendedCpuProperties + memory: MemoryProperties + + +machine_properties_cached = None + + +@async_cache +async def get_machine_properties() -> MachineProperties: """Fetch machine properties such as architecture, CPU vendor, ... These should not change while the supervisor is running. In the future, some properties may have to be fetched from within a VM. """ - cpu_info = cpuinfo.get_cpu_info() # Slow + hw = await get_hardware_info() + cpu_info = get_cpu_info(hw) return MachineProperties( cpu=CpuProperties( + architecture=cpu_info["architecture"], + vendor=cpu_info["vendor"], + ), + ) + + +@async_cache +async def get_machine_capability() -> MachineCapability: + hw = await get_hardware_info() + cpu_info = get_cpu_info(hw) + mem_info = get_memory_info(hw) + + return MachineCapability( + cpu=ExtendedCpuProperties( + architecture=cpu_info["architecture"], + vendor=cpu_info["vendor"], + model=cpu_info["model"], + frequency=cpu_info["frequency"], + count=cpu_info["count"], + ), + memory=MemoryProperties( + size=mem_info["size"], + units=mem_info["units"], + type=mem_info["type"], + clock=mem_info["clock"], + clock_units=mem_info["clock_units"], architecture=cpu_info.get("raw_arch_string", cpu_info.get("arch_string_raw")), vendor=cpu_info.get("vendor_id", cpu_info.get("vendor_id_raw")), ), @@ -98,6 +154,7 @@ async def about_system_usage(_: web.Request): """Public endpoint to expose information about the system usage.""" period_start = datetime.now(timezone.utc).replace(second=0, microsecond=0) + machine_properties = await get_machine_properties() usage: MachineUsage = MachineUsage( cpu=CpuUsage( count=psutil.cpu_count(), @@ -116,12 +173,19 @@ async def about_system_usage(_: web.Request): start_timestamp=period_start, duration_seconds=60, ), - properties=get_machine_properties(), + properties=machine_properties, ) return web.json_response(text=usage.json(exclude_none=True)) +async def about_capability(_: web.Request): + """Public endpoint to expose information about the CRN capability.""" + + capability: MachineCapability = await get_machine_capability() + return web.json_response(text=capability.json(exclude_none=False)) + + class Allocation(BaseModel): """An allocation is the set of resources that are currently allocated on this orchestrator. It contains the item_hashes of all persistent VMs, instances, on-demand VMs and jobs. diff --git a/src/aleph/vm/orchestrator/supervisor.py b/src/aleph/vm/orchestrator/supervisor.py index 892106ba0..40269c9cb 100644 --- a/src/aleph/vm/orchestrator/supervisor.py +++ b/src/aleph/vm/orchestrator/supervisor.py @@ -21,7 +21,7 @@ from aleph.vm.version import __version__ from .metrics import create_tables, setup_engine -from .resources import about_system_usage +from .resources import about_capability, about_system_usage from .tasks import ( start_payment_monitoring_task, start_watch_for_messages_task, @@ -95,6 +95,7 @@ def setup_webapp(): web.get("/about/executions/details", about_executions), web.get("/about/executions/records", about_execution_records), web.get("/about/usage/system", about_system_usage), + web.get("/about/capability", about_capability), web.get("/about/config", about_config), # /control APIs are used to control the VMs and access their logs web.post("/control/allocation/notify", notify_allocation), diff --git a/src/aleph/vm/utils.py b/src/aleph/vm/utils.py index 63ce18253..e11a6af81 100644 --- a/src/aleph/vm/utils.py +++ b/src/aleph/vm/utils.py @@ -1,5 +1,6 @@ import asyncio import dataclasses +import functools import hashlib import json import logging @@ -211,3 +212,17 @@ def file_hashes_differ(source: Path, destination: Path, checksum: Callable[[Path return True return checksum(source) != checksum(destination) + + +def async_cache(fn): + """Simple async function cache decorator.""" + cache = {} + + @functools.wraps(fn) + async def wrapper(*args, **kwargs): + key = (args, frozenset(kwargs.items())) + if key not in cache: + cache[key] = await fn(*args, **kwargs) + return cache[key] + + return wrapper diff --git a/tests/supervisor/test_views.py b/tests/supervisor/test_views.py index 254e326df..a30cbf4ac 100644 --- a/tests/supervisor/test_views.py +++ b/tests/supervisor/test_views.py @@ -2,6 +2,7 @@ from aiohttp import web from aleph.vm.conf import settings +from aleph.vm.orchestrator.machine import get_hardware_info from aleph.vm.orchestrator.supervisor import setup_webapp @@ -40,16 +41,160 @@ async def test_system_usage(aiohttp_client): assert resp["cpu"]["count"] > 0 +FAKE_SYSTEM_INFO = { + "cpu": { + "id": "cpu", + "class": "processor", + "claimed": True, + "handle": "DMI:0400", + "description": "CPU", + "product": "AMD EPYC 7763 64-Core Processor", + "vendor": "Advanced Micro Devices [AMD]", + "physid": "400", + "businfo": "cpu@0", + "version": "25.1.1", + "slot": "CPU 0", + "units": "Hz", + "size": 2000000000, + "capacity": 2000000000, + "width": 64, + "configuration": {"cores": "8", "enabledcores": "8", "microcode": "167776681", "threads": "1"}, + "capabilities": { + "fpu": "mathematical co-processor", + "fpu_exception": "FPU exceptions reporting", + "wp": True, + "vme": "virtual mode extensions", + "de": "debugging extensions", + "pse": "page size extensions", + "tsc": "time stamp counter", + "msr": "model-specific registers", + "pae": "4GB+ memory addressing (Physical Address Extension)", + "mce": "machine check exceptions", + "cx8": "compare and exchange 8-byte", + "apic": "on-chip advanced programmable interrupt controller (APIC)", + "sep": "fast system calls", + "mtrr": "memory type range registers", + "pge": "page global enable", + "mca": "machine check architecture", + "cmov": "conditional move instruction", + "pat": "page attribute table", + "pse36": "36-bit page size extensions", + "clflush": True, + "mmx": "multimedia extensions (MMX)", + "fxsr": "fast floating point save/restore", + "sse": "streaming SIMD extensions (SSE)", + "sse2": "streaming SIMD extensions (SSE2)", + "ht": "HyperThreading", + "syscall": "fast system calls", + "nx": "no-execute bit (NX)", + "mmxext": "multimedia extensions (MMXExt)", + "fxsr_opt": True, + "pdpe1gb": True, + "rdtscp": True, + "rep_good": True, + "nopl": True, + "cpuid": True, + "extd_apicid": True, + "tsc_known_freq": True, + "pni": True, + "pclmulqdq": True, + "ssse3": True, + "fma": True, + "cx16": True, + "pcid": True, + "sse4_1": True, + "sse4_2": True, + "x2apic": True, + "movbe": True, + "popcnt": True, + "tsc_deadline_timer": True, + "aes": True, + "xsave": True, + "avx": True, + "f16c": True, + "rdrand": True, + "hypervisor": True, + "lahf_lm": True, + "cmp_legacy": True, + "svm": True, + "cr8_legacy": True, + "abm": True, + "sse4a": True, + "misalignsse": True, + "3dnowprefetch": True, + "osvw": True, + "perfctr_core": True, + "invpcid_single": True, + "ssbd": True, + "ibrs": True, + "ibpb": True, + "stibp": True, + "vmmcall": True, + "fsgsbase": True, + "tsc_adjust": True, + "bmi1": True, + "avx2": True, + "smep": True, + "bmi2": True, + "erms": True, + "invpcid": True, + "rdseed": True, + "adx": True, + "clflushopt": True, + "clwb": True, + "sha_ni": True, + "xsaveopt": True, + "xsavec": True, + "xgetbv1": True, + "xsaves": True, + "clzero": True, + "xsaveerptr": True, + "wbnoinvd": True, + "arat": True, + "npt": True, + "nrip_save": True, + "umip": True, + "pku": True, + "vaes": True, + "vpclmulqdq": True, + "rdpid": True, + "fsrm": True, + "arch_capabilities": True, + }, + }, + "memory": { + "id": "memory", + "class": "memory", + "claimed": True, + "handle": "DMI:1000", + "description": "System Memory", + "physid": "1000", + "units": "bytes", + "size": 17179869184, + "configuration": {"errordetection": "multi-bit-ecc"}, + "capabilities": {"ecc": "Multi-bit error-correcting code (ECC)"}, + "children": [ + { + "id": "bank", + "class": "memory", + "claimed": True, + "handle": "DMI:1100", + "description": "DIMM RAM", + "vendor": "QEMU", + "physid": "0", + "slot": "DIMM 0", + "units": "bytes", + "size": 17179869184, + } + ], + }, +} + + @pytest.mark.asyncio async def test_system_usage_mock(aiohttp_client, mocker): """Test that the usage system endpoints response value. No auth needed""" - mocker.patch( - "cpuinfo.cpuinfo.get_cpu_info", - { - "arch_string_raw": "x86_64", - "vendor_id_raw": "AuthenticAMD", - }, - ) + mocker.patch("aleph.vm.orchestrator.machine.get_hardware_info", FAKE_SYSTEM_INFO) mocker.patch( "psutil.getloadavg", lambda: [1, 2, 3], @@ -70,6 +215,36 @@ async def test_system_usage_mock(aiohttp_client, mocker): assert resp["cpu"]["count"] == 200 +@pytest.mark.asyncio +async def test_system_capability_mock(aiohttp_client, mocker): + """Test that the capability system endpoints response value. No auth needed""" + mocker.patch("aleph.vm.orchestrator.machine.get_hardware_info", FAKE_SYSTEM_INFO) + mocker.patch( + "psutil.getloadavg", + lambda: [1, 2, 3], + ) + mocker.patch( + "psutil.cpu_count", + lambda: 200, + ) + app = setup_webapp() + client = await aiohttp_client(app) + response: web.Response = await client.get("/about/capability") + assert response.status == 200 + # check if it is valid json + resp = await response.json() + assert resp == { + "cpu": { + "architecture": "x86_64", + "vendor": "AuthenticAMD", + "model": "AMD EPYC 7763 64-Core Processor", + "frequency": "2000000000", + "count": "200", + }, + "memory": {"size": "17179869184", "units": "bytes", "type": "", "clock": None, "clock_units": ""}, + } + + @pytest.mark.asyncio async def test_allocation_invalid_auth_token(aiohttp_client): """Test that the allocation endpoint fails when an invalid auth token is provided."""