Add CPU information (model, vendor, frequency) and memory details (clock, size, type) to API #544

Open · wants to merge 18 commits into main
2 changes: 1 addition & 1 deletion .github/workflows/test-using-pytest.yml
@@ -18,7 +18,7 @@ jobs:
run: |
sudo apt-get update
sudo apt-get -y upgrade
-sudo apt-get install -y python3 python3-pip python3-aiohttp python3-msgpack python3-aiodns python3-alembic python3-sqlalchemy python3-setproctitle redis python3-aioredis python3-psutil sudo acl curl systemd-container squashfs-tools debootstrap python3-packaging python3-cpuinfo python3-nftables python3-jsonschema nftables
+sudo apt-get install -y python3 python3-pip python3-aiohttp python3-msgpack python3-aiodns python3-alembic python3-sqlalchemy python3-setproctitle redis python3-aioredis python3-psutil sudo acl curl systemd-container squashfs-tools debootstrap python3-packaging python3-nftables python3-jsonschema nftables lshw python3-jwcrypto
pip install --upgrade typing-extensions types-PyYAML

- name: Install required Python packages
2 changes: 1 addition & 1 deletion docker/vm_supervisor-dev.dockerfile
@@ -5,7 +5,7 @@ FROM debian:bullseye
RUN apt-get update && apt-get -y upgrade && apt-get install -y \
sudo acl curl squashfs-tools git \
python3 python3-aiohttp python3-alembic python3-msgpack python3-pip python3-aiodns python3-aioredis\
-python3-nftables python3-psutil python3-setproctitle python3-sqlalchemy python3-packaging python3-cpuinfo ndppd nftables \
+python3-nftables python3-psutil python3-setproctitle python3-sqlalchemy python3-packaging ndppd nftables \
&& rm -rf /var/lib/apt/lists/*

RUN useradd jailman
2 changes: 1 addition & 1 deletion packaging/aleph-vm/DEBIAN/control
@@ -3,6 +3,6 @@ Version: 0.1.8
Architecture: all
Maintainer: Aleph.im
Description: Aleph.im VM execution engine
-Depends: python3,python3-pip,python3-aiohttp,python3-msgpack,python3-aiodns,python3-alembic,python3-sqlalchemy,python3-setproctitle,redis,python3-aioredis,python3-psutil,sudo,acl,curl,systemd-container,squashfs-tools,debootstrap,python3-packaging,python3-cpuinfo,python3-nftables,python3-jsonschema,cloud-image-utils,ndppd,python3-yaml,python3-dotenv,python3-schedule,qemu-system-x86,qemu-utils,python3-systemd,python3-dbus,btrfs-progs,nftables,python3-jwcrypto
+Depends: python3,python3-pip,python3-aiohttp,python3-msgpack,python3-aiodns,python3-alembic,python3-sqlalchemy,python3-setproctitle,redis,python3-aioredis,python3-psutil,sudo,acl,curl,systemd-container,squashfs-tools,debootstrap,python3-packaging,python3-nftables,python3-jsonschema,cloud-image-utils,ndppd,python3-yaml,python3-dotenv,python3-schedule,qemu-system-x86,qemu-utils,python3-systemd,python3-dbus,btrfs-progs,nftables,lshw,python3-jwcrypto
Section: aleph-im
Priority: Extra
1 change: 0 additions & 1 deletion pyproject.toml
@@ -34,7 +34,6 @@ dependencies = [
"sentry-sdk==1.31.0",
"aioredis==1.3.1",
"psutil==5.9.5",
"py-cpuinfo==9.0.0",
"schedule==1.2.1",
"nftables @ git+https://salsa.debian.org/pkg-netfilter-team/pkg-nftables#egg=nftables&subdirectory=py",
"msgpack==1.0.7",
74 changes: 74 additions & 0 deletions src/aleph/vm/orchestrator/machine.py
@@ -0,0 +1,74 @@
import asyncio
import json
import re

import psutil


async def get_hardware_info():
    # lshw dumps the full hardware tree as JSON; -sanitize strips sensitive fields such as serial numbers.
    lshw = await asyncio.create_subprocess_shell(
        "lshw -sanitize -json", stdout=asyncio.subprocess.PIPE, stderr=asyncio.subprocess.PIPE
    )

    output, _ = await lshw.communicate()
    data = json.loads(output)

    hw_info = {"cpu": None, "memory": None}

    # The first child of the root node (typically "core") holds the CPU and memory nodes.
    for hw in data["children"][0]["children"]:
        if hw["id"] == "cpu":
            hw_info["cpu"] = hw
        elif hw["class"] == "memory" and hw["id"] == "memory":
            hw_info["memory"] = hw

    return hw_info


def get_cpu_info(hw):
    cpu_info = hw["cpu"]
    # lshw reports the register width (32/64); refine it from the capability flags when possible.
    architecture = cpu_info["width"]

    if "x86_64" in cpu_info["capabilities"] or "x86-64" in cpu_info["capabilities"]:
        architecture = "x86_64"
    elif "arm64" in cpu_info["capabilities"] or "arm-64" in cpu_info["capabilities"]:
        architecture = "arm64"
Member: Is this really the best approach to get the architecture? It seems odd to look for it in the CPU capabilities. What if a 32-bit system is running on an x86-64-capable processor?

Member (Author): Here we want to know the architecture supported by the processor, not by the system. A 64-bit processor can run 32-bit systems, but not vice versa.

    vendor = cpu_info["vendor"]
    # lshw vendor implementation => https://github.com/lyonel/lshw/blob/15e4ca64647ad119b69be63274e5de2696d3934f/src/core/cpuinfo.cc#L308

    if "Intel Corp" in vendor:
        vendor = "GenuineIntel"
    elif "Advanced Micro Devices [AMD]" in vendor:
        vendor = "AuthenticAMD"

    return {
        "architecture": architecture,
        "vendor": vendor,
        "model": cpu_info["product"],
        "frequency": cpu_info["capacity"],
        "count": psutil.cpu_count(),
    }
Member: Why return a dict and not instances of the Properties / Capabilities classes defined below in the diff?


def get_memory_info(hw):
    mem_info = hw["memory"]

    # Scan the memory banks for a clock frequency and a DDR generation in the description.
    memory_type = ""
    memory_clock = ""
    for bank in mem_info["children"]:
        memory_clock = bank.get("clock")
        if "description" in bank:
            matched = re.search("(DDR[2-6])", bank["description"])
            if matched:
                memory_type = matched.group(0)
                break

    return {
        "size": mem_info["size"],
        "units": mem_info["units"],
        "type": memory_type,
        "clock": memory_clock,
        "clock_units": "Hz" if memory_clock is not None else "",
    }
Member: Why not return an instance of MemoryProperties?
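For illustration, a minimal sketch of what both comments suggest; get_cpu_properties and get_memory_properties are hypothetical names, and importing the models from resources.py as written would create a circular import with the current layout, so they would likely have to move to a shared module first:

from aleph.vm.orchestrator.resources import ExtendedCpuProperties, MemoryProperties


def get_cpu_properties(hw) -> ExtendedCpuProperties:
    info = get_cpu_info(hw)
    return ExtendedCpuProperties(
        architecture=info["architecture"],
        vendor=info["vendor"],
        model=info["model"],
        frequency=info["frequency"],
        count=info["count"],
    )


def get_memory_properties(hw) -> MemoryProperties:
    # The keys returned by get_memory_info() match the MemoryProperties fields.
    return MemoryProperties(**get_memory_info(hw))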

78 changes: 71 additions & 7 deletions src/aleph/vm/orchestrator/resources.py
@@ -1,17 +1,20 @@
import math
from datetime import datetime, timezone
from functools import lru_cache
from typing import Optional

-import cpuinfo
import psutil
from aiohttp import web
from aleph_message.models import ItemHash
from aleph_message.models.execution.environment import CpuProperties
from pydantic import BaseModel, Field

from aleph.vm.conf import settings
-from aleph.vm.utils import cors_allow_all
+from aleph.vm.orchestrator.machine import (
+    get_cpu_info,
+    get_hardware_info,
+    get_memory_info,
+)
+from aleph.vm.utils import async_cache, cors_allow_all


class Period(BaseModel):
@@ -77,16 +80,69 @@ class MachineUsage(BaseModel):
    active: bool = True


-@lru_cache
-def get_machine_properties() -> MachineProperties:
class ExtendedCpuProperties(CpuProperties):
    """CPU properties."""

    model: Optional[str] = Field(default=None, description="CPU model")
    frequency: Optional[str] = Field(default=None, description="CPU frequency")
    count: Optional[str] = Field(default=None, description="CPU count")


class MemoryProperties(BaseModel):
    """MEMORY properties."""

    size: Optional[str] = Field(default=None, description="Memory size")
    units: Optional[str] = Field(default=None, description="Memory size units")
    type: Optional[str] = Field(default=None, description="Memory type")
    clock: Optional[str] = Field(default=None, description="Memory clock")
    clock_units: Optional[str] = Field(default=None, description="Memory clock units")


class MachineCapability(BaseModel):
    cpu: ExtendedCpuProperties
    memory: MemoryProperties


machine_properties_cached = None


@async_cache
async def get_machine_properties() -> MachineProperties:
    """Fetch machine properties such as architecture, CPU vendor, ...
    These should not change while the supervisor is running.

    In the future, some properties may have to be fetched from within a VM.
    """
-    cpu_info = cpuinfo.get_cpu_info()  # Slow
+    hw = await get_hardware_info()
+    cpu_info = get_cpu_info(hw)
    return MachineProperties(
        cpu=CpuProperties(
-            architecture=cpu_info.get("raw_arch_string", cpu_info.get("arch_string_raw")),
-            vendor=cpu_info.get("vendor_id", cpu_info.get("vendor_id_raw")),
+            architecture=cpu_info["architecture"],
+            vendor=cpu_info["vendor"],
        ),
    )


@async_cache
async def get_machine_capability() -> MachineCapability:
    hw = await get_hardware_info()
    cpu_info = get_cpu_info(hw)
    mem_info = get_memory_info(hw)

    return MachineCapability(
        cpu=ExtendedCpuProperties(
            architecture=cpu_info["architecture"],
            vendor=cpu_info["vendor"],
            model=cpu_info["model"],
            frequency=cpu_info["frequency"],
            count=cpu_info["count"],
        ),
        memory=MemoryProperties(
            size=mem_info["size"],
            units=mem_info["units"],
            type=mem_info["type"],
            clock=mem_info["clock"],
            clock_units=mem_info["clock_units"],
        ),
@@ -98,6 +154,7 @@ async def about_system_usage(_: web.Request):
"""Public endpoint to expose information about the system usage."""
period_start = datetime.now(timezone.utc).replace(second=0, microsecond=0)

machine_properties = await get_machine_properties()
usage: MachineUsage = MachineUsage(
cpu=CpuUsage(
count=psutil.cpu_count(),
@@ -116,12 +173,19 @@ async def about_system_usage(_: web.Request):
            start_timestamp=period_start,
            duration_seconds=60,
        ),
-        properties=get_machine_properties(),
+        properties=machine_properties,
    )

    return web.json_response(text=usage.json(exclude_none=True))


async def about_capability(_: web.Request):
    """Public endpoint to expose information about the CRN capability."""

    capability: MachineCapability = await get_machine_capability()
    return web.json_response(text=capability.json(exclude_none=False))


class Allocation(BaseModel):
"""An allocation is the set of resources that are currently allocated on this orchestrator.
It contains the item_hashes of all persistent VMs, instances, on-demand VMs and jobs.
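To give a sense of what the new models serialize to, here is an illustrative construction; every value below is made up, the real ones come from lshw and psutil:

capability = MachineCapability(
    cpu=ExtendedCpuProperties(
        architecture="x86_64",
        vendor="GenuineIntel",
        model="Intel(R) Core(TM) i7-9700K CPU @ 3.60GHz",
        frequency="4900000000",
        count="8",
    ),
    memory=MemoryProperties(
        size="68719476736",
        units="bytes",
        type="DDR4",
        clock="3200000000",
        clock_units="Hz",
    ),
)
print(capability.json(exclude_none=False))  # the payload served by /about/capability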
3 changes: 2 additions & 1 deletion src/aleph/vm/orchestrator/supervisor.py
@@ -21,7 +21,7 @@
from aleph.vm.version import __version__

from .metrics import create_tables, setup_engine
-from .resources import about_system_usage
+from .resources import about_capability, about_system_usage
from .tasks import (
    start_payment_monitoring_task,
    start_watch_for_messages_task,
@@ -95,6 +95,7 @@ def setup_webapp():
web.get("/about/executions/details", about_executions),
web.get("/about/executions/records", about_execution_records),
web.get("/about/usage/system", about_system_usage),
web.get("/about/capability", about_capability),
web.get("/about/config", about_config),
# /control APIs are used to control the VMs and access their logs
web.post("/control/allocation/notify", notify_allocation),
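A quick way to exercise the new route once the supervisor is running; the host and port below are assumptions, use whatever address the supervisor actually listens on:

import asyncio

import aiohttp


async def main():
    async with aiohttp.ClientSession() as session:
        async with session.get("http://localhost:4020/about/capability") as resp:
            capability = await resp.json()
            print(capability["cpu"]["vendor"], capability["memory"]["type"])


asyncio.run(main())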
15 changes: 15 additions & 0 deletions src/aleph/vm/utils.py
@@ -1,5 +1,6 @@
import asyncio
import dataclasses
+import functools
import hashlib
import json
import logging
@@ -211,3 +212,17 @@ def file_hashes_differ(source: Path, destination: Path, checksum: Callable[[Path
        return True

    return checksum(source) != checksum(destination)


def async_cache(fn):
    """Simple async function cache decorator."""
    cache = {}

    @functools.wraps(fn)
    async def wrapper(*args, **kwargs):
        key = (args, frozenset(kwargs.items()))
        if key not in cache:
            cache[key] = await fn(*args, **kwargs)
        return cache[key]

    return wrapper
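A small usage sketch of the decorator above (function and argument names are illustrative). Note that all arguments must be hashable and the cache never expires for the lifetime of the process, which is fine for hardware properties that do not change while the supervisor runs:

import asyncio


@async_cache
async def slow_lookup(key: str) -> str:
    await asyncio.sleep(1)  # stand-in for an expensive call such as running lshw
    return f"value for {key}"


async def main():
    first = await slow_lookup("cpu")   # executes the body and caches the result
    second = await slow_lookup("cpu")  # served from the cache, no second sleep
    assert first == second


asyncio.run(main())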