diff --git a/CMake/Findfstack.cmake b/CMake/Findfstack.cmake index cbf028b1..b90822bc 100644 --- a/CMake/Findfstack.cmake +++ b/CMake/Findfstack.cmake @@ -1,3 +1,5 @@ +find_package(PkgConfig REQUIRED) + pkg_check_modules(DPDK REQUIRED libdpdk) find_path(FSTACK_INCLUDE_DIRS NAMES ff_api.h PATHS /usr/local/include NO_DEFAULT_PATH) diff --git a/CMakeLists.txt b/CMakeLists.txt index 257a473e..054e5050 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -13,7 +13,6 @@ include(FetchContent) include(ProcessorCount) include(ExternalProject) include(CMake/build-from-src.cmake) -find_package(PkgConfig REQUIRED) # Options set(PHOTON_CXX_STANDARD "14" CACHE STRING "C++ standard") @@ -241,6 +240,7 @@ if (PHOTON_ENABLE_MIMIC_VDSO) target_compile_definitions(photon_obj PRIVATE ENABLE_MIMIC_VDSO=on) endif() if (PHOTON_ENABLE_FSTACK_DPDK) + target_compile_definitions(photon_obj PRIVATE ENABLE_FSTACK_DPDK) target_include_directories(photon_obj PRIVATE ${FSTACK_INCLUDE_DIRS}) endif() if (PHOTON_ENABLE_EXTFS) diff --git a/doc/blog/2023-07-29-photon-dpdk.md b/doc/blog/2023-07-29-photon-dpdk.md index 14a85552..bda09909 100644 --- a/doc/blog/2023-07-29-photon-dpdk.md +++ b/doc/blog/2023-07-29-photon-dpdk.md @@ -5,6 +5,9 @@ authors: [beef9999] tags: [DPDK, F-Stack] --- +import Tabs from '@theme/Tabs'; +import TabItem from '@theme/TabItem'; +   Since version 0.6, Photon can run on an userspace TCP/IP stack if enabled the `INIT_IO_FSTACK_DPDK` io engine.   [F-Stack](https://www.f-stack.org/) is an open-source project that has ported the entire **FreeBSD** @@ -22,12 +25,35 @@ to build a high performance net server. #### 1. 
Enable IOMMU -```shell -# Edit /etc/default/grub, expand GRUB_CMDLINE_LINUX with 'intel_iommu=on iommu=pt pci=realloc' +```mdx-code-block +<Tabs> +<TabItem value="CentOS" label="CentOS"> +``` + +```bash +# Edit /etc/default/grub +# Expand GRUB_CMDLINE_LINUX with 'intel_iommu=on iommu=pt pci=realloc' grub2-mkconfig -o /boot/grub2/grub.cfg reboot ``` +```mdx-code-block +</TabItem> +<TabItem value="Debian" label="Debian"> +``` + +```bash +# Edit /etc/default/grub +# Expand GRUB_CMDLINE_LINUX with 'intel_iommu=on iommu=pt' +grub-mkconfig -o /boot/grub/grub.cfg +reboot +``` + +```mdx-code-block +</TabItem> +</Tabs> +``` + Note the `pci=realloc` is a work-around solution for CentOS and RHEL. Without this, kernel would report `not enough MMIO resources for SR-IOV`, see this [issue](https://access.redhat.com/solutions/37376). @@ -39,13 +65,18 @@ echo 4 > /sys/class/net/eth0/device/sriov_numvfs ```   If you are having an Intel NIC, this step is likely to succeed. However, for the Mellanox one, -it might fail because of the lack of proper mlx driver in your kernel. -Then you would need to download the official driver from NVidia, and make a full install. +it might fail because your kernel lacks a proper mlx driver. +Check the result by running `lspci -nn | grep Ethernet` and verify that the expected number of virtual functions is listed. -  There are many available releases in https://network.nvidia.com/products/infiniband-drivers/linux/mlnx_ofed/, +  If it succeeded, skip ahead to the 'Install DPDK' section. + +  If it failed, you may need to download the official driver from NVIDIA. +There are many available releases in https://network.nvidia.com/products/infiniband-drivers/linux/mlnx_ofed/, you should choose one that matches to your kernel version and OS version the best. An improper version might lead to compiling error when building kernel modules later. -My test machine is CentOS 7 with kernel 5.x, so I downloaded MLNX_OFED_LINUX-5.4-3.6.8.1-rhel7.2-x86_64.tgz. 
+ +- For example, for CentOS 7 and kernel 5.x, you should choose MLNX_OFED_LINUX-5.4-3.6.8.1-rhel7.2-x86_64.tgz +- For Debian 10, you should choose MLNX_OFED_LINUX-5.8-5.1.1.2-debian10.13-x86_64.tgz #### 3. Install mlnx_ofed driver @@ -134,60 +165,92 @@ lspci -nn | grep 'Ethernet controller' ### Install DPDK -  The F-Stack version we choose is [1.22](https://github.com/F-Stack/f-stack/releases/tag/v1.22), -and it has explicitly required DPDK version to be [20.11](https://github.com/DPDK/dpdk/releases/tag/v20.11). +  The F-Stack version we choose is [1.22.1](https://github.com/F-Stack/f-stack/releases/tag/v1.22.1), +and it has a subdirectory called `dpdk` that contains the full DPDK 20.11 source code. +Let's install DPDK first. -  Install dependencies: +```mdx-code-block +<Tabs> +<TabItem value="CentOS" label="CentOS"> +``` -```shell +```bash +cd f-stack-1.22.1/dpdk/ yum install python3-pip -yum install numactl-devel zlib-devel ninja -pip3 install meson pyelftools +yum install pkg-config numactl-devel zlib-devel ninja +pip3 install meson +``` + +```mdx-code-block +</TabItem> +<TabItem value="Debian" label="Debian"> +``` + +```bash +cd f-stack-1.22.1/dpdk/ +pip3 install ninja meson +apt install pkg-config python3-pyelftools libnuma-dev +``` + +```mdx-code-block +</TabItem> +</Tabs> ``` -  Build and install: +Build and install: ```shell -cd dpdk-20.11 CONFIG_RTE_LIBRTE_MLX5_PMD=y meson -Denable_kmods=true -Dtests=false build cd build ninja ninja install ``` -  Run simple test: +Allocate 10GB of huge pages: ```shell -# Allocate 10GB huge-pages echo 5120 > /sys/kernel/mm/hugepages/hugepages-2048kB/nr_hugepages +``` + +Attach your PF (with main IP) and one of the VFs (idle) to the poll-mode-driver test: -# Attach your PF (with main IP) and one of the VFs (idle) to the poll-mode-driver test +```shell ./build/app/dpdk-testpmd -l 0-3 -n 4 -a 0000:03:00.0 -a 0000:03:00.2 -- --nb-cores=2 --flow-isolate-all -i -a ``` -  The `--flow-isolate-all` option is a MUST do. It enables Flow Bifurcation and ensures that all the +Note: The `--flow-isolate-all` option is a MUST do. 
It enables Flow Bifurcation and ensures that all the undetermined flow will be forwarded to the Linux kernel. Because the default behavior is to drop all packets, so unless you configure the flow table or enable the `--flow-isolate-all` option, your network connection will be lost again ... ### Install F-Stack +Now go back to the parent directory and install F-Stack. + #### Upgrade pkg-config   The `pkg-config` command in CentOS 7 is of version 0.27.1, and it has a [bug](https://bugs.freedesktop.org/show_bug.cgi?id=56699) that does not correctly handle gcc's `--whole-archive` option. As per F-Stack's document, we can upgrade it to [0.29.2](https://pkg-config.freedesktop.org/releases/pkg-config-0.29.2.tar.gz). +  Debian 10 does not need this upgrade. + #### Modify make scripts 1. Edit `lib/Makefile`, comment out `DEBUG=...`. We want a release build. 2. Edit `lib/Makefile`, enable `FF_FLOW_ISOLATE=1`. It is the trigger of Flow Bifurcation for TCP. The hardcoded TCP port is 80. -3. Edit `mk/kern.mk`, add `-Wno-error=format-overflow` to `CWARNFLAGS`, in case a compiler warning being regarded as error. +3. For CentOS 7, edit `mk/kern.mk`, add `-Wno-error=format-overflow` to `CWARNFLAGS`, +so that this compiler warning is not treated as an error. Debian 10 does not need this change. #### Build and install -```shell -export FF_PATH=/root/f-stack-1.22 # Change to your own dir +```mdx-code-block +<Tabs> +<TabItem value="CentOS" label="CentOS"> +``` + +```bash +export FF_PATH=/root/f-stack-1.22.1 # Change to your own dir export REGULAR_PKG_CONFIG_DIR=/usr/lib64/pkgconfig/ export DPDK_PKG_CONFIG_DIR=/usr/local/lib64/pkgconfig/ export PKG_CONFIG_PATH=$(pkg-config --variable=pc_path pkg-config):${REGULAR_PKG_CONFIG_DIR}:${DPDK_PKG_CONFIG_DIR} @@ -197,6 +260,25 @@ make -j make install ``` +```mdx-code-block +</TabItem> +<TabItem value="Debian" label="Debian"> +``` + +```bash +export FF_PATH=/root/f-stack-1.22.1 # Change to your own dir + +cd f-stack-1.22.1/lib +make -j +make install +``` + +```mdx-code-block +</TabItem> +</Tabs> +``` + + #### Configurations   F-Stack has a global config file at `/etc/f-stack.conf`. 
We need to make a few changes before running it. @@ -215,8 +297,11 @@ It looks quite alike the old echo server example, only a few lines of changes, b ```shell cd PhotonLibOS +git checkout release/0.8 cmake -B build -D PHOTON_BUILD_TESTING=1 -D PHOTON_ENABLE_FSTACK_DPDK=1 -D CMAKE_BUILD_TYPE=Release -cmake --build build -j -t fstack-dpdk-demo +cmake --build build -j 32 -t fstack-dpdk-demo ./build/output/fstack-dpdk-demo -``` \ No newline at end of file +``` + +Now you can set up an echo client on another host, and bench this server via port 80. diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt index 95aef37b..cf82f5d3 100644 --- a/examples/CMakeLists.txt +++ b/examples/CMakeLists.txt @@ -24,7 +24,7 @@ target_link_libraries(rpc-example-server PRIVATE photon_static) add_executable(sync-primitive sync-primitive/sync-primitive.cpp) target_link_libraries(sync-primitive PRIVATE photon_static) -if (ENABLE_FSTACK_DPDK) +if (PHOTON_ENABLE_FSTACK_DPDK) add_executable(fstack-dpdk-demo fstack-dpdk/fstack-dpdk-demo.cpp) - target_link_libraries(fstack-dpdk-demo PRIVATE fstack_dpdk photon_static) + target_link_libraries(fstack-dpdk-demo PRIVATE ${FSTACK_LIBRARIES} photon_static) endif () diff --git a/io/fstack-dpdk.cpp b/io/fstack-dpdk.cpp index 7e3f3d53..056669c8 100644 --- a/io/fstack-dpdk.cpp +++ b/io/fstack-dpdk.cpp @@ -18,6 +18,7 @@ limitations under the License. 
#include #include +#include #include #include @@ -78,7 +79,7 @@ class FstackDpdkEngine : public MasterEventEngine, public CascadingEventEngine, } int reset() override { - assert(false); + return -1; } ~FstackDpdkEngine() override { @@ -281,7 +282,7 @@ int fstack_bind(int sockfd, const struct sockaddr* addr, socklen_t addrlen) { } int fstack_accept(int sockfd, struct sockaddr* addr, socklen_t* addrlen, Timeout timeout) { - return DOIO_ONCE(ff_accept(sockfd, (linux_sockaddr*) addr, addrlen), + return photon::net::DOIO_ONCE(ff_accept(sockfd, (linux_sockaddr*) addr, addrlen), g_engine->wait_for_fd_readable(sockfd, timeout)); } @@ -294,22 +295,24 @@ int fstack_shutdown(int sockfd, int how) { } ssize_t fstack_send(int sockfd, const void* buf, size_t count, int flags, Timeout timeout) { - return DOIO_ONCE(ff_send(sockfd, buf, count, flags), + return photon::net::DOIO_ONCE(ff_send(sockfd, buf, count, flags), g_engine->wait_for_fd_writable(sockfd, timeout)); } ssize_t fstack_sendmsg(int sockfd, const struct msghdr* message, int flags, Timeout timeout) { - return DOIO_ONCE(ff_sendmsg(sockfd, message, flags), + return photon::net::DOIO_ONCE(ff_sendmsg(sockfd, message, flags), g_engine->wait_for_fd_writable(sockfd, timeout)); } ssize_t fstack_recv(int sockfd, void* buf, size_t count, int flags, Timeout timeout) { - return DOIO_ONCE(ff_recv(sockfd, buf, count, flags), - g_engine->wait_for_fd_writable(sockfd, timeout)); + // A receive can only make progress once the socket is readable, so wait for readability (as fstack_accept does), not writability. + return photon::net::DOIO_ONCE(ff_recv(sockfd, buf, count, flags), + g_engine->wait_for_fd_readable(sockfd, timeout)); } ssize_t fstack_recvmsg(int sockfd, struct msghdr* message, int flags, Timeout timeout) { - return DOIO_ONCE(ff_recvmsg(sockfd, message, flags), - g_engine->wait_for_fd_writable(sockfd, timeout)); + // Same as fstack_recv: the wait condition for a receive is readability, not writability. + return photon::net::DOIO_ONCE(ff_recvmsg(sockfd, message, flags), + g_engine->wait_for_fd_readable(sockfd, timeout)); } diff --git a/io/fstack-dpdk.h b/io/fstack-dpdk.h index 27fd1f62..d84c2fe5 100644 --- a/io/fstack-dpdk.h +++ b/io/fstack-dpdk.h @@ -19,6 +19,7 @@ limitations under the License. 
#include #include #include +#include namespace photon { diff --git a/net/kernel_socket.cpp b/net/kernel_socket.cpp index bf2709b4..5abec733 100644 --- a/net/kernel_socket.cpp +++ b/net/kernel_socket.cpp @@ -649,12 +649,6 @@ class FstackDpdkSocketStream : public KernelSocketStream { public: using KernelSocketStream::KernelSocketStream; - FstackDpdkSocketStream(int socket_family, bool nonblocking) { - fd = fstack_socket(socket_family, SOCK_STREAM, 0); - if (fd < 0) return; - setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, 1); - } - ~FstackDpdkSocketStream() override { if (fd < 0) return; fstack_shutdown(fd, (int) ShutdownHow::ReadWrite); @@ -693,7 +687,9 @@ class FstackDpdkSocketClient : public KernelSocketClient { using KernelSocketClient::KernelSocketClient; KernelSocketStream* create_stream(int socket_family) override { - return new FstackDpdkSocketStream(socket_family, true); + int fd = fstack_socket(socket_family, SOCK_STREAM, 0); + if (fd < 0) return nullptr; + return new FstackDpdkSocketStream(fd); } int fd_connect(int fd, const sockaddr* remote, socklen_t addrlen) override { diff --git a/thread/stack-allocator.cpp b/thread/stack-allocator.cpp index 869dc42b..ccc07d5a 100644 --- a/thread/stack-allocator.cpp +++ b/thread/stack-allocator.cpp @@ -84,7 +84,7 @@ class PooledStackAllocator { pool.pop_back(); return ret; } - return __alloc(slotsize); + return nullptr; } void put(void* ptr) { pool.emplace_back(ptr); } }; @@ -107,8 +107,11 @@ class PooledStackAllocator { } auto ptr = slots[idx].get(); // got from pool - in_pool_size -= slots[idx].slotsize; - return ptr; + if (ptr) { + in_pool_size -= slots[idx].slotsize; + return ptr; + } + return __alloc(slots[idx].slotsize); } int dealloc(void* ptr, size_t size) { auto idx = get_slot(size);