diff --git a/README.md b/README.md index e752edc..57e60eb 100644 --- a/README.md +++ b/README.md @@ -3,109 +3,21 @@ Requirements ------------- -- [Install k0sctl](https://github.com/k0sproject/k0sctl#installation) -- Access to the STFC cloud via an openstack account, with [setup environment variables](https://stfc-cloud-docs.readthedocs.io/en/latest/howto/CreateVMFromCommandLine.html#setting-up-the-environment-to-select-project) on the terminal of choice. -- Install conda (recommend mambaforge) for managing the k8s repo or install python-kubernetes, ansible, and all of the kubernetes management software (kubernetes-client, kuberentes-server, etc) into your system/distro. +- Access to the STFC cloud via an openstack account Optional (Recommended for debugging and evaluating the cluster) ---------------------------------------------------------------- -- [Install Cilium CLI (networking)](https://docs.cilium.io/en/stable/gettingstarted/k8s-install-default/#install-the-cilium-cli) -- [Install hubble Client (networking webapp)](https://docs.cilium.io/en/v1.10/gettingstarted/hubble_setup/#install-the-hubble-client) - [K9s (K8s management in a terminal)](https://k9scli.io/topics/install/) - -Conda env setup ---------------- - -To create a conda environment that can sustain development of this repository (excluding k0sctl and setup of k0s itself) you can run the following command, whilst in the repository: - -```shell -conda env create -f k8s-conda-env.yml -``` - -Recommended installs ---------------------- - -After installing the conda env, activate it and install this plugin to helm. - +- Helm diff plugin: ```shell helm plugin install https://github.com/databus23/helm-diff ``` -Cloud setup and deploy k0s via k0sctl in terraform (both prod and staging) --------------------------------------------------------------------------- - -You can achieve this by using terraform (included in the conda environment) from inside the terraform directory. 
`terraform apply` can fail due to cloud instability, if it does, just run it again. You need to have setup the environment variables for openstack listed in the requirements at the top of this file. - -Ensure that the `terraform/main.tf` file uses your FedId for the SSH Key name, and the key is up to date with yours. - -```shell -terraform init -terraform apply -``` - -Setup an ssh-agent for connecting to the cluster with k0sctl. Example: - -```shell -eval "$(ssh-agent -c)" -ssh-add ~/.ssh/id_rsa -``` - -Use terraform to output the ansible inventory into your ansible directory - -```shell -terraform output -raw ansible_inventory > ../ansible/inventory.ini -terraform output -raw ansible_inventory_staging > ../ansible/inventory-staging.ini -terraform output -raw ansible_inventory_ci_cd > ../ansible/inventory-ci-cd.ini -``` - -Use terraform to output the haproxy.cfg - -```shell -terraform output -raw haproxy_config > ../ansible/haproxy.cfg -``` - -Use ansible to activate the firewall and create the load balancer required for the k0s cluster (It is recommended to run these repeatedly until they execute with no errors): - -```shell -cd ../ansible; ansible-playbook setup-nodes.yml -i inventory.ini --ask-vault-password; cd ../terraform -cd ../ansible; ansible-playbook setup-nodes-staging.yml -i inventory-staging.ini --ask-vault-password; cd ../terraform -cd ../ansible; ansible-playbook setup-ci-cd-nodes.yml -i inventory-ci-cd.ini; cd ../terraform -``` - -Use terraform to output the data and then apply that to construct the k0s cluster (if this fails you didn't add an ssh-agent). 
- -```shell -terraform output -raw k0s_cluster | k0sctl apply --no-wait --config - -terraform output -raw k0s_cluster_staging | k0sctl apply --no-wait --config - -terraform output -raw k0s_cluster_ci_cd | k0sctl apply --no-wait --config - -``` - -Export the kubeconfig to the top of the repository, whilst in the terraform directory - -```shell -terraform output -raw k0s_cluster | k0sctl kubeconfig --config - > ../kubeconfig -terraform output -raw k0s_cluster_staging | k0sctl kubeconfig --config - > ../kubeconfig-staging -terraform output -raw k0s_cluster_ci_cd | k0sctl kubeconfig --config - > ../kubeconfig-ci-cd -``` - -Export KUBECONFIG as an environment variable so that ansible can pick it up - -```shell -export KUBECONFIG=/path/to/repository/kubeconfig -``` - -Run the playbook for deploying K8s tools such as Traefik, Cilium, Longhorn, Prometheus, Promtail etc. (This will just deploy to prod, you need to change KUBECONFIG env var to the staging kubeconfig (kubeconfig-staging) file to do staging) - -```shell -cd ../ansible; ansible-playbook deploy-k8s-networking.yml --ask-vault-password ; cd ../terraform -cd ../ansible; ansible-playbook deploy-k8s-networking.yml -i inventory-staging.ini --ask-vault-password; cd ../terraform -``` - Setup ArgoCD ------------ -This section assumes that you have the context setup appropriately in the Kubeconfigs +This section assumes that you have the context setup appropriately in the Kubeconfigs and you are currently managing the management cluster Install ArgoCD: ```shell @@ -131,29 +43,6 @@ Change the password using the user settings to the one in Keeper so everyone who Go follow the GitOps repo [README](https://github.com/interactivereduction/gitops) now. -Gotchas -------- - -- `terraform apply` struggles with creating all the openstack VMs, this happens when doing it manually and is not related to terraform, it is due to cloud instability as far as we can tell. 
-- The Kafka dependent containers, such as `JobController` and `RunDetection` may need to be manually restarted, if the Liveness and Health probes are not setup correctly. - -Updating different parts of the cluster ---------------------------------------- - -In order to update the container versions on the cluster, for each of the containers that we produce, the following commands will be useful: - -RunDetection: - -```shell -kubectl set image -n ir deployment/rundetection rundetection=ghcr.io/interactivereduction/rundetection@sha256:commit-number -``` - -JobController: - -```shell -kubectl set image -n ir deployment/jobcontroller jobcontroller=ghcr.io/interactivereduction/jobcontroller@sha256:commit-number -``` - Developing using a local cluster -------------------------------- @@ -181,22 +70,3 @@ ansible-playbook deploy-dev-k8s-services.yml That's it! you have Interactive reduction running on a local Kubernetes cluster! -Creating a kafka producer for connecting to the cluster and sending things to a topic -------------------------------------------------------------------------------------- - -With the aim of sending data directly to the kafka topics with a producer in your terminal, first of all you will need the kubeconfig variable set and secondly you will need kafka installed from: - -Then you will need some variables set by running these commands, the only caveat is that k0s-app-worker-5 may not have any of the kafka brokers on it, so you need to ensure that it is pointing at a node that has one, you can do this by checking k9s or looking at where the kafka pods are running using kubectl: - -```shell -set KAFKA_NODE_PORT $(kubectl get service -n kafka kafka-cluster-kafka-external-bootstrap -o=jsonpath='{.spec.ports[0].nodePort}{"\n"}') -set KAFKA_NODE_IP $(kubectl get node k0s-app-worker-5 -o=jsonpath='{range .status.addresses[*]}{.type}{"\t"}{.address}{"\n"}') -``` - -Then you can actually connect to the kafka topic as a producer: - -```shell 
-~/kafka/bin/kafka-console-producer.sh --bootstrap-server $KAFKA_NODE_IP:$KAFKA_NODE_PORT --topic detected-runs
-```
-
-Once connected anything you type and then hit enter on will be sent to the kafka topic as a message!
diff --git a/ansible/roles/isis-archive/meta/main.yml b/ansible/roles/isis-archive/meta/main.yml
new file mode 100644
index 0000000..85c996d
--- /dev/null
+++ b/ansible/roles/isis-archive/meta/main.yml
@@ -0,0 +1,14 @@
+---
+galaxy_info:
+  author: Pasarus
+  description: Mount the archive on the nodes
+  license: GPLv3
+
+  min_ansible_version: "5.0"
+
+  platforms:
+    - name: Ubuntu
+      versions:
+        - focal
+
+dependencies: []
\ No newline at end of file
diff --git a/ansible/roles/isis-archive/tasks/main.yml b/ansible/roles/isis-archive/tasks/main.yml
new file mode 100644
index 0000000..30e3027
--- /dev/null
+++ b/ansible/roles/isis-archive/tasks/main.yml
@@ -0,0 +1,64 @@
+---
+- name: Mount the archive
+  become: true
+  block:
+    - name: Install cifs-utils and keyutils
+      apt:
+        update_cache: true
+        name: "{{item}}"
+        state: present
+      with_items:
+        - cifs-utils
+        - keyutils
+      when: "'workers' in group_names or inventory_hostname == 'localhost'"
+
+    - name: Create mountpoint
+      ansible.builtin.file:
+        path: /archive
+        state: directory
+        mode: "u=rwx,g=rx,o=rx"
+      when: "'workers' in group_names or inventory_hostname == 'localhost'"
+
+    - name: Ensure credentials file is present
+      ansible.builtin.lineinfile:
+        path: /archive.creds
+        search_string: "{{item}}"
+        line: "{{item}}"
+        create: true
+        mode: "u=rwx,g=,o="
+        state: present
+      with_items:
+        - username={{ isis_archive.username }}
+        - password={{ isis_archive.password }}
+        - domain={{ isis_archive.domain }}
+      when: "'workers' in group_names or inventory_hostname == 'localhost'"
+
+    - name: Ensure DNS config is setup for the archive
+      ansible.builtin.blockinfile:
+        path: /etc/systemd/resolved.conf
+        state: present
+        create: true
+        mode: "u=rw,g=r,o=r"
+        block: |
+          [Resolve]
+          Domains=isis.cclrc.ac.uk
+      when: "'workers' in group_names or inventory_hostname == 'localhost'"
+
+    - name: Restart resolved to ensure DNS configuration is up to date
+      ansible.builtin.service:
+        name: systemd-resolved.service
+        state: restarted
+      when: "'workers' in group_names or inventory_hostname == 'localhost'"
+
+    - name: Ensure mount is configured
+      ansible.builtin.mount:
+        path: /archive
+        src: "//isis.cclrc.ac.uk/inst$/"
+        fstype: cifs
+        opts: "noserverino,vers=2.1,credentials=/archive.creds,_netdev"
+        state: mounted
+      retries: 100
+      delay: 1
+      register: result
+      until: result.failed == false
+      when: "'workers' in group_names or inventory_hostname == 'localhost'"
\ No newline at end of file
diff --git a/ansible/roles/isis-ceph/meta/main.yml b/ansible/roles/isis-ceph/meta/main.yml
new file mode 100644
index 0000000..ca9778c
--- /dev/null
+++ b/ansible/roles/isis-ceph/meta/main.yml
@@ -0,0 +1,14 @@
+---
+galaxy_info:
+  author: Pasarus
+  description: Mount ceph on the nodes
+  license: GPLv3
+
+  min_ansible_version: "5.0"
+
+  platforms:
+    - name: Ubuntu
+      versions:
+        - focal
+
+dependencies: []
\ No newline at end of file
diff --git a/ansible/roles/isis-ceph/tasks/main.yml b/ansible/roles/isis-ceph/tasks/main.yml
new file mode 100644
index 0000000..2abae74
--- /dev/null
+++ b/ansible/roles/isis-ceph/tasks/main.yml
@@ -0,0 +1,55 @@
+---
+- name: Mount ceph
+  become: true
+  block:
+    - name: Create autoreduce group
+      group:
+        name: autoreduce
+        gid: "{{ isis_ceph.uid }}"
+        state: present
+      when: "'workers' in group_names or inventory_hostname == 'localhost'"
+
+    - name: Create autoreduce user
+      user:
+        name: autoreduce
+        shell: /bin/bash
+        state: present
+        uid: "{{ isis_ceph.uid }}"
+        group: autoreduce
+      when: "'workers' in group_names or inventory_hostname == 'localhost'"
+
+    - name: Ensure dependencies are installed
+      apt:
+        update_cache: true
+        name: ceph
+        state: present
+      when: "'workers' in group_names or inventory_hostname == 'localhost'"
+
+    - name: Create directory
+      ansible.builtin.file:
+        path: /ceph
+        state: directory
+        mode: "u=rwx,g=rx,o=rx"
+      when: "'workers' in group_names or inventory_hostname == 'localhost'"
+
+    - name: Ensure ceph secret key is present
+      ansible.builtin.copy:
+        dest: /etc/ceph/deneb.key
+        content: "{{ isis_ceph.key }}"
+        owner: root
+        group: root
+        mode: "u=r,g=,o="
+      when: "'workers' in group_names or inventory_hostname == 'localhost'"
+
+    - name: Mount CEPH
+      ansible.builtin.mount:
+        path: /ceph
+        src: deneb-mon1.nubes.rl.ac.uk,deneb-mon2.nubes.rl.ac.uk,deneb-mon3.nubes.rl.ac.uk:/isis/instrument
+        fstype: ceph
+        opts: "name=isis_autoreduce,secretfile=/etc/ceph/deneb.key,noatime,_netdev"
+        state: mounted
+      retries: 100
+      delay: 1
+      register: result
+      until: result.failed == false
+      when: "'workers' in group_names or inventory_hostname == 'localhost'"
\ No newline at end of file
diff --git a/ansible/setup-local-machine.yml b/ansible/setup-local-machine.yml
new file mode 100644
index 0000000..67bf2dc
--- /dev/null
+++ b/ansible/setup-local-machine.yml
@@ -0,0 +1,15 @@
+---
+- name: Setup the local machine
+  hosts: localhost
+  vars_files:
+    - vars/vault.yml
+  roles:
+    - role: isis-archive
+    - role: isis-ceph
+
+  tasks:
+    - name: Set permissions of /archive directory which is suitable for minikube to access it
+      become: true
+      file:
+        path: /archive
+        mode: '0777'