diff --git a/.bumpversion.cfg b/.bumpversion.cfg index 9ae63ecc9..6331c36cc 100644 --- a/.bumpversion.cfg +++ b/.bumpversion.cfg @@ -1,5 +1,5 @@ [bumpversion] -current_version = 6.6.2 +current_version = 6.7.0 commit = True tag = False parse = (?P\d+)\.(?P\d+)\.(?P[A-z0-9-]+) diff --git a/.github/workflows/test.yaml b/.github/workflows/test.yaml index 1061ddc50..54176c391 100644 --- a/.github/workflows/test.yaml +++ b/.github/workflows/test.yaml @@ -16,13 +16,13 @@ jobs: run: shell: bash -eo pipefail -l {0} steps: - - uses: actions/checkout@v3 + - uses: actions/checkout@v4 - - uses: actions/setup-python@v4 + - uses: actions/setup-python@v5 with: python-version: "3.10" - - uses: actions/setup-java@v3 + - uses: actions/setup-java@v4 with: distribution: "temurin" # See 'Supported distributions' for available options java-version: "17" diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 545ead223..3432209bb 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -19,6 +19,24 @@ repos: - id: markdownlint args: ["--config", ".markdownlint.json"] + - repo: https://github.com/populationgenomics/pre-commits + rev: "v0.1.3" + hooks: + - id: cpg-id-checker + args: ["--extra-pattern", 'TOB\d+'] + exclude: >- + (?x)^( + test/test_generic_auditor\.py| + models/utils/sequencing_group_id_format\.py| + metamist/audit/README\.md + )$ + + - repo: https://github.com/pycqa/isort + rev: 5.13.2 + hooks: + - id: isort + name: isort (python) + - repo: https://github.com/ambv/black rev: 23.12.1 hooks: diff --git a/.pylintrc b/.pylintrc index 773475a32..6ccbd8d1c 100644 --- a/.pylintrc +++ b/.pylintrc @@ -23,7 +23,7 @@ disable=f-string-without-interpolation,inherit-non-class,too-few-public-methods, fixme,logging-fstring-interpolation,import-error,missing-module-docstring,line-too-long, too-many-return-statements,no-name-in-module,R0801,consider-using-set-comprehension, - arguments-differ,unspecified-encoding,invalid-name,logging-not-lazy,I1101 + arguments-differ,unspecified-encoding,invalid-name,logging-not-lazy,I1101,wrong-import-order # Overriding variable name patterns to allow short 1- or 2-letter variables attr-rgx=[a-z_][a-z0-9_]{0,30}$ diff --git a/README.md b/README.md index ec3150ae4..5916d3282 100644 --- a/README.md +++ b/README.md @@ -1,453 +1,62 @@ -# Sample Metadata +# Metamist [![codecov](https://codecov.io/gh/populationgenomics/metamist/branch/dev/graph/badge.svg?token=OI3XZYR9HK)](https://codecov.io/gh/populationgenomics/metamist) -Metamist is database that stores **de-identified** -omics metadata. -There are three components to the metamist system: +## Introduction -- System-versioned MariaDB database, -- Python web API to manage permissions, and store frequently used queries, - - Including a GraphQL API for querying the database -- An installable python library that wraps the Python web API (using OpenAPI generator) +**Metamist** is a database designed for storing **de-identified** -omics metadata. -Every resource in metamist belongs to a project. All resources are access -controlled through membership of the google groups: -`$dataset-sample-metadata-main-{read,write}`. Note that members of google-groups -are cached in a blob as group-membership identity lookups are slow. +## Purpose -## API +The project provides an interface to interact with the -omics database via the Python client as well as the GraphQL + HTTP APIs. -There are two ways to query metamist in Python: +## Features -1. Use the REST interface with the predefined requests -2. Use the GraphQL interface. 
+- **Project-Based Resource Organization**: Every resource in Metamist is associated with a specific project. +- **Access Control**: Access to resources is controlled through membership in specific Google Groups: + - `dataset-sample-metadata-main-read`: For read-only access. + - `dataset-sample-metadata-main-write`: For write access. +- **Efficiency Note**: Members of Google Groups are cached in a blob to optimize performance, as group-membership identity lookups can be slow. -To use the GraphQL interface in Python with the `sample_metadata` library, you can do the following: +## High-Level Architecture -```python -from sample_metadata.graphql import query +It comprises three key components: -_query = """ -query YourQueryNameHere($sampleId: String!) { - sample(id: $sampleId) { - id - externalId - } -} -""" +1. **System-Versioned MariaDB Database**: A robust database system for managing -omics metadata. -print(query(_query, {"sampleId": "CPG18"})) -``` +2. **Python Web API**: This component is responsible for: + - Managing permissions. + - Storing frequently used queries. + - Providing a GraphQL/HTTP API for efficient querying of the database. -## Structure +3. **Installable Python Library**: Wraps the Python Web API using the OpenAPI generator, facilitating easier interaction with the system. -![Database structure](resources/2021-10-27_db-diagram.png) +### Schema -### Sample IDs +As of Jan 15, 2024 this schema should reflect the data structure on the tables: -In an effort to reduce our dependency on potentially mutable external sample IDs with inconsistent format, -the metamist server generates an internal sample id for every sample. Internally they're an -incrementing integer, but they're transformed externally to have a prefix, and checksum - this allows durability -when transcribing sample IDs to reduce mistypes, and allow to quickly check whether a sample ID is valid. +![Database Structure](resources/2024-01-15_db-diagram.png) -> NB: The prefix and checksums are modified per environment (production, development, local) to avoid duplicates from these environments. +You can also find this at [DbDiagram](https://dbdiagram.io/d/Metamist-Schema-v6-6-2-65a48ac7ac844320aee60d16). -For example, let's consider the production environment which uses the prefix of `CPG` and a checksum offset of 0: +The codebase contains the following modules worth noting: -> A sample is given the internal ID `12345`, we calculate the Luhn checksum to be `5` (with no offset applied). -> We can then concatenate the results, for the final sample ID to be `CPG123455`. +- `models` -> General data models + enums +- `db/python/tables` -> Interaction with MariaDB / BigQuery +- `db/python/layers` -> Logic +- `api/graphql` : GraphQL +- `api/routes`: HTTP + OpenAPI -### Reporting sex +And metamist maintains two clients: -To avoid ambiguity in reporting of gender, sex and karyotype - the sample metadata system -stores these values separately on the `participant` as: +- `web` -> React app that consumes a generated Typescript API + GraphQL +- `metamist` -> autogenerated Python API -- `reported_gender` (string, expected `male` | `female` | _other values_) -- `reported_sex` (follows pedigree convention: `unknown=0 | null`, `male=1`, `female=2`) -- `inferred_karyotype` (string, eg: `XX` | `XY` | _other karyotypes_) +## Installation and Running Locally -If you import a pedigree, the sex value is written to the `reported_sex` attribute. 
+- [Installation and developer setup](docs/installation.md) -## Local develompent of SM +## License -The recommended way to develop the metamist system is to run a local copy of SM. - -> There have been some reported issues of running a local SM environment on an M1 mac. - -You can run MariaDB with a locally installed docker, or from within a docker container. -You can configure the MariaDB connection with environment variables. - -### Creating the environment - -Python dependencies for the `metamist` API package are listed in `setup.py`. -Additional dev requirements are listed in `requirements-dev.txt`, and packages for -the sever-side code are listed in `requirements.txt`. - -We _STRONGLY_ encourage the use of `pyenv` for managing Python versions. -Debugging and the server will run on a minimum python version of 3.10. - -To setup the python environment, you can run: - -```shell -virtualenv venv -source venv/bin/activate -pip install -r requirements.txt -pip install -r requirements-dev.txt -pip install --editable . -``` - -### Extra software - -You'll need to install the following software to develop metamist: - -- Node / NPM (recommend using nvm) -- MariaDB (using MariaDB in docker is also good) -- Java (for liquibase / openapi-generator) -- Liquibase -- OpenAPI generator -- wget (optional) - -Our recommendation is in the following code block: - -```shell -brew install wget -brew install java -brew install liquibase -``` - -Add the following to your `.zshrc` file: - -```shell - -# homebrew should export this on an M1 Mac -# the intel default is /usr/local -export HB_PREFIX=${HOMEBREW_PREFIX-/usr/local} - -# installing Java through brew recommendation -export CPPFLAGS="-I$HB_PREFIX/opt/openjdk/include" - -# installing liquibase through brew recommendation -export LIQUIBASE_HOME=$(brew --prefix)/opt/liquibase/libexec - -export PATH="$HB_PREFIX/bin:$PATH:$HB_PREFIX/opt/openjdk/bin" -``` - -#### Node through node-version manager (nvm) - -We aren't using too many node-specific features, anything from 16 should work fine, -this will install the LTS version: - -```shell -brew install nvm - -# you may need to add the the following to your .zshrc -# export NVM_DIR="$HOME/.nvm" -# [ -s "$HB_PREFIX/opt/nvm/nvm.sh" ] && \. "$HB_PREFIX/opt/nvm/nvm.sh" # This loads nvm -# [ -s "$HB_PREFIX/opt/nvm/etc/bash_completion.d/nvm" ] && \. "$HB_PREFIX/opt/nvm/etc/bash_completion.d/nvm" # This loads nvm bash_completion - -# install latest version of node + npm -nvm install --lts -``` - -#### OpenAPI generator - -You'll need this to generate the Python and Typescript API. 
- -```shell -npm install @openapitools/openapi-generator-cli -g -openapi-generator-cli version-manager set 5.3.0 - -# put these in your .zshrc -export OPENAPI_COMMAND="npx @openapitools/openapi-generator-cli" -alias openapi-generator="npx @openapitools/openapi-generator-cli" -``` - -#### MariaDB install - -If you're planning to install MariaDB locally, brew is the easiest: - -```shell - -brew install mariadb@10.8 -# start mariadb on computer start -brew services start mariadb@10.8 - -# make mariadb command available on path -export PATH="$HB_PREFIX/opt/mariadb@10.8/bin:$PATH" -``` - -#### Your .zshrc file - -If you installed all the software through brew and npm -like this guide suggests, your `.zshrc` may look like this: - - -```shell -alias openapi-generator="npx @openapitools/openapi-generator-cli" - -# homebrew should export this on an M1 Mac -# the intel default is /usr/local -export HB_PREFIX=${HOMEBREW_PREFIX-/usr/local} - -# metamist -export SM_ENVIRONMENT=LOCAL # good default to have -export SM_DEV_DB_USER=sm_api # makes it easier to copy liquibase update command -export OPENAPI_COMMAND="npx @openapitools/openapi-generator-cli" - -export PATH="$HB_PREFIX/bin:$HB_PREFIX/opt/mariadb@10.8/bin:$PATH:$HB_PREFIX/opt/openjdk/bin" - -export CPPFLAGS="-I$HB_PREFIX/opt/openjdk/include" -export LIQUIBASE_HOME=$(brew --prefix)/opt/liquibase/libexec - -# node -export NVM_DIR="$HOME/.nvm" -[ -s "$HB_PREFIX/opt/nvm/nvm.sh" ] && \. "$HB_PREFIX/opt/nvm/nvm.sh" # This loads nvm -[ -s "$HB_PREFIX/opt/nvm/etc/bash_completion.d/nvm" ] && \. "$HB_PREFIX/opt/nvm/etc/bash_completion.d/nvm" # This loads nvm bash_completion -``` - -### Database setup - -These are the default values for the SM database connection. -Please alter them if you use any different values when setting up the database. - -```shell -export SM_DEV_DB_USER=root # this is the default, but we now recommend sm_api -export SM_DEV_DB_PASSWORD= # empty password -export SM_DEV_DB_HOST=127.0.0.1 -export SM_DEV_DB_PORT=3306 # default mariadb port -export SM_DEV_DB_NAME=sm_dev; -``` - -Create the database in MariaDB (by default, we call it `sm_dev`): - -> In newer installs of MariaDB, the root user is protected by default. - -We'll setup a user called `sm_api`, and setup permissions - -```shell -sudo mysql -u root --execute " - CREATE DATABASE sm_dev; - CREATE USER sm_api@'%'; - CREATE USER sm_api@localhost; - CREATE ROLE sm_api_role; - GRANT sm_api_role TO sm_api@'%'; - GRANT sm_api_role TO sm_api@localhost; - SET DEFAULT ROLE sm_api_role FOR sm_api@'%'; - SET DEFAULT ROLE sm_api_role FOR sm_api@localhost; - GRANT ALL PRIVILEGES ON sm_dev.* TO sm_api_role; -" -``` - -Then, before you run you'll need to export the varied: - -```shell -# also put this in your .zshrc -export SM_DEV_DB_USER=sm_api -``` - -Download the `mariadb-java-client` and create the schema using liquibase: - -```shell -pushd db/ -wget https://repo1.maven.org/maven2/org/mariadb/jdbc/mariadb-java-client/3.0.3/mariadb-java-client-3.0.3.jar -liquibase \ - --changeLogFile project.xml \ - --url jdbc:mariadb://localhost/sm_dev \ - --driver org.mariadb.jdbc.Driver \ - --classpath mariadb-java-client-3.0.3.jar \ - --username ${SM_DEV_DB_USER:-root} \ - update -popd -``` - -#### Using Maria DB docker image - -Pull mariadb image - -```bash -docker pull mariadb:10.8.3 -``` - -Run a mariadb container that will server your database. 
`-p 3307:3306` remaps the port to 3307 in case if you local MySQL is already using 3306 - -```bash -docker stop mysql-p3307 # stop and remove if the container already exists -docker rm mysql-p3307 -# run with an empty root password -docker run -p 3307:3306 --name mysql-p3307 -e MYSQL_ALLOW_EMPTY_PASSWORD=true -d mariadb:10.8.3 -``` - -```bash -mysql --host=127.0.0.1 --port=3307 -u root -e 'CREATE DATABASE sm_dev;' -mysql --host=127.0.0.1 --port=3307 -u root -e 'show databases;' -``` - -Go into the `db/` subdirectory, download the `mariadb-java-client` and create the schema using liquibase: - -```bash - -pushd db/ -wget https://repo1.maven.org/maven2/org/mariadb/jdbc/mariadb-java-client/3.0.3/mariadb-java-client-3.0.3.jar -liquibase \ - --changeLogFile project.xml \ - --url jdbc:mariadb://127.0.0.1:3307/sm_dev \ - --driver org.mariadb.jdbc.Driver \ - --classpath mariadb-java-client-3.0.3.jar \ - --username root \ - update -popd -``` - -Finally, make sure you configure the server (making use of the environment variables) to point it to your local Maria DB server - -```bash -export SM_DEV_DB_PORT=3307 -``` - -### Running the server - -You'll want to set the following environment variables (permanently) in your -local development environment. - -The `SM_LOCALONLY_DEFAULTUSER` environment variable along with `ALLOWALLACCESS` to allow access to a local metamist server without providing a bearer token. This will allow you to test the front-end components that access data. This happens automatically on the production instance through the Google identity-aware-proxy. - -```shell -export SM_ALLOWALLACCESS=1 -export SM_LOCALONLY_DEFAULTUSER=$(whoami) -``` - -```shell -# ensures the SWAGGER page points to your local: (localhost:8000/docs) -# and ensures if you use the PythonAPI, it also points to your local -export SM_ENVIRONMENT=LOCAL -# skips permission checks in your local environment -export SM_ALLOWALLACCESS=true -# uses your username as the "author" in requests -export SM_LOCALONLY_DEFAULTUSER=$(whoami) - -# probably need this - - -# start the server -python3 -m api.server -# OR -# uvicorn --port 8000 --host 0.0.0.0 api.server:app -``` - -#### Running + debugging in VSCode - -The following `launch.json` is a good base to debug the web server in VSCode: - -```json -{ - "version": "0.2.0", - "configurations": [ - { - "name": "Run API", - "type": "python", - "request": "launch", - "module": "api.server", - "justMyCode": false, - "env": { - "SM_ALLOWALLACCESS": "true", - "SM_LOCALONLY_DEFAULTUSER": "-local", - "SM_ENVIRONMENT": "local", - "SM_DEV_DB_USER": "sm_api", - } - } - ] -} -``` - -We could now place breakpoints on the sample route (ie: `api/routes/sample.py`), and debug requests as they come in. - -Then in VSCode under the _Run and Debug_ tab (⌘⇧D), you can "Run API": - -![Run API](resources/debug-api.png) - -#### Quickstart: Generate and install the python installable API - -Generating the installable APIs (Python + Typescript) involves running -the server, getting the `/openapi.json`, and running `openapi-generator`. - -The `regenerate_api.py` script does this in a few ways: - -1. Uses a running server on `localhost:8000` -2. Runs a docker container from the `SM_DOCKER` environment variable -3. Spins up the server itself - -Most of the time, you'll use 1 or 3: - -```bash -# this will start the api.server, so make sure you have the dependencies installed, -python regenerate_api.py \ - && pip install . 
-``` - -If you'd prefer to use the Docker approach (eg: on CI), this command -will build the docker container and supply it to regenerate_api.py. - -```bash -# SM_DOCKER is a known env variable to regenerate_api.py -export SM_DOCKER="cpg/metamist-server:dev" -docker build --build-arg SM_ENVIRONMENT=local -t $SM_DOCKER -f deploy/api/Dockerfile . -python regenerate_api.py -``` - -#### Generating example data - -> You'll need to generate the installable API before running this step - -You can run the `generate_data.py` script to generate some -random data to look at. - -```shell -export SM_ENVIRONMENT=local # important -python test/data/generate_data.py -``` - -#### Developing the UI - -```shell -# Ensure you have started sm locally on your computer already, then in another tab open the UI. -# This will automatically proxy request to the server. -cd web -npm install -npm run compile -npm start -``` - -This will start a web server using Vite, running on [localhost:5173](http://localhost:5173). - - -### OpenAPI and Swagger - -The Web API uses `apispec` with OpenAPI3 annotations on each route to describe interactions with the server. We can generate a swagger UI and an installable -python module based on these annotations. - -Some handy links: - -- [OpenAPI specification](https://swagger.io/specification/) -- [Describing parameters](https://swagger.io/docs/specification/describing-parameters/) -- [Describing request body](https://swagger.io/docs/specification/describing-request-body/) -- [Media types](https://swagger.io/docs/specification/media-types/) - -The web API exposes this schema in two ways: - -- Swagger UI: `http://localhost:8000/docs` - - You can use this to construct requests to the server - - Make sure you fill in the Bearer token (at the top right ) -- OpenAPI schema: `http://localhost:8000/schema.json` - - Returns a JSON with the full OpenAPI 3 compliant schema. - - You could put this into the [Swagger editor](https://editor.swagger.io/) to see the same "Swagger UI" that `/api/docs` exposes. - - We generate the metamist installable Python API based on this schema. - -## Deployment - -The CPG deploy is managed through Cloud Run on the Google Cloud Platform. -The deploy github action builds the container, and is deployed. - -Additionally you can access metamist through the identity-aware proxy (IAP), -which handles the authentication through OAuth, allowing you to access the -front-end. +This project is licensed under the MIT License. You can see it in the [LICENSE](LICENSE) file in the root directory of this source tree. 
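+
+## GraphQL Example
+
+The GraphQL API can be queried directly from Python via the `metamist` client. A minimal sketch (the query, field selection and sample ID are illustrative only):
+
+```python
+from metamist.graphql import gql, query
+
+_query = gql(
+    """
+    query SampleById($sampleId: String!) {
+      sample(id: $sampleId) {
+        id
+        externalId
+      }
+    }
+    """
+)
+
+# 'CPG18' is a placeholder sample ID
+print(query(_query, {'sampleId': 'CPG18'}))
+```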
diff --git a/api/graphql/filters.py b/api/graphql/filters.py index 68ae3a3e5..398db3a00 100644 --- a/api/graphql/filters.py +++ b/api/graphql/filters.py @@ -1,4 +1,4 @@ -from typing import TypeVar, Generic, Callable, Any +from typing import Any, Callable, Generic, TypeVar import strawberry diff --git a/api/routes/__init__.py b/api/routes/__init__.py index 18edb5969..748d37e2b 100644 --- a/api/routes/__init__.py +++ b/api/routes/__init__.py @@ -1,11 +1,11 @@ -from api.routes.sample import router as sample_router -from api.routes.imports import router as import_router from api.routes.analysis import router as analysis_router from api.routes.assay import router as assay_router -from api.routes.participant import router as participant_router +from api.routes.billing import router as billing_router +from api.routes.enum import router as enum_router from api.routes.family import router as family_router +from api.routes.imports import router as import_router +from api.routes.participant import router as participant_router from api.routes.project import router as project_router -from api.routes.web import router as web_router -from api.routes.enum import router as enum_router +from api.routes.sample import router as sample_router from api.routes.sequencing_groups import router as sequencing_groups_router -from api.routes.billing import router as billing_router +from api.routes.web import router as web_router diff --git a/api/routes/assay.py b/api/routes/assay.py index 3770d5a76..01ac57e77 100644 --- a/api/routes/assay.py +++ b/api/routes/assay.py @@ -3,20 +3,14 @@ from fastapi import APIRouter from api.utils import get_project_readonly_connection -from api.utils.db import ( - Connection, - get_projectless_db_connection, -) +from api.utils.db import Connection, get_projectless_db_connection +from db.python.layers.assay import AssayLayer from db.python.tables.assay import AssayFilter from db.python.tables.project import ProjectPermissionsTable -from db.python.layers.assay import ( - AssayLayer, -) from db.python.utils import GenericFilter from models.models.assay import AssayUpsert from models.utils.sample_id_format import sample_id_transform_to_raw_list - router = APIRouter(prefix='/assay', tags=['assay']) diff --git a/api/routes/enum.py b/api/routes/enum.py index c655ddca3..e61bf5e3a 100644 --- a/api/routes/enum.py +++ b/api/routes/enum.py @@ -1,7 +1,8 @@ -from typing import Type from inspect import isclass +from typing import Type from fastapi import APIRouter + from api.utils.db import get_projectless_db_connection from db.python import enum_tables from db.python.enum_tables.enums import EnumTable diff --git a/api/routes/family.py b/api/routes/family.py index 8ca615b42..38b5298f2 100644 --- a/api/routes/family.py +++ b/api/routes/family.py @@ -1,19 +1,19 @@ # pylint: disable=invalid-name -import io -import csv import codecs +import csv +import io from datetime import date from typing import List, Optional -from fastapi import APIRouter, UploadFile, File, Query +from fastapi import APIRouter, File, Query, UploadFile from pydantic import BaseModel from starlette.responses import StreamingResponse from api.utils import get_projectless_db_connection from api.utils.db import ( + Connection, get_project_readonly_connection, get_project_write_connection, - Connection, ) from api.utils.export import ExportType from api.utils.extensions import guess_delimiter_by_upload_file_obj diff --git a/api/routes/imports.py b/api/routes/imports.py index d119875de..7f8f84bf2 100644 --- a/api/routes/imports.py +++ 
b/api/routes/imports.py @@ -1,15 +1,15 @@ -import csv import codecs +import csv from typing import Optional -from fastapi import APIRouter, UploadFile, File +from fastapi import APIRouter, File, UploadFile +from api.utils.db import Connection, get_project_write_connection +from api.utils.extensions import guess_delimiter_by_upload_file_obj from db.python.layers.participant import ( - ParticipantLayer, ExtraParticipantImporterHandler, + ParticipantLayer, ) -from api.utils.extensions import guess_delimiter_by_upload_file_obj -from api.utils.db import get_project_write_connection, Connection router = APIRouter(prefix='/import', tags=['import']) diff --git a/api/routes/participant.py b/api/routes/participant.py index fc2678fb1..61b3db382 100644 --- a/api/routes/participant.py +++ b/api/routes/participant.py @@ -4,20 +4,18 @@ from fastapi import APIRouter from fastapi.params import Query -from starlette.responses import StreamingResponse, JSONResponse +from starlette.responses import JSONResponse, StreamingResponse from api.utils import get_projectless_db_connection from api.utils.db import ( - get_project_write_connection, - get_project_readonly_connection, Connection, + get_project_readonly_connection, + get_project_write_connection, ) from api.utils.export import ExportType -from db.python.layers.participant import ( - ParticipantLayer, -) +from db.python.layers.participant import ParticipantLayer from models.models.participant import ParticipantUpsert -from models.models.sample import sample_id_format +from models.models.sequencing_group import sequencing_group_id_format router = APIRouter(prefix='/participant', tags=['participant']) @@ -113,35 +111,37 @@ async def update_many_participant_external_ids( @router.get( - '/{project}/external-pid-to-internal-sample-id', - operation_id='getExternalParticipantIdToInternalSampleId', + '/{project}/external-pid-to-sg-id', + operation_id='getExternalParticipantIdToSequencingGroupId', tags=['seqr'], ) -async def get_external_participant_id_to_internal_sample_id( +async def get_external_participant_id_to_sequencing_group_id( project: str, + sequencing_type: str = None, export_type: ExportType = ExportType.JSON, flip_columns: bool = False, connection: Connection = get_project_readonly_connection, ): """ - Get csv / tsv export of external_participant_id to internal_sample_id + Get csv / tsv export of external_participant_id to sequencing_group_id - Get a map of {external_participant_id} -> {internal_sample_id} - useful to matching joint-called samples in the matrix table to the participant + Get a map of {external_participant_id} -> {sequencing_group_id} + useful to matching joint-called sequencing groups in the matrix table to the participant Return a list not dictionary, because dict could lose participants with multiple samples. 
+ :param sequencing_type: Leave empty to get all sequencing types :param flip_columns: Set to True when exporting for seqr """ player = ParticipantLayer(connection) # this wants project ID (connection.project) assert connection.project m = await player.get_external_participant_id_to_internal_sequencing_group_id_map( - project=connection.project + project=connection.project, sequencing_type=sequencing_type ) - rows = [[pid, sample_id_format(sid)] for pid, sid in m] + rows = [[pid, sequencing_group_id_format(sgid)] for pid, sgid in m] if flip_columns: rows = [r[::-1] for r in rows] @@ -153,7 +153,9 @@ async def get_external_participant_id_to_internal_sample_id( writer.writerows(rows) ext = export_type.get_extension() - filename = f'{project}-participant-to-sample-map-{date.today().isoformat()}{ext}' + filename = f'{project}-participant-to-sequencing-group-map-{date.today().isoformat()}{ext}' + if sequencing_type: + filename = f'{project}-{sequencing_type}-participant-to-sequencing-group-map-{date.today().isoformat()}{ext}' return StreamingResponse( # stream the whole file at once, because it's all in memory anyway iter([output.getvalue()]), diff --git a/api/routes/sample.py b/api/routes/sample.py index 646a2dc7b..338d81747 100644 --- a/api/routes/sample.py +++ b/api/routes/sample.py @@ -1,18 +1,17 @@ from fastapi import APIRouter, Body from api.utils.db import ( - get_project_write_connection, - get_project_readonly_connection, Connection, + get_project_readonly_connection, + get_project_write_connection, get_projectless_db_connection, ) from db.python.layers.sample import SampleLayer from db.python.tables.project import ProjectPermissionsTable from models.models.sample import SampleUpsert -from models.utils.sample_id_format import ( - # Sample, - sample_id_transform_to_raw, +from models.utils.sample_id_format import ( # Sample, sample_id_format, + sample_id_transform_to_raw, sample_id_transform_to_raw_list, ) diff --git a/api/routes/sequencing_groups.py b/api/routes/sequencing_groups.py index edfbe170d..f3be40236 100644 --- a/api/routes/sequencing_groups.py +++ b/api/routes/sequencing_groups.py @@ -1,18 +1,18 @@ from typing import Any + from fastapi import APIRouter from pydantic import BaseModel from api.utils.db import ( - get_project_readonly_connection, Connection, - get_projectless_db_connection, + get_project_readonly_connection, get_project_write_connection, + get_projectless_db_connection, ) from db.python.layers.sequencing_group import SequencingGroupLayer from models.models.sequencing_group import SequencingGroupUpsertInternal from models.utils.sample_id_format import sample_id_format -from models.utils.sequencing_group_id_format import ( - # Sample, +from models.utils.sequencing_group_id_format import ( # Sample, sequencing_group_id_format_list, sequencing_group_id_transform_to_raw, ) diff --git a/api/server.py b/api/server.py index 652aa4322..1879bec50 100644 --- a/api/server.py +++ b/api/server.py @@ -19,7 +19,7 @@ from db.python.utils import get_logger # This tag is automatically updated by bump2version -_VERSION = '6.6.2' +_VERSION = '6.7.0' logger = get_logger() diff --git a/api/utils/__init__.py b/api/utils/__init__.py index 448a08d7f..fe5ae4cdd 100644 --- a/api/utils/__init__.py +++ b/api/utils/__init__.py @@ -1,13 +1,14 @@ """Importing GCP libraries""" from collections import defaultdict -from typing import TypeVar, Callable, Iterable -from .openapi import get_openapi_schema_func +from typing import Callable, Iterable, TypeVar + from .db import ( authenticate, 
get_project_readonly_connection, get_project_write_connection, get_projectless_db_connection, ) +from .openapi import get_openapi_schema_func T = TypeVar('T') X = TypeVar('X') diff --git a/api/utils/dates.py b/api/utils/dates.py index 2ef961f01..6c8c57796 100644 --- a/api/utils/dates.py +++ b/api/utils/dates.py @@ -1,4 +1,4 @@ -from datetime import datetime, date, timedelta +from datetime import date, datetime, timedelta INVOICE_DAY_DIFF = 3 diff --git a/api/utils/extensions.py b/api/utils/extensions.py index 6fefaa54d..b7a755a31 100644 --- a/api/utils/extensions.py +++ b/api/utils/extensions.py @@ -1,7 +1,6 @@ import csv from typing import Optional - EXTENSION_TO_DELIM_MAP = { '.csv': ',', '.tsv': '\t', diff --git a/db/backup/backup.py b/db/backup/backup.py index 261ac48e6..68cb6a9b6 100644 --- a/db/backup/backup.py +++ b/db/backup/backup.py @@ -8,9 +8,7 @@ from datetime import datetime from typing import Literal -from google.cloud import storage -from google.cloud import logging -from google.cloud import secretmanager +from google.cloud import logging, secretmanager, storage STORAGE_CLIENT = storage.Client() LOGGING_CLIENT = logging.Client() diff --git a/db/backup/recovery_test.py b/db/backup/recovery_test.py index 9a4616a1a..0c79fd7c1 100644 --- a/db/backup/recovery_test.py +++ b/db/backup/recovery_test.py @@ -5,18 +5,18 @@ """ -import unittest -import subprocess import json import os -from typing import Tuple, Optional +import subprocess +import unittest from collections import namedtuple +from typing import Optional, Tuple + import google.cloud.secretmanager import mysql.connector from parameterized import parameterized from restore import pull_latest_backup, restore - BACKUP_BUCKET = 'cpg-sm-backups' LOCAL_BACKUP_FOLDER = 'latest_backup' diff --git a/db/backup/restore.py b/db/backup/restore.py index 8a6a05cc9..f4c7c446c 100644 --- a/db/backup/restore.py +++ b/db/backup/restore.py @@ -3,6 +3,7 @@ import os import subprocess + from google.cloud import storage BACKUP_BUCKET = 'cpg-sm-backups' diff --git a/db/python/connect.py b/db/python/connect.py index 971058f8f..4a5abcba4 100644 --- a/db/python/connect.py +++ b/db/python/connect.py @@ -70,9 +70,11 @@ async def audit_log_id(self): async with self._audit_log_lock: if not self._audit_log_id: - # pylint: disable=import-outside-toplevel + # make this import here, otherwise we'd have a circular import - from db.python.tables.audit_log import AuditLogTable + from db.python.tables.audit_log import ( # pylint: disable=import-outside-toplevel,R0401 + AuditLogTable, + ) at = AuditLogTable(self) self._audit_log_id = await at.create_audit_log( @@ -154,9 +156,9 @@ def get_connection_string(self): if self.port: _host += f':{self.port}' - options: dict[ - str, str | int - ] = {} # {'min_size': self.min_pool_size, 'max_size': self.max_pool_size} + options: dict[str, str | int] = ( + {} + ) # {'min_size': self.min_pool_size, 'max_size': self.max_pool_size} _options = '&'.join(f'{k}={v}' for k, v in options.items()) url = f'mysql://{u_p}@{_host}/{self.dbname}?{_options}' diff --git a/db/python/gcp_connect.py b/db/python/gcp_connect.py index 649a0abc6..72fbc97df 100644 --- a/db/python/gcp_connect.py +++ b/db/python/gcp_connect.py @@ -6,6 +6,7 @@ import google.cloud.bigquery as bq from google.cloud import pubsub_v1 + from db.python.utils import InternalError logging.basicConfig(level=logging.DEBUG) diff --git a/db/python/layers/assay.py b/db/python/layers/assay.py index 0e512e6e7..3328acd82 100644 --- a/db/python/layers/assay.py +++ 
b/db/python/layers/assay.py @@ -1,10 +1,10 @@ # pylint: disable=too-many-arguments from typing import Any -from db.python.utils import NoOpAenter from db.python.layers.base import BaseLayer, Connection -from db.python.tables.assay import AssayTable, AssayFilter +from db.python.tables.assay import AssayFilter, AssayTable from db.python.tables.sample import SampleTable +from db.python.utils import NoOpAenter from models.models.assay import AssayInternal, AssayUpsertInternal diff --git a/db/python/layers/search.py b/db/python/layers/search.py index 3e94a7ca9..98187250d 100644 --- a/db/python/layers/search.py +++ b/db/python/layers/search.py @@ -1,13 +1,13 @@ import asyncio from typing import List, Optional -from db.python.utils import NotFoundError from db.python.layers.base import BaseLayer, Connection from db.python.tables.family import FamilyTable from db.python.tables.participant import ParticipantTable from db.python.tables.project import ProjectPermissionsTable from db.python.tables.sample import SampleTable from db.python.tables.sequencing_group import SequencingGroupTable +from db.python.utils import NotFoundError from models.enums.search import SearchResponseType from models.models.sample import sample_id_format, sample_id_transform_to_raw from models.models.search import ( diff --git a/db/python/layers/seqr.py b/db/python/layers/seqr.py index cad435852..b455a177c 100644 --- a/db/python/layers/seqr.py +++ b/db/python/layers/seqr.py @@ -12,6 +12,7 @@ import slack_sdk import slack_sdk.errors from cloudpathlib import AnyPath + from cpg_utils.cloud import get_google_identity_token from api.settings import ( diff --git a/deploy/python/version.txt b/deploy/python/version.txt index 28179fc1f..f0e13c509 100644 --- a/deploy/python/version.txt +++ b/deploy/python/version.txt @@ -1 +1 @@ -6.6.2 +6.7.0 diff --git a/docs/installation.md b/docs/installation.md new file mode 100644 index 000000000..89d35b38b --- /dev/null +++ b/docs/installation.md @@ -0,0 +1,421 @@ +# Installation + +This document provides detailed instructions on how to install the project. Follow these steps to set up the project on your local system. + +## Prerequisites + +[Homebrew](https://brew.sh) is the simplest way to install system dependencies needed for this project. + +[Chocolatey](https://community.chocolatey.org/) is a good equivalent to Homebrew for package management in Windows. + +## System Requirements + + +- **Node/NPM** (recommend using [pnpm](https://pnpm.io/motivation) for this, but you can also use [nvm](https://github.com/nvm-sh/nvm)) +- **MariaDB** (using MariaDB in docker is also good) +- **Java** (for Liquibase / OpenAPI Generator) +- **Liquibase** +- **OpenAPI Generator** +- **pyenv** +- **wget** *(optional)* + +### Mac + +```bash +brew install pnpm # recommended over nvm +# OR +brew install nvm + +brew install java +brew install liquibase +brew install pyenv +brew install wget + +# skip if you wish to install via docker +brew install mariadb@10.8 + +``` + +### Windows + +Instructions for Windows should theoretically work but we have only tested this project to work on -nix systems. As such, we are unable to verify any discrepancies on Windows, and there could be slight variations in your setup. 
+ +```bash +# Assuming you have Chocolatey +choco install pnpm # Recommended +# OR +choco install nvm + +choco install jdk8 +choco install liquibase +choco install pyenv-win +choco install wget + +# skip if you wish to install via docker +choco install mariadb --version=10.8.3 +``` + +## Installation Steps + +### Creating the environment + +- Python dependencies for the `metamist` API package are listed in `setup.py`. +- Additional dev requirements are listed in `requirements-dev.txt`. +- Packages for the sever-side code are listed in `requirements.txt`. + +We *STRONGLY* encourage the use of `pyenv` for managing Python versions. Debugging and the server will run on a minimum python version of 3.10. Refer to the [team-docs](https://github.com/populationgenomics/team-docs/blob/main/python.md) for more instructions on how to set this up. + +Use of a virtual environment to contain all requirements is highly recommended: + +```bash +virtualenv venv +source venv/bin/activate +pip install -r requirements.txt -r requirements-dev.txt + +# Installs metamist as a package +pip install --editable . +``` + +You will also need to set the following environment variables. Adjust the paths if you installed the dependencies using an alternate means: + +```bash +# homebrew should export this on an M1 Mac +# the intel default is /usr/local +export HB_PREFIX=${HOMEBREW_PREFIX-/usr/local} + +# installing Java through brew recommendation +export CPPFLAGS="-I$HB_PREFIX/opt/openjdk/include" +export PATH="$HB_PREFIX/bin:$PATH:$HB_PREFIX/opt/openjdk/bin" + +# installing liquibase through brew recommendation +export LIQUIBASE_HOME=$(brew --prefix)/opt/liquibase/libexec + +# mariadb +export PATH="$HB_PREFIX/opt/mariadb@10.8/bin:$PATH" + +# metamist config +export SM_ENVIRONMENT=LOCAL # good default to have +export SM_DEV_DB_USER=sm_api # makes it easier to copy liquibase update command +``` + +You can also add these to your shell config file e.g `.zshrc` or `.bashrc` for persistence to new sessions. + +#### PNPM/NVM Config + +Depending on your choice for using `pnpm` or `nvm` you will have to configure your shell for it. + +If you installed `pnpm`, you should have a similar snippet from the brew installation output: + +```shell +export PNPM_HOME="/Users/$(whoami)/Library/pnpm" +case ":$PATH:" in + *":$PNPM_HOME:"*) ;; + *) export PATH="$PNPM_HOME:$PATH" ;; +esac +``` + +Add this to your `.zshrc` to auto-load on next shell session. + +If you installed `nvm`, you will need to add lazy load since `nvm` has high load penalties. + +- For Oh-My-Zsh users, you can just add the `nvm` plugin to your `.zshrc` via these [instructions](https://github.com/ohmyzsh/ohmyzsh/blob/master/plugins/nvm/README.md) + +- If you do NOT have Oh-My-Zsh, you can use this [plugin](https://github.com/undg/zsh-nvm-lazy-load): + +```shell +git clone https://github.com/undg/zsh-nvm-lazy-load $ZSH/custom/plugins/zsh-nvm + +#Add this to your plugins variable in the `.zshrc` file and then source the file. +plugins=(... zsh-nvm-lazy-load) +``` + +Once set up, install the OpenAPI Generator: + +- For `pnpm`: + +```shell +# Install npm via pnpm +# This also activates the env for you, replace `use` with `add` to only install it. 
+pnpm env use --global lts +pnpm install @openapitools/openapi-generator-cli -g +``` + +Add this to your `.zshrc`: + +```shell +# openapi +export OPENAPI_COMMAND="pnpm dlx @openapitools/openapi-generator-cli" +alias openapi-generator="pnpm dlx @openapitools/openapi-generator-cli" +``` + +- For `nvm`: + +```shell +# Install npm via nvm +nvm install --lts +npm install @openapitools/openapi-generator-cli -g +``` + +Add this to your `.zshrc`: + +```shell +# openapi +export OPENAPI_COMMAND="npx @openapitools/openapi-generator-cli" +alias openapi-generator="npx @openapitools/openapi-generator-cli" +``` + +Finally, set the version: + +```shell +openapi-generator-cli version-manager set 5.3.0 +``` + +### Database Setup - Native Installation + +Set the following environment variables: + +```bash +export SM_DEV_DB_USER=sm_api +export SM_DEV_DB_PASSWORD= # empty password +export SM_DEV_DB_HOST=127.0.0.1 +export SM_DEV_DB_PORT=3306 # default mariadb port +export SM_DEV_DB_NAME=sm_dev; +``` + +Next, create the database `sm_dev` in MariaDB. + +> In newer versions of MariaDB, the root user is protected. + +Create a new user `sm_api` and provide permissions: + +```bash +sudo mysql -u root --execute " + CREATE DATABASE sm_dev; + CREATE USER sm_api@'%'; + CREATE USER sm_api@localhost; + CREATE ROLE sm_api_role; + GRANT sm_api_role TO sm_api@'%'; + GRANT sm_api_role TO sm_api@localhost; + SET DEFAULT ROLE sm_api_role FOR sm_api@'%'; + SET DEFAULT ROLE sm_api_role FOR sm_api@localhost; + GRANT ALL PRIVILEGES ON sm_dev.* TO sm_api_role; +" +``` + +Using `liquibase` we can now set up the tables as per the schema in `db/project.xml`: + +```bash +pushd db/ +wget https://repo1.maven.org/maven2/org/mariadb/jdbc/mariadb-java-client/3.0.3/mariadb-java-client-3.0.3.jar +liquibase \ + --changeLogFile project.xml \ + --url jdbc:mariadb://localhost/sm_dev \ + --driver org.mariadb.jdbc.Driver \ + --classpath mariadb-java-client-3.0.3.jar \ + --username ${SM_DEV_DB_USER:-root} \ + update +popd +``` + +### Database Setup - Docker Installation + +Ensure you have Docker installed or follow [this guide](https://docs.docker.com/engine/install/) to setup. 
+ +Pull the image: + +```bash +docker pull mariadb:10.8.3 +``` + +Run the container on port 3306: + +```bash +docker run --name mariadb-p3306 -e MYSQL_ALLOW_EMPTY_PASSWORD=1 -p 3306:3306 -d docker.io/library/mariadb:10.8.3 +``` + +If you have a local MySQL instance already running on port 3306, you can map the docker container to run on 3307: + +```bash +docker run --name mariadb-p3307 -e MYSQL_ALLOW_EMPTY_PASSWORD=1 -p 3307:3306 -d docker.io/library/mariadb:10.8.3 +``` + +You can now execute bash commands inside a shell: + +```bash +docker exec -it mariadb-p3306 bash +``` + +Set up the database with the `sm_api` user and appropriate permissions: + +```bash +mysql -u root --execute " + CREATE DATABASE sm_dev; + CREATE USER sm_api@'%'; + CREATE USER sm_api@localhost; + CREATE ROLE sm_api_role; + GRANT sm_api_role TO sm_api@'%'; + GRANT sm_api_role TO sm_api@localhost; + SET DEFAULT ROLE sm_api_role FOR sm_api@'%'; + SET DEFAULT ROLE sm_api_role FOR sm_api@localhost; + GRANT ALL PRIVILEGES ON sm_dev.* TO sm_api_role; +" +``` + +Exit the container bash shell once done and on the host, run liquibase with the correct port mapping to set up the tables: + +```bash +pushd db/ +wget https://repo1.maven.org/maven2/org/mariadb/jdbc/mariadb-java-client/3.0.3/mariadb-java-client-3.0.3.jar +liquibase \ + --changeLogFile project.xml \ + --url jdbc:mariadb://127.0.0.1:3306/sm_dev \ + --driver org.mariadb.jdbc.Driver \ + --classpath mariadb-java-client-3.0.3.jar \ + --username root \ + update +popd +``` + +Ensure the database port environment variable matches the mapping above: + +```bash +export SM_DEV_DB_PORT=3306 # or 3307 +``` + +## Running the server + +You'll want to set the following environment variables (permanently) in your local development environment. + +The `SM_ENVIRONMENT`, `SM_LOCALONLY_DEFAULTUSER` and `SM_ALLOWALLACCESS` environment variables allow access to a local metamist server without providing a bearer token. + +This will allow you to test the front-end components that access data. This happens automatically on the production instance through the Google identity-aware-proxy. 
+ +```bash +# ensures the SWAGGER page points to your local: (localhost:8000/docs) +# and ensures if you use the PythonAPI, it also points to your local +export SM_ENVIRONMENT=LOCAL +# skips permission checks in your local environment +export SM_ALLOWALLACCESS=true +# uses your username as the "author" in requests +export SM_LOCALONLY_DEFAULTUSER=$(whoami) +``` + +With those variables set, it is a good time to populate some test data if this is your first time running this server: + +```bash +python3 test/data/generate_data.py +``` + +You can now run the server: + +```bash +# start the server +python3 -m api.server +# OR +# uvicorn --port 8000 --host 0.0.0.0 api.server:app +``` + + +## Running Locally for Dev + +### Running and Debugging in VS Code + +The following `launch.json` is a good base to debug the web server in VS Code: + +```json +{ + "version": "0.2.0", + "configurations": [ + { + "name": "Run API", + "type": "python", + "request": "launch", + "module": "api.server", + "justMyCode": false, + "env": { + "SM_ALLOWALLACCESS": "true", + "SM_LOCALONLY_DEFAULTUSER": "-local", + "SM_ENVIRONMENT": "local", + "SM_DEV_DB_USER": "sm_api", + } + } + ] +} +``` + +You can now place breakpoints anywhere and debug the API with "Run API" under the *Run and Debug* tab (⌘⇧D) or (Ctrl+Shift+D): + +![Run and Debug](../resources/debug-api.png) + +### Generate and install the python installable API + +After making any changes to the logic, it is worth regenerating the API with the OpenAPI Generator. + +Generating the installable APIs (Python + Typescript) involves running the server, getting the `/openapi.json`, and running `openapi-generator`. + +The `regenerate_api.py` script does this for us in a few ways: + +1. Uses a running server on `localhost:8000` +2. Runs a docker container from the `SM_DOCKER` environment variable. +3. Spins up the server itself. + +You can simply run: + +```bash +# this will start the api.server, so make sure you have the dependencies installed, +python regenerate_api.py \ + && pip install . +``` + +or if you prefer the Docker approach (eg: for CI), this command will build the docker container and supply it to `regenerate_api.py`: + +```bash +# SM_DOCKER is a known env variable to regenerate_api.py +export SM_DOCKER="cpg/metamist-server:dev" +docker build --build-arg SM_ENVIRONMENT=local -t $SM_DOCKER -f deploy/api/Dockerfile . +python regenerate_api.py +``` + +### Developing the UI + +```bash +# Ensure you have started metamist server locally on your computer already, then in another tab open the UI. +# This will automatically proxy request to the server. +cd web +npm install +npm run compile +npm start +``` + +This will start a web server using Vite, running on `localhost:5173`. + +### OpenAPI and Swagger + +The Web API uses `apispec` with OpenAPI3 annotations on each route to describe interactions with the server. We can generate a swagger UI and an installable +python module based on these annotations. 
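+
+For a quick sanity check that the annotations are being picked up, you can pull the generated schema straight from a locally running server. A minimal sketch (assumes the server from the previous section is listening on port 8000):
+
+```python
+import requests
+
+# the server serves its full OpenAPI 3 schema at /schema.json
+schema = requests.get('http://localhost:8000/schema.json', timeout=10).json()
+print(schema['info']['title'], schema['info']['version'])
+print(f"{len(schema['paths'])} documented paths")
+```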
+ +Some handy links: + +- [OpenAPI specification](https://swagger.io/specification/) +- [Describing parameters](https://swagger.io/docs/specification/describing-parameters/) +- [Describing request body](https://swagger.io/docs/specification/describing-request-body/) +- [Media types](https://swagger.io/docs/specification/media-types/) + +The web API exposes this schema in two ways: + +- Swagger UI: `http://localhost:8000/docs` + - You can use this to construct requests to the server + - Make sure you fill in the Bearer token (at the top right ) +- OpenAPI schema: `http://localhost:8000/schema.json` + - Returns a JSON with the full OpenAPI 3 compliant schema. + - You could put this into the [Swagger editor](https://editor.swagger.io/) to see the same "Swagger UI" that `/api/docs` exposes. + - We generate the metamist installable Python API based on this schema. + +## Deployment + +The CPG deploy is managed through Cloud Run on the Google Cloud Platform. +The deploy github action builds the container, and is deployed. + +Additionally you can access metamist through the identity-aware proxy (IAP), +which handles the authentication through OAuth, allowing you to access the +front-end. diff --git a/etl/extract/main.py b/etl/extract/main.py index 6b05cbbaf..8e3971042 100644 --- a/etl/extract/main.py +++ b/etl/extract/main.py @@ -7,10 +7,10 @@ import flask import functions_framework import google.cloud.bigquery as bq -from cpg_utils.cloud import email_from_id_token - from google.cloud import pubsub_v1 # type: ignore +from cpg_utils.cloud import email_from_id_token + BIGQUERY_TABLE = os.getenv('BIGQUERY_TABLE') PUBSUB_TOPIC = os.getenv('PUBSUB_TOPIC') diff --git a/etl/load/main.py b/etl/load/main.py index b0a2a0a26..58d7ea5ce 100644 --- a/etl/load/main.py +++ b/etl/load/main.py @@ -9,11 +9,8 @@ import flask import functions_framework import google.cloud.bigquery as bq - -from google.cloud import pubsub_v1 # type: ignore - import pkg_resources - +from google.cloud import pubsub_v1 # type: ignore BIGQUERY_TABLE = os.getenv('BIGQUERY_TABLE') BIGQUERY_LOG_TABLE = os.getenv('BIGQUERY_LOG_TABLE') diff --git a/metamist/audit/audit_upload_bucket.py b/metamist/audit/audit_upload_bucket.py index 772fff409..22827a532 100644 --- a/metamist/audit/audit_upload_bucket.py +++ b/metamist/audit/audit_upload_bucket.py @@ -4,21 +4,20 @@ and sequencing groups that have no aligned CRAM. 
""" -from datetime import datetime -import sys +import asyncio import logging import os -import asyncio +import sys +from datetime import datetime from functools import cache + import click from cpg_utils.config import get_config - from metamist.audit.generic_auditor import GenericAuditor from metamist.graphql import gql, query - FASTQ_EXTENSIONS = ('.fq.gz', '.fastq.gz', '.fq', '.fastq') BAM_EXTENSIONS = ('.bam',) CRAM_EXTENSIONS = ('.cram',) diff --git a/metamist/audit/audithelper.py b/metamist/audit/audithelper.py index d20810ad9..7b58fb2a1 100644 --- a/metamist/audit/audithelper.py +++ b/metamist/audit/audithelper.py @@ -6,6 +6,7 @@ from typing import Any from cloudpathlib import AnyPath + from cpg_utils.cloud import get_path_components_from_gcp_path from metamist.parser.cloudhelper import CloudHelper diff --git a/metamist/audit/delete_assay_files_from_audit.py b/metamist/audit/delete_assay_files_from_audit.py index d3480d2c8..c60e42439 100644 --- a/metamist/audit/delete_assay_files_from_audit.py +++ b/metamist/audit/delete_assay_files_from_audit.py @@ -13,16 +13,18 @@ """ import csv -from datetime import datetime import logging import os import sys +from datetime import datetime + import click +from cloudpathlib import AnyPath, CloudPath from google.cloud import storage -from cloudpathlib import CloudPath, AnyPath + from cpg_utils.config import get_config -from metamist.audit.audithelper import AuditHelper +from metamist.audit.audithelper import AuditHelper CLIENT = storage.Client() diff --git a/metamist/audit/generic_auditor.py b/metamist/audit/generic_auditor.py index 032522fc6..c69eacf40 100644 --- a/metamist/audit/generic_auditor.py +++ b/metamist/audit/generic_auditor.py @@ -1,13 +1,14 @@ -from collections import defaultdict, namedtuple -from functools import cache import logging import os -from typing import Any +from collections import defaultdict, namedtuple from datetime import datetime +from functools import cache +from typing import Any from gql.transport.requests import log as requests_logger + from metamist.audit.audithelper import AuditHelper -from metamist.graphql import query_async, gql +from metamist.graphql import gql, query_async ANALYSIS_TYPES_QUERY = gql( """ diff --git a/metamist/graphql/__init__.py b/metamist/graphql/__init__.py index 2cd00b996..5696293f7 100644 --- a/metamist/graphql/__init__.py +++ b/metamist/graphql/__init__.py @@ -5,21 +5,21 @@ - validate queries with metamist schema (by fetching the schema) """ import os -from typing import Dict, Any +from typing import Any, Dict -from gql import Client, gql as gql_constructor +from gql import Client +from gql import gql as gql_constructor from gql.transport.aiohttp import AIOHTTPTransport from gql.transport.aiohttp import log as aiohttp_logger from gql.transport.requests import RequestsHTTPTransport from gql.transport.requests import log as requests_logger -from cpg_utils.cloud import get_google_identity_token - # this does not import itself, it imports the module from graphql import DocumentNode # type: ignore -import metamist.configuration +from cpg_utils.cloud import get_google_identity_token +import metamist.configuration _sync_client: Client | None = None _async_client: Client | None = None diff --git a/metamist/parser/cloudhelper.py b/metamist/parser/cloudhelper.py index c4f03aa4d..a03c89b20 100644 --- a/metamist/parser/cloudhelper.py +++ b/metamist/parser/cloudhelper.py @@ -1,14 +1,13 @@ # pylint: disable=no-member -import os import logging -from typing import Iterable, Callable, TypeVar +import os 
 from collections import defaultdict
 from datetime import datetime
+from typing import Callable, Iterable, TypeVar
 from cloudpathlib import AnyPath, GSPath
 from google.cloud import storage
-
 # type declarations for GroupBy
 T = TypeVar('T')
 X = TypeVar('X')
diff --git a/metamist/parser/generic_metadata_parser.py b/metamist/parser/generic_metadata_parser.py
index ac3145249..d89104df0 100644
--- a/metamist/parser/generic_metadata_parser.py
+++ b/metamist/parser/generic_metadata_parser.py
@@ -8,13 +8,12 @@ import click
-from metamist.parser.generic_parser import (
+from metamist.parser.generic_parser import (  # noqa
     GenericParser,
     GroupedRow,
     ParsedAnalysis,
     ParsedAssay,
     ParsedSequencingGroup,
-    # noqa
     SingleRow,
     run_as_sync,
 )
diff --git a/metamist/parser/sample_file_map_parser.py b/metamist/parser/sample_file_map_parser.py
index 20f8eb5b9..077aca912 100644
--- a/metamist/parser/sample_file_map_parser.py
+++ b/metamist/parser/sample_file_map_parser.py
@@ -5,10 +5,7 @@ import click
-from metamist.parser.generic_metadata_parser import (
-    GenericMetadataParser,
-    run_as_sync,
-)
+from metamist.parser.generic_metadata_parser import GenericMetadataParser, run_as_sync
 from metamist.parser.generic_parser import SingleRow
 PARTICIPANT_COL_NAME = 'individual_id'
diff --git a/metamist_infrastructure/__init__.py b/metamist_infrastructure/__init__.py
index 9007ac9a6..420050dd9 100644
--- a/metamist_infrastructure/__init__.py
+++ b/metamist_infrastructure/__init__.py
@@ -3,5 +3,6 @@ """
 import sys
+
 if 'unittest' not in sys.modules:
     from metamist_infrastructure.driver import MetamistInfrastructure
diff --git a/metamist_infrastructure/__main__.py b/metamist_infrastructure/__main__.py
index 7cc77df3d..efdf0c08d 100644
--- a/metamist_infrastructure/__main__.py
+++ b/metamist_infrastructure/__main__.py
@@ -9,6 +9,7 @@ from typing import NamedTuple
 from cpg_infra.config import CPGInfrastructureConfig
+
 from metamist_infrastructure import MetamistInfrastructure
 GCP_PROJECT = os.getenv('METAMIST_INFRA_GCP_PROJECT')
diff --git a/metamist_infrastructure/driver.py b/metamist_infrastructure/driver.py
index 96ef95207..8e69d2577 100644
--- a/metamist_infrastructure/driver.py
+++ b/metamist_infrastructure/driver.py
@@ -9,6 +9,7 @@ import pulumi
 import pulumi_gcp as gcp
+
 from cpg_infra.plugin import CpgInfrastructurePlugin
 from cpg_infra.utils import archive_folder
diff --git a/metamist_infrastructure/slack_notification.py b/metamist_infrastructure/slack_notification.py
index d0fb33ebb..89a9ab1ec 100644
--- a/metamist_infrastructure/slack_notification.py
+++ b/metamist_infrastructure/slack_notification.py
@@ -9,6 +9,7 @@ import pulumi
 import pulumi_gcp as gcp
+
 from cpg_infra.utils import archive_folder
 from cpg_utils.cloud import read_secret
diff --git a/models/models/sequencing_group.py b/models/models/sequencing_group.py
index 1ccd1a991..4f4ed878e 100644
--- a/models/models/sequencing_group.py
+++ b/models/models/sequencing_group.py
@@ -8,7 +8,6 @@ sequencing_group_id_transform_to_raw,
 )
-
 SequencingGroupInternalId = int
 SequencingGroupExternalId = str
diff --git a/models/models/web.py b/models/models/web.py
index d368f29ab..0c96ed90f 100644
--- a/models/models/web.py
+++ b/models/models/web.py
@@ -2,7 +2,7 @@ import dataclasses
 from models.base import SMBase
-from models.models.participant import NestedParticipantInternal, NestedParticipant
+from models.models.participant import NestedParticipant, NestedParticipantInternal
 class WebProject(SMBase):
diff --git a/models/utils/sample_id_format.py b/models/utils/sample_id_format.py
index 31bd873eb..154ecb410 100644
--- a/models/utils/sample_id_format.py
+++ b/models/utils/sample_id_format.py
@@ -1,5 +1,6 @@ from typing import Iterable
-from api.settings import SAMPLE_PREFIX, SAMPLE_CHECKSUM_OFFSET
+
+from api.settings import SAMPLE_CHECKSUM_OFFSET, SAMPLE_PREFIX
 from models.utils.luhn import luhn_compute, luhn_is_valid
diff --git a/models/utils/sequencing_group_id_format.py b/models/utils/sequencing_group_id_format.py
index 98a374b63..5be3e164d 100644
--- a/models/utils/sequencing_group_id_format.py
+++ b/models/utils/sequencing_group_id_format.py
@@ -1,6 +1,7 @@ from typing import Iterable
-from api.settings import SEQUENCING_GROUP_PREFIX, SEQUENCING_GROUP_CHECKSUM_OFFSET
-from models.utils.luhn import luhn_is_valid, luhn_compute
+
+from api.settings import SEQUENCING_GROUP_CHECKSUM_OFFSET, SEQUENCING_GROUP_PREFIX
+from models.utils.luhn import luhn_compute, luhn_is_valid
 def sequencing_group_id_transform_to_raw_list(
diff --git a/pyproject.toml b/pyproject.toml
index a9d613157..d8e889718 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -3,3 +3,24 @@ line-length = 88
 skip-string-normalization = true
 exclude = "metamist/"
 include = "metamist/parser/"
+
+[tool.isort]
+py_version = 311
+profile = "black"
+line_length = 88
+sections = ["FUTURE", "STDLIB", "THIRDPARTY", "HAIL", "CPG", "FIRSTPARTY", "LOCALFOLDER"]
+known_hail = [
+    "hail",
+    "hailtop",
+]
+# Adjust these for each repository, e.g., removing those that should be
+# local rather than CPG. Also fill in extend_skip below if there are any
+# subdirectories that should be ignored.
+known_cpg = [
+    "analysis_runner",
+    "cpg_infra",
+    "cpg_utils",
+    "cpg_workflows",
+    "gnomad",
+    "hail_scripts",
+]
diff --git a/resources/2024-01-15_db-diagram.png b/resources/2024-01-15_db-diagram.png
new file mode 100644
index 000000000..8a34e0032
Binary files /dev/null and b/resources/2024-01-15_db-diagram.png differ
diff --git a/scripts/add_cram_size.py b/scripts/add_cram_size.py
index 1872960c4..1e00b04ef 100644
--- a/scripts/add_cram_size.py
+++ b/scripts/add_cram_size.py
@@ -3,22 +3,21 @@ This script goes through all CRAMS in sample-metadata, gets the size,
 and updates the meta['size'] attribute on the analysis.
 """
-import re
+import asyncio
 import logging
 import os
-import asyncio
+import re
 from typing import Dict, List
-from google.cloud import storage
 from google.api_core.exceptions import NotFound
+from google.cloud import storage
 from api.utils import group_by
-
 from metamist.apis import AnalysisApi, ProjectApi
 from metamist.model.analysis_query_model import AnalysisQueryModel
-from metamist.model.analysis_update_model import AnalysisUpdateModel
 from metamist.model.analysis_status import AnalysisStatus
 from metamist.model.analysis_type import AnalysisType
+from metamist.model.analysis_update_model import AnalysisUpdateModel
 from metamist.parser.generic_parser import chunk
 logger = logging.getLogger(__name__)
diff --git a/scripts/check_md5s.py b/scripts/check_md5s.py
index 75104c722..1bea22388 100644
--- a/scripts/check_md5s.py
+++ b/scripts/check_md5s.py
@@ -2,9 +2,10 @@ from typing import Set
 import click
-import hailtop.batch as hb
 from google.cloud import storage
+import hailtop.batch as hb
+
 DRIVER_IMAGE = 'australia-southeast1-docker.pkg.dev/analysis-runner/images/driver:8cc869505251c8396fefef01c42225a7b7930a97-hail-0.2.73.devc6f6f09cec08'
diff --git a/scripts/check_md5s_from_metamist.py b/scripts/check_md5s_from_metamist.py
index 0fa39546e..a57a1ee2b 100644
--- a/scripts/check_md5s_from_metamist.py
+++ b/scripts/check_md5s_from_metamist.py
@@ -1,14 +1,16 @@
-from collections import namedtuple
 import math
 import os
+from collections import namedtuple
 from shlex import quote
 import click
+
 import hailtop.batch as hb
-from cpg_utils.hail_batch import remote_tmpdir
+
 from cpg_utils.config import get_config
+from cpg_utils.hail_batch import remote_tmpdir
-from metamist.apis import SampleApi, AssayApi
+from metamist.apis import AssayApi, SampleApi
 LocationTuple = namedtuple(
     'LocationTuple', ['cpg_sample_id', 'location', 'checksum', 'size']
 )
diff --git a/scripts/check_sequence_files.py b/scripts/check_sequence_files.py
index 318945c22..e8c22414f 100755
--- a/scripts/check_sequence_files.py
+++ b/scripts/check_sequence_files.py
@@ -4,14 +4,15 @@ Find sequencing files that exist in the bucket, but are not ingested.
 This pairs well will the cleanup_fastqs.py script.
 """
-import os
 import asyncio
 import logging
+import os
 from collections import defaultdict
-from typing import Set, Dict, Any, List
+from typing import Any, Dict, List, Set
 from google.cloud import storage
-from metamist.apis import WebApi, SampleApi, AssayApi, ProjectApi
+
+from metamist.apis import AssayApi, ProjectApi, SampleApi, WebApi
 # Global vars
 logger = logging.getLogger(__file__)
diff --git a/scripts/create_md5s.py b/scripts/create_md5s.py
index f99f64906..fe7c78ae5 100644
--- a/scripts/create_md5s.py
+++ b/scripts/create_md5s.py
@@ -1,9 +1,10 @@ import os
 import click
-from cpg_utils.hail_batch import get_batch, get_config, copy_common_env
 from google.cloud import storage
+from cpg_utils.hail_batch import copy_common_env, get_batch, get_config
+
 def create_md5s_for_files_in_directory(skip_filetypes: tuple[str, str], force_recreate: bool, gs_dir):
     """Validate files with MD5s in the provided gs directory"""
diff --git a/scripts/create_test_subset.py b/scripts/create_test_subset.py
index 14f8ccd00..448d89b54 100755
--- a/scripts/create_test_subset.py
+++ b/scripts/create_test_subset.py
@@ -21,14 +21,14 @@ from google.cloud import storage
-from metamist.apis import AnalysisApi, AssayApi, SampleApi, FamilyApi, ParticipantApi
+from metamist.apis import AnalysisApi, AssayApi, FamilyApi, ParticipantApi, SampleApi
 from metamist.graphql import gql, query
 from metamist.models import (
-    AssayUpsert,
-    SampleUpsert,
     Analysis,
     AnalysisStatus,
     AnalysisUpdateModel,
+    AssayUpsert,
+    SampleUpsert,
     SequencingGroupUpsert,
 )
diff --git a/scripts/etl_caller_example.py b/scripts/etl_caller_example.py
index 32544c942..8f353bd91 100644
--- a/scripts/etl_caller_example.py
+++ b/scripts/etl_caller_example.py
@@ -7,11 +7,12 @@ pip install requests google-auth requests urllib3
 """
 import os
-import requests
-import google.oauth2.id_token
+
 import google.auth.transport.requests
-from urllib3 import Retry
+import google.oauth2.id_token
+import requests
 from requests.adapters import HTTPAdapter
+from urllib3 import Retry
 URL = 'https://metamist-etl-mnrpw3mdza-ts.a.run.app'
 TYPE = 'NAME_OF_EXTERNAL_PARTY/v1'
diff --git a/scripts/fix_family_ids.py b/scripts/fix_family_ids.py
index 8357deda5..c714916cd 100644
--- a/scripts/fix_family_ids.py
+++ b/scripts/fix_family_ids.py
@@ -2,7 +2,9 @@ Small script to update external family IDs
 """
 import json
+
 import click
+
 from metamist.apis import FamilyApi
 from metamist.model.family_update_model import FamilyUpdateModel
diff --git a/scripts/fix_participant_ids.py b/scripts/fix_participant_ids.py
index 80990649b..f8ed64dcb 100644
--- a/scripts/fix_participant_ids.py
+++ b/scripts/fix_participant_ids.py
@@ -2,7 +2,9 @@ Small script to update external participant IDs
 """
 import json
+
 import click
+
 from metamist.apis import ParticipantApi
diff --git a/scripts/parse_participant_meta.py b/scripts/parse_participant_meta.py
index dac68a01e..a351842f6 100644
--- a/scripts/parse_participant_meta.py
+++ b/scripts/parse_participant_meta.py
@@ -12,7 +12,6 @@ import traceback
 import click
-
 from google.cloud import storage
 from metamist import ApiException
diff --git a/scripts/process_ont_products.py b/scripts/process_ont_products.py
index d5ba354a0..b160d8696 100644
--- a/scripts/process_ont_products.py
+++ b/scripts/process_ont_products.py
@@ -11,10 +11,7 @@ from metamist.model.analysis import Analysis
 from metamist.model.analysis_status import AnalysisStatus
 from metamist.parser.cloudhelper import CloudHelper
-from metamist.parser.generic_metadata_parser import (
-    GenericMetadataParser,
-    run_as_sync,
-)
+from metamist.parser.generic_metadata_parser import GenericMetadataParser, run_as_sync
 from metamist.parser.generic_parser import CustomDictReader, chunk
 OntAnalysesPreparer = namedtuple(
diff --git a/setup.py b/setup.py
index 0b812b931..1c1d579c0 100644
--- a/setup.py
+++ b/setup.py
@@ -19,7 +19,7 @@ setup(
     name=PKG,
     # This tag is automatically updated by bump2version
-    version='6.6.2',
+    version='6.7.0',
     description='Python API for interacting with the Sample API system',
     long_description=readme,
     long_description_content_type='text/markdown',
diff --git a/test/data/generate_seqr_project_data.py b/test/data/generate_seqr_project_data.py
index ec3424217..5a3f7f5bf 100644
--- a/test/data/generate_seqr_project_data.py
+++ b/test/data/generate_seqr_project_data.py
@@ -7,9 +7,8 @@ import random
 import sys
 import tempfile
-
 from pprint import pprint
-from typing import Set, List
+from typing import List, Set
 from metamist.apis import AnalysisApi, FamilyApi, ParticipantApi, ProjectApi, SampleApi
 from metamist.graphql import gql, query_async
diff --git a/test/test_assay.py b/test/test_assay.py
index d53e79eb8..6c26d26ad 100644
--- a/test/test_assay.py
+++ b/test/test_assay.py
@@ -2,12 +2,11 @@ from pymysql.err import IntegrityError
-from db.python.utils import NotFoundError
 from db.python.enum_tables import AssayTypeTable
 from db.python.layers.assay import AssayLayer
 from db.python.layers.sample import SampleLayer
 from db.python.tables.assay import AssayFilter
-from db.python.utils import GenericFilter
+from db.python.utils import GenericFilter, NotFoundError
 from models.models.assay import AssayUpsertInternal
 from models.models.sample import SampleUpsertInternal
diff --git a/test/test_generate_data.py b/test/test_generate_data.py
index 56be80269..de7995c1d 100644
--- a/test/test_generate_data.py
+++ b/test/test_generate_data.py
@@ -1,7 +1,8 @@ import unittest
 from test.data.generate_data import QUERY_ENUMS, QUERY_SG_ID
-from metamist.graphql import validate, configure_sync_client
+
 from api.graphql.schema import schema  # type: ignore
+from metamist.graphql import configure_sync_client, validate
 class ValidateGenerateDataQueries(unittest.TestCase):
diff --git a/test/test_generic_auditor.py b/test/test_generic_auditor.py
index 69cb6a324..491c97d05 100644
--- a/test/test_generic_auditor.py
+++ b/test/test_generic_auditor.py
@@ -1,3 +1,5 @@
+# noqa: B006
+
 import unittest
 import unittest.mock
 from collections import namedtuple
@@ -5,8 +7,6 @@ from metamist.audit.generic_auditor import GenericAuditor
-# noqa: B006
-
 class TestGenericAuditor(unittest.TestCase):
     """Test the audit helper functions"""
diff --git a/test/test_get_participants.py b/test/test_get_participants.py
index 195470165..48680611f 100644
--- a/test/test_get_participants.py
+++ b/test/test_get_participants.py
@@ -1,4 +1,5 @@ from test.testbase import DbIsolatedTest, run_as_sync
+
 from db.python.layers.participant import ParticipantLayer
 from models.models.participant import ParticipantUpsertInternal
diff --git a/test/test_metamist.py b/test/test_metamist.py
index aeb6cab5f..809a5b8ca 100644
--- a/test/test_metamist.py
+++ b/test/test_metamist.py
@@ -1,4 +1,5 @@ import unittest
+
 from metamist.models import AssayUpsert
diff --git a/test/test_models.py b/test/test_models.py
index 061754b1f..2f395a5c8 100644
--- a/test/test_models.py
+++ b/test/test_models.py
@@ -3,8 +3,8 @@ from models.models import (
     ParticipantUpsert,
     ParticipantUpsertInternal,
-    SampleUpsertInternal,
     SampleUpsert,
+    SampleUpsertInternal,
 )
 from models.utils.sample_id_format import sample_id_format
diff --git a/test/test_parse_file_map.py b/test/test_parse_file_map.py
index 6b4ecea59..22e7b700f 100644
--- a/test/test_parse_file_map.py
+++ b/test/test_parse_file_map.py
@@ -1,6 +1,6 @@ from io import StringIO
-from unittest.mock import patch
 from test.testbase import DbIsolatedTest, run_as_sync
+from unittest.mock import patch
 from metamist.parser.generic_parser import ParsedParticipant
 from metamist.parser.sample_file_map_parser import SampleFileMapParser
diff --git a/test/test_sequencing_groups.py b/test/test_sequencing_groups.py
index d1006b98b..3139f13c0 100644
--- a/test/test_sequencing_groups.py
+++ b/test/test_sequencing_groups.py
@@ -1,11 +1,12 @@ from test.testbase import DbIsolatedTest, run_as_sync
-from db.python.utils import GenericFilter
+
+from db.python.layers import SampleLayer, SequencingGroupLayer
 from db.python.tables.sequencing_group import SequencingGroupFilter
-from db.python.layers import SequencingGroupLayer, SampleLayer
+from db.python.utils import GenericFilter
 from models.models import (
-    SequencingGroupUpsertInternal,
     AssayUpsertInternal,
     SampleUpsertInternal,
+    SequencingGroupUpsertInternal,
 )
diff --git a/test/test_update_participant_family.py b/test/test_update_participant_family.py
index 23315e222..e8517dd91 100644
--- a/test/test_update_participant_family.py
+++ b/test/test_update_participant_family.py
@@ -1,9 +1,10 @@ from test.testbase import DbIsolatedTest, run_as_sync
+
 from pymysql.err import IntegrityError
-from models.models import ParticipantUpsertInternal
-from db.python.layers.participant import ParticipantLayer
 from db.python.layers.family import FamilyLayer
+from db.python.layers.participant import ParticipantLayer
+from models.models import ParticipantUpsertInternal
 class TestParticipantFamily(DbIsolatedTest):
diff --git a/test/test_upsert.py b/test/test_upsert.py
index 0e4c1e78a..02d46a378 100644
--- a/test/test_upsert.py
+++ b/test/test_upsert.py
@@ -1,12 +1,10 @@ from test.testbase import DbIsolatedTest, run_as_sync
-from db.python.layers.participant import (
-    ParticipantLayer,
-)
+from db.python.layers.participant import ParticipantLayer
+from models.models.assay import AssayUpsertInternal
 from models.models.participant import ParticipantUpsertInternal
 from models.models.sample import SampleUpsertInternal
 from models.models.sequencing_group import SequencingGroupUpsertInternal
-from models.models.assay import AssayUpsertInternal
 default_assay_meta = {
     'sequencing_type': 'genome',
diff --git a/tob_metadata_harmonisation.py b/tob_metadata_harmonisation.py
index 1008719af..4a259297e 100644
--- a/tob_metadata_harmonisation.py
+++ b/tob_metadata_harmonisation.py
@@ -9,12 +9,10 @@ """
 from typing import Dict
-from metamist.graphql import gql, query
-
 from metamist.apis import AssayApi
+from metamist.graphql import gql, query
 from metamist.models import AssayUpsert
-
 SG_QUERY = gql(
     """
 query MyQuery($active_status: Boolean!) {
diff --git a/web/package.json b/web/package.json
index 69ffc33f6..40e260617 100644
--- a/web/package.json
+++ b/web/package.json
@@ -1,6 +1,6 @@ {
     "name": "metamist",
-    "version": "6.6.2",
+    "version": "6.7.0",
     "private": true,
     "dependencies": {
         "@apollo/client": "^3.7.3",