diff --git a/src/ingest-pipeline/airflow/dags/cwl/portal-containers b/src/ingest-pipeline/airflow/dags/cwl/portal-containers index 7e22a888..a9f03dc9 160000 --- a/src/ingest-pipeline/airflow/dags/cwl/portal-containers +++ b/src/ingest-pipeline/airflow/dags/cwl/portal-containers @@ -1 +1 @@ -Subproject commit 7e22a8884bb3e59fc31db01275002b81598aa068 +Subproject commit a9f03dc918d86ecb3154d0e79796451ddaa72a41 diff --git a/src/ingest-pipeline/airflow/dags/cwl/salmon-rnaseq b/src/ingest-pipeline/airflow/dags/cwl/salmon-rnaseq index ce04e2cf..07f43493 160000 --- a/src/ingest-pipeline/airflow/dags/cwl/salmon-rnaseq +++ b/src/ingest-pipeline/airflow/dags/cwl/salmon-rnaseq @@ -1 +1 @@ -Subproject commit ce04e2cf5cd180448eb6107806b20d867d0411c6 +Subproject commit 07f434937a27f0159030b86d9c22efaf8a1d0553 diff --git a/src/ingest-pipeline/airflow/dags/cwl/sprm b/src/ingest-pipeline/airflow/dags/cwl/sprm index 3bc79b4a..d9d98c65 160000 --- a/src/ingest-pipeline/airflow/dags/cwl/sprm +++ b/src/ingest-pipeline/airflow/dags/cwl/sprm @@ -1 +1 @@ -Subproject commit 3bc79b4abb37243f0b8ff48d8ecdf95882f0b199 +Subproject commit d9d98c65542bd15472a504b08d8f4411c23afc2c diff --git a/src/ingest-pipeline/airflow/dags/salmon_rnaseq.py b/src/ingest-pipeline/airflow/dags/salmon_rnaseq.py index 61eb814c..40819b3a 100644 --- a/src/ingest-pipeline/airflow/dags/salmon_rnaseq.py +++ b/src/ingest-pipeline/airflow/dags/salmon_rnaseq.py @@ -320,9 +320,15 @@ def get_salmon_dag_params(assay: str) -> SequencingDagParameters: SequencingDagParameters( dag_id="salmon_rnaseq_10x", pipeline_name="salmon-rnaseq", - assay="10x", + assay="10x_v3", dataset_type="salmon_rnaseq_10x", ), + SequencingDagParameters( + dag_id="salmon_rnaseq_10x_sn", + pipeline_name="salmon-rnaseq", + assay="10x_v3_sn", + dataset_type="salmon_rnaseq_10x_sn", + ), get_salmon_dag_params("sciseq"), get_salmon_dag_params("slideseq"), get_salmon_dag_params("snareseq"), diff --git a/src/ingest-pipeline/airflow/dags/sc_atac_seq.py b/src/ingest-pipeline/airflow/dags/sc_atac_seq.py index 1796f462..244f3bee 100644 --- a/src/ingest-pipeline/airflow/dags/sc_atac_seq.py +++ b/src/ingest-pipeline/airflow/dags/sc_atac_seq.py @@ -79,6 +79,7 @@ def build_cwltool_cmd1(**kwargs): command = [ *get_cwltool_base_cmd(tmpdir), + cwl_workflows[0], "--assay", params.assay, "--outdir", diff --git a/src/ingest-pipeline/airflow/dags/workflow_map.yml b/src/ingest-pipeline/airflow/dags/workflow_map.yml index 3b09203d..8c4e0069 100644 --- a/src/ingest-pipeline/airflow/dags/workflow_map.yml +++ b/src/ingest-pipeline/airflow/dags/workflow_map.yml @@ -58,7 +58,7 @@ workflow_map: 'workflow': 'sc_atac_seq_sn' - 'collection_type': 'generic_metadatatsv' 'assay_type': 'snRNAseq' - 'workflow': 'salmon_rnaseq_10x' + 'workflow': 'salmon_rnaseq_10x_sn' - 'collection_type': 'bulkatacseq_collection' 'assay_type': 'ATACseq-bulk' 'workflow': 'bulk_atacseq' diff --git a/src/ingest-pipeline/misc/tools/metadata_tsv_splitter.ipynb b/src/ingest-pipeline/misc/tools/metadata_tsv_splitter.ipynb index 16833ea7..9d9c6f10 100644 --- a/src/ingest-pipeline/misc/tools/metadata_tsv_splitter.ipynb +++ b/src/ingest-pipeline/misc/tools/metadata_tsv_splitter.ipynb @@ -9,7 +9,13 @@ "import numpy as np\n", "import pandas as pd\n", "import os\n", - "import csv" + "import csv\n", + "from pathlib import Path\n", + "from pprint import pprint\n", + "from collections import defaultdict\n", + "from datetime import datetime\n", + "from urllib.parse import urlparse, parse_qs\n", + "from io import StringIO" ] }, { @@ -25,15 +31,25 @@ 
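
Note on the hunks above: the sc_atac_seq.py change fixes a real omission (without cwl_workflows[0] the assembled cwltool command line never named the workflow to run), and the workflow_map.yml change is what actually routes snRNAseq datasets to the new salmon_rnaseq_10x_sn DAG. A minimal sketch of that routing, assuming entries shaped like the ones in the hunk; lookup_workflow is an illustrative helper, not the pipeline's real lookup code:

    from typing import Optional
    import yaml  # PyYAML

    def lookup_workflow(map_path: str, collection_type: str, assay_type: str) -> Optional[str]:
        """Return the DAG registered for the first matching workflow_map entry, or None."""
        with open(map_path) as f:
            entries = yaml.safe_load(f)['workflow_map']
        for entry in entries:
            if (entry['collection_type'] == collection_type
                    and entry['assay_type'] == assay_type):
                return entry['workflow']
        return None  # no workflow registered for this combination

    # After this change:
    # lookup_workflow('workflow_map.yml', 'generic_metadatatsv', 'snRNAseq')
    # yields 'salmon_rnaseq_10x_sn' instead of 'salmon_rnaseq_10x'.
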
"metadata": {}, "outputs": [], "source": [ - "input_dir = \"/home/welling/git/hubmap/ingest-pipeline/src/ingest-pipeline/md/florida_10x_md\"\n", - "in_fname = os.path.join(input_dir, \"UFLA_10XscRNAseq_metadata_050520_-_UFLA_10XscRNAseq_metadata_050520.tsv\")" + "# Where the constructed tree of metadata files will go\n", + "build_tree_root = Path(\"/home/welling/git/hubmap/ingest-pipeline/src/ingest-pipeline/misc/tools/build_tree_root\")" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "Read the records" + "input_dir = \"/home/welling/git/hubmap/ingest-pipeline/src/ingest-pipeline/md/all_md\"\n", + "#base_path = Path(input_dir) / 'UCSD_R2_snRNAseq'\n", + "#base_path = Path(input_dir) / 'UFLA_R2_10X'\n", + "#base_path = Path(input_dir) / 'UFLA_R2_CODEX'\n", + "#base_path = Path(input_dir) / 'STAN_R2_snRNA'\n", + "base_path = Path(input_dir) / 'CALT_R2_sciATAC'\n", + "#in_fname = os.path.join(input_dir, \"UFLA_CODEX_LY_Metadata_110920.tsv\")\n", + "#for path in base_path.glob('**/*.xlsx'):\n", + "# print(path.stem)" ] }, { @@ -42,20 +58,36 @@ "metadata": {}, "outputs": [], "source": [ - "recs = []\n", - "with open(in_fname, 'r', newline='') as f:\n", - " dialect = csv.Sniffer().sniff(f.read(128))\n", - " f.seek(0)\n", - " reader = csv.DictReader(f, dialect=dialect)\n", - " for row in reader:\n", - " recs.append({k : v for k, v in row.items()})" + "df_d = {}\n", + "metadata_df = None\n", + "for path in base_path.glob('**/*.xlsx'):\n", + " print(path)\n", + " df = pd.read_excel(path)\n", + " true_stem = Path(path.stem)\n", + " while Path(true_stem.stem) != true_stem:\n", + " true_stem = Path(true_stem.stem)\n", + " df_d[true_stem] = df\n", + " if 'assay_type' in df.columns:\n", + " if metadata_df is None:\n", + " metadata_df = df.copy()\n", + " else:\n", + " metadata_df = metadata_df.append(df)\n", + " print(f'{true_stem} -> {df_d[true_stem].columns}')\n", + "\n", + "# special logic needed to straighten out CALT sciATAC\n", + "for key in df_d:\n", + " if 'contributors' in str(key):\n", + " df_d[Path('contributors')] = df_d[key]\n", + " break" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], "source": [ - "Check what we've read. Beware of columns named '' or ' ' - they indicate a trailing tab and thus an empty column in the input file!" 
+    "assert metadata_df is not None, \"metadata file not found\""
   ]
  },
  {
@@ -64,7 +96,7 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "print(reader.fieldnames)"
+    "metadata_df.columns"
   ]
  },
  {
@@ -73,7 +105,14 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "print(recs[0])"
+    "metadata_df"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### Begin block of mapping data used to guess uuids from tissue display ids ###"
   ]
  },
  {
@@ -82,7 +121,160 @@
   "metadata": {},
   "outputs": [],
   "source": [
-    "print(dialect.delimiter) # should be tab"
+    "samp_to_uuid_map = {}\n",
+    "stan_snrna_uuid_map = { # maps sample TMC ID to uuid\n",
+    "    \"STAN0008-LI-1-1\":\"a04d0138ed6b28810c5afa01d392bbd5\", # misplaced\n",
+    "    \"STAN0008-LI-2-1\":\"a078805198f9f7f022b83de898a608a9\", # misplaced\n",
+    "    \"STAN0008-LI-3-1\":\"c3d36358b184be55ac977abea5755447\", # misplaced\n",
+    "    \"STAN0008-LI-4-1\":\"57288d8a0a9374ea83f90582df8eafa2\", # misplaced\n",
+    "    \"STAN0007-LI-1-1\":\"f1937797246fa4592bff6166d7666de5\", # misplaced\n",
+    "    \"STAN0007-LI-3-1\":\"e18254c67c8a0bb625f3748a2501a0bb\", # misplaced\n",
+    "    \"STAN0007-LI-4-1\":\"492574f47224661fe8674f60373e44f4\", # misplaced\n",
+    "    \"STAN0007-LI-2-1\":\"da93581ef554e25ec1c7a12500a56b74\", # misplaced\n",
+    "}\n",
+    "samp_to_uuid_map.update(stan_snrna_uuid_map)\n",
+    "calt_sciatacseq_txt = \"\"\"\n",
+    "0bf827ea01b64963d39a10cac69bc386,CALT0006-HT-2\n",
+    "2d4d2f368c6f74cc3aa17177924003b8,CALT0012-HT-1\n",
+    "48da185436f006156d7e5c1941bfb147,CALT0005-SP-1\n",
+    "58ebb89caf1512e9452d1f9e0e1efa8e,CALT0003-HT\n",
+    "616a1aa904dfb1299f86910db2a20fbe,CALT0011-LV-1\n",
+    "76bfd5a517c681e5f672fecff2057111,CALT0012-HT-2\n",
+    "8ea82dc9f26bb2c01f19ddd19b3812b6,CALT0004-PA-2\n",
+    "8f6b8e19c21a664d67a467c3a08b5630,CALT0003-HT-2\n",
+    "93cc8c450db50a224dce243a43131d3c,CALT0010-HT-1\n",
+    "a0df02bda8befa491f86b0d41f2810ed,CALT0005-RL-1\n",
+    "a6a7f2b0b419aefb6f8ffb9bfa9ce7d5,CALT0004-RL-1\n",
+    "acaf75b8292db4a79dc14e3021742217,CALT0005-HT-1\n",
+    "ad26d1046084c5640f911a84e5cd0cee,CALT0003-HT-5\n",
+    "b2db3414cedf8805d20df3cf753842ca,CALT0011-HT-1\n",
+    "bc19b2d489ddef9e135a67bcc9746695,CALT0006-PA-1\n",
+    "bd435ed6aa55e9f57d783ce630d746bf,CALT0003-HT-3\n",
+    "bf88e07da70ee088e31c7f568e41b196,CALT0011-HT-2\n",
+    "d4fc9da8a21cbb323d55f38983fb3dbb,CALT0006-HT-1\n",
+    "dd39ed081ffc887d85fc8225c71b37dc,CALT0009-HT-1\n",
+    "e4b371ea3ed4c3ca77791b34b829803f,CALT0004-HT-1\n",
+    "ead5cc01250b4f9ea73dd91503c313a5,CALT0007-HT-1\n",
+    "eb4958e8b5dd073e8a4a80bd613b2d64,CALT0009-LV-1\n",
+    "f1b9f55b12e16d1e11a5ebbd863b5787,CALT0005-PA-1\n",
+    "\"\"\"\n",
+    "\n",
+    "calt_sciatacseq_map = {}\n",
+    "for line in StringIO(calt_sciatacseq_txt):\n",
+    "    words = line.strip().split(',')\n",
+    "    if len(words) == 2:\n",
+    "        uuid, samp = words\n",
+    "        calt_sciatacseq_map[samp] = uuid\n",
+    "samp_to_uuid_map.update(calt_sciatacseq_map)\n",
+    "pprint(samp_to_uuid_map)"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "### End block of mapping data used to guess uuids from tissue display ids ###"
+   ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": [
+    "## The following block produces inv_uuid_map, used for the special case of UCSD snRNAseq data ##"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "scrolled": true,
+    "tags": [
+     "hide-input"
+    ]
+   },
+   "outputs": [],
+   "source": [
+    "#\n",
+    "# /tmp/junk.txt below was produced by running 'find -type f -print' on the HIVE host\n",
+    "# and grepping for 'contributors'\n",
+    "#\n",
+ 
"#uuid_map = {}\n", + "#for line in open('/tmp/junk.txt'):\n", + "# words = line.strip().split('/')\n", + "# uuid_map[words[1]] = words[-1]\n", + "uuid_map = {\n", + " '0487454555924b54dd3f5b5232e3c77e': 'BUKMAP_20190529L_10X-R_contributors.tsv',\n", + " '05197e30394fd88affff0a9c214c8c4c': 'LAPMAP_20200317J_10X-R_contributors.tsv',\n", + " '065295e6e58b1d3555a261c1bfe3b3fe': 'LAPMAP_20200317M_10X-R_contributors.tsv',\n", + " '06ff98c01295ca5ea504a676f73f9a09': 'BUKMAP_20200304B_10X-R_contributors.tsv',\n", + " '0736735768692d6ca0cd96149b743be1': 'LAPMAP_20200317N_10X-R_contributors.tsv',\n", + " '0c3ce767d87527f41fd705fd469390a0': 'BUKMAP_20200302A_10X-R_contributors.tsv',\n", + " '17be751d961c5baf6c1dbe2e70c5d93c': 'BUKMAP_20191104B_10X-R_contributors.tsv',\n", + " '1b83223cf1f4446c625adbfb375ab3fd': 'BUKMAP_20200205D_10X-R_contributors.tsv',\n", + " '24eaa9730abe57c1c22f74573b846a6f': 'BUKMAP_20191029_10X-R_contributors.tsv',\n", + " '26b642ddbae00e7ff6570ddd57557e26': 'LAPMAP_20200317I_10X-R_contributors.tsv',\n", + " '2d27debfce3d25040af54fb77b25427b': 'BUKMAP_20200707A_10X-R_contributors.tsv',\n", + " '3b1490026022f850e4d3c3fb5e2283c9': 'LAPMAP_20191217E_10X-R_contributors.tsv',\n", + " '3fe18ec025f612ca2c5308d4c234da50': 'LAPMAP_20200317H_10X-R_contributors.tsv',\n", + " '46e8ffd2350efd19f771c6fb6a51f6cc': 'BUKMAP_20200304F_10X-R_contributors.tsv',\n", + " '488f364142c308a9692e0b529f6697dd': 'BUKMAP_20190822F_10X-R_contributors.tsv',\n", + " '4bef8fa6eab2d3eb8734bf418c0634ef': 'BUKMAP_20190829B_10X-R_contributors.tsv',\n", + " '4ea7a4cf1a6ff0df0cc33c1236633112': 'BUKMAP_20200205F_10X-R_contributors.tsv',\n", + " '68e6dfa4807ca615883f73a5067115cb': 'LAPMAP_20200317L_10X-R_contributors.tsv',\n", + " '69a0ada10f4f119f99ce5f66cf3b1a94': 'BUKMAP_20200702C_10X-R_contributors.tsv',\n", + " '6a75230d8d1063fcc8568537212211f5': 'BUKMAP_20200302B_10X-R_contributors.tsv',\n", + " '8e5c8f0cc61aad4fcbc5cc119bdf4c96': 'LAPMAP_20200317K_10X-R_contributors.tsv',\n", + " '9049f48b97dc5edc737b67783a47e918': 'BUKMAP_20200702D_10X-R_contributors.tsv',\n", + " '99c5c80509be87d2356d19a9ed8b22ff': 'BUKMAP_20191104A_10X-R_contributors.tsv',\n", + " 'a8652e9e3c545e61e1ffe9d54a8f1fd2': 'LAPMAP_20191217G_10X-R_contributors.tsv',\n", + " 'c686b93a809ec1f54a0d96bc25d3d207': 'BUKMAP_20190607L_10X-R_contributors.tsv',\n", + " 'cd887a6beabc794992876ad7ee591f69': 'BUKMAP_20200304A_10X-R_contributors.tsv',\n", + " 'cfc125d6d916f121e92a8406a0502a38': 'BUKMAP_20200707C_10X-R_contributors.tsv',\n", + " 'ec88a6b161dce97a2361b1479c69a036': 'BUKMAP_20191009_10X-R_contributors.tsv',\n", + " 'f1b130f1200ae1fabe56cb506245490c': 'BUKMAP_20191010_10X-R_contributors.tsv',\n", + " 'fa6d9c732c7f239422ec6b357136fcd4': 'BUKMAP_20200707B_10X-R_contributors.tsv',\n", + " 'fd0c0fcde5a331c9dfff52b520c7d792': 'BUKMAP_20200205A_10X-R_contributors.tsv'\n", + "}\n", + "pprint(uuid_map)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "scrolled": true, + "tags": [ + "hide-input" + ] + }, + "outputs": [], + "source": [ + "# Checking that we can use filenames as unique keys\n", + "counts = defaultdict(int)\n", + "for key, val in uuid_map.items():\n", + " counts[val] += 1\n", + "for key, val in counts.items():\n", + " print(f'{key}: {val}')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [ + "hide-input" + ] + }, + "outputs": [], + "source": [ + "inv_uuid_map = {val:key for key, val in uuid_map.items()}" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + 
"source": [ + "## End block of special-purpose support code for UCSD snRNAseq ##" ] }, { @@ -102,21 +294,46 @@ " return s and len(s) == 32 and all([c in '0123456789abcdef' for c in list(s)])\n", "\n", "def get_uuid(s):\n", - " words = s.split('/')\n", - " while words:\n", - " if is_uuid(words[0]):\n", - " return words[0]\n", + " if s.startswith(('http:', 'https:')):\n", + " parsed = urlparse(s)\n", + " if parsed.netloc == 'app.globus.org':\n", + " origin_path = parse_qs(urlparse(s).query)['origin_path'][0]\n", + " return get_uuid(origin_path)\n", " else:\n", - " words = words[1:]\n", - " return None" + " raise RuntimeError(f'Unrecognized URL {s}')\n", + " else:\n", + " words = s.split('/')\n", + " while words:\n", + " if is_uuid(words[0]):\n", + " return words[0]\n", + " else:\n", + " words = words[1:]\n", + "\n", + "#for idx, row in metadata_df.iterrows():\n", + "# print(get_uuid(row['data_path']))" ] }, { - "cell_type": "markdown", + "cell_type": "code", + "execution_count": null, "metadata": {}, + "outputs": [], + "source": [ + "def reformat_datetime(dt_str):\n", + " return datetime.fromisoformat(dt_str).strftime(\"%Y-%m-%d %H:%M\")" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], "source": [ - "Write the output single-line metadata.tsv files.\n", - "*Output is written to the directory in which this notebook is running.*" + "def get_true_stem(some_path):\n", + " true_stem = Path(some_path.stem)\n", + " while true_stem != Path(true_stem.stem):\n", + " true_stem = Path(true_stem.stem)\n", + " return true_stem" ] }, { @@ -125,18 +342,80 @@ "metadata": {}, "outputs": [], "source": [ - "for rawrec in recs:\n", - " rec = rawrec.copy() # don't stomp on original\n", - " #display(rec)\n", - " uuid = get_uuid(rec['data_path'])\n", - " out_fname = '{}-metadata.tsv'.format(uuid)\n", - " print(out_fname)\n", - " rec['metadata_path'] = '.'\n", - " rec['data_path'] = '.'\n", - " with open(out_fname, 'w', newline='') as f:\n", - " writer = csv.DictWriter(f, dialect=dialect, fieldnames=reader.fieldnames)\n", - " writer.writeheader()\n", - " writer.writerow(rec)" + "def fix_antibodies_df(df):\n", + " \"\"\"\n", + " This adds columns to get the antibodies dataframe past the current versions of the\n", + " antibodies.tsv table schema.\n", + " \"\"\"\n", + " column_names = [elt for elt in df.columns]\n", + " assert 'conjugated_cat_number' in column_names, 'conjugated_cat_number is not present'\n", + " offset = [idx for idx, val in enumerate(column_names) if val == 'conjugated_cat_number'][0]\n", + " new_column_names = (['version']\n", + " + column_names[:offset]\n", + " + ['concentration_value', 'concentration_unit']\n", + " + column_names[offset:])\n", + " new_column_names = (column_names[:offset]\n", + " + column_names[offset:])\n", + " print(new_column_names)\n", + " out_df = df.copy().reindex(columns=new_column_names)\n", + " #out_df['version'] = 2\n", + " return out_df\n", + "\n", + "#print([k for k in df_d])\n", + "#display(fix_antibodies_df(df_d[Path('UFLA_antibodies_121120')]).head())" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "assert not build_tree_root.exists(), f'delete or move aside {build_tree_root}'\n", + "build_tree_root.mkdir()\n", + "for idx, row in metadata_df.iterrows():\n", + " c_p = row['contributors_path']\n", + " if row['contributors_path'] in inv_uuid_map:\n", + " uuid = inv_uuid_map[row['contributors_path']]\n", + " elif row['tissue_id'] in samp_to_uuid_map:\n", 
+ " uuid = samp_to_uuid_map[row['tissue_id']]\n", + " else:\n", + " uuid = get_uuid(row['data_path'])\n", + " if not uuid:\n", + " print(f'No uuid found for record {idx}')\n", + " continue\n", + " print(f'row {idx} -> {uuid}')\n", + " uuid_path = build_tree_root / uuid\n", + " uuid_path.mkdir()\n", + " path_str = row['contributors_path']\n", + " if path_str.startswith('/'): # common error\n", + " path_str = path_str[1:]\n", + " contributors_path = Path(path_str)\n", + " if 'antibodies_path' in row:\n", + " path_str = row['antibodies_path']\n", + " if path_str.startswith('/'): # common error\n", + " path_str = path_str[1:]\n", + " antibodies_path = Path(path_str)\n", + " row['antibodies_path'] = str(Path('extras').joinpath(antibodies_path))\n", + " else:\n", + " antibodies_path = None\n", + " print(contributors_path.stem)\n", + " print([k for k in df_d])\n", + " assert get_true_stem(contributors_path) in df_d, f\"Cannot find contributors dataframe {contributors_path}\"\n", + " row['contributors_path'] = str(Path('extras').joinpath(contributors_path))\n", + " row['data_path'] = '.'\n", + " for col in metadata_df.columns:\n", + " if col.endswith('_datetime'):\n", + " row[col] = reformat_datetime(str(row[col]))\n", + " row_df = pd.DataFrame([row])\n", + " row_df.to_csv(uuid_path / f'{uuid}-metadata.tsv', header=True, sep='\\t', index=False)\n", + " (uuid_path / 'extras').mkdir()\n", + " df_d[get_true_stem(contributors_path)].to_csv(uuid_path / row['contributors_path'],\n", + " header=True, sep='\\t', index=False)\n", + " if antibodies_path:\n", + " df = df_d[get_true_stem(antibodies_path)]\n", + " fix_antibodies_df(df).to_csv(uuid_path / row['antibodies_path'],\n", + " header=True, sep='\\t', index=False)\n" ] }, { @@ -145,6 +424,62 @@ "metadata": {}, "outputs": [], "source": [] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### This block should get reintegrated into the flow ###" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "recs = []\n", + "with open(in_fname, 'r', newline='') as f:\n", + " dialect = csv.Sniffer().sniff(f.read(128))\n", + " f.seek(0)\n", + " reader = csv.DictReader(f, dialect=dialect)\n", + " for row in reader:\n", + " recs.append({k : v for k, v in row.items()})" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Check what we've read. Beware of columns named '' or ' ' - they indicate a trailing tab and thus an empty column in the input file!" 
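
Stepping back to the tree-building loop above: each dataset ends up as build_tree_root/&lt;uuid&gt;/&lt;uuid&gt;-metadata.tsv plus an extras/ subdirectory holding the contributors tsv (and the antibodies tsv when one exists). A small sketch for eyeballing the result, using the build_tree_root defined at the top of the notebook:

    # Print each generated uuid directory and its files, e.g.
    #   <uuid> -> ['<uuid>-metadata.tsv', 'extras/..._contributors.tsv']
    for uuid_path in sorted(build_tree_root.iterdir()):
        files = sorted(str(p.relative_to(uuid_path))
                       for p in uuid_path.rglob('*') if p.is_file())
        print(f'{uuid_path.name} -> {files}')
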
+ ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(reader.fieldnames)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(recs[0])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "print(dialect.delimiter) # should be tab" + ] } ], "metadata": { diff --git a/src/ingest-pipeline/misc/tools/new_dataset_survey.py b/src/ingest-pipeline/misc/tools/new_dataset_survey.py index f29febdb..1fca60f6 100755 --- a/src/ingest-pipeline/misc/tools/new_dataset_survey.py +++ b/src/ingest-pipeline/misc/tools/new_dataset_survey.py @@ -75,7 +75,8 @@ def main(): main """ parser = argparse.ArgumentParser() - parser.add_argument("uuid_txt", help="input files containing uuids") + parser.add_argument("uuid_txt", + help="input .txt file containing uuids or .csv or .tsv file with uuid column") parser.add_argument("--out", help="name of the output .tsv file", required=True) args = parser.parse_args() auth_tok = input('auth_tok: ') diff --git a/src/ingest-pipeline/misc/tools/post_validation_cleanup.sh b/src/ingest-pipeline/misc/tools/post_validation_cleanup.sh new file mode 100755 index 00000000..c8ef299d --- /dev/null +++ b/src/ingest-pipeline/misc/tools/post_validation_cleanup.sh @@ -0,0 +1,42 @@ +#!/bin/bash -ex + +uuid=$1 +echo $uuid +echo `basename "$PWD"` +if [[ `basename "$PWD"` != $uuid ]]; then + echo "run this from the $uuid directory" + exit -1 +else + if [ -e validation_report.txt ] ; then + if [[ `cat validation_report.txt` == 'No errors!' ]] ; then + rm validation_report.txt + else + echo "Validation report is not clean" + exit -1 + fi + else + echo "No validation report found" + exit -1 + fi + pushd extras + for fname in *metadata.tsv.orig *contributors.tsv.orig ; do + if [ -e ${fname} ] ; then + rm $fname + else + echo "nothing to remove for ${fname}" + fi + done + for fname in *.fastq ; do + if [ ! -e ${fname} ] ; then + echo 'no fastq files found' + break + fi + if [ -e ../${fname}.gz ] ; then + rm $fname + fi + done + popd +fi + + + diff --git a/src/ingest-pipeline/misc/tools/restructure_dataset.sh b/src/ingest-pipeline/misc/tools/restructure_dataset.sh new file mode 100755 index 00000000..8243a245 --- /dev/null +++ b/src/ingest-pipeline/misc/tools/restructure_dataset.sh @@ -0,0 +1,38 @@ +#!/bin/bash -ex + +uuid=$1 +echo $uuid +echo `basename "$PWD"` +if [[ `basename "$PWD"` != $uuid ]]; then + echo "run this from the $uuid directory" + exit -1 +else + mkdir -p extras + for fname in *metadata.tsv *contributors.tsv ; do + if [ -e ${fname} ] ; then + mv $fname extras/${fname}.orig + else + echo "nothing to move for ${fname}" + fi + done + for fname in *.fastq ; do + if [ ! -e ${fname} ] ; then + echo 'no fastq files found' + break + fi + if [ -e ${fname}.gz ] ; then + mv $fname extras + else + gzip $fname + fi + done +fi +metafiles_tar="/tmp/build_tree.tar" +pushd .. 
+for fname in `tar -tf $metafiles_tar | grep $uuid` ; do
+    echo $fname
+    tar -xvf $metafiles_tar $fname
+done
+popd
+
+
diff --git a/src/ingest-pipeline/misc/tools/survey.py b/src/ingest-pipeline/misc/tools/survey.py
index 5cf75498..671be57b 100755
--- a/src/ingest-pipeline/misc/tools/survey.py
+++ b/src/ingest-pipeline/misc/tools/survey.py
@@ -167,7 +167,7 @@ def describe(self, prefix='', file=sys.stdout):
             self.kids[kid].describe(prefix=prefix+'    ', file=file)
 
 
-    def build_rec(self):
+    def build_rec(self, include_all_children=False):
         """
         Returns a dict containing:
 
@@ -180,6 +180,9 @@ def build_rec(self):
             QA_child.data_types[0] (verifying there is only 1 entry)
             QA_child.status (which must be QA or Published)
             note
+
+        If include_all_children=True, all child datasets are included rather
+        than just those that are QA or Published.
         """
         rec = {'uuid': self.uuid, 'display_doi': self.display_doi, 'status': self.status,
                'group_name': self.group_name}
@@ -201,20 +204,26 @@ def build_rec(self):
                 rec['sample_display_doi'] = samp.display_doi
             else:
                 rec['sample_display_doi'] = 'multiple'
-        qa_kids = [self.kids[uuid] for uuid in self.kids if self.kids[uuid].status in ['QA', 'Published']]
-        if any(qa_kids):
-            if len(qa_kids) > 1:
-                rec['note'] = 'Multiple QA derived datasets'
-            this_kid = qa_kids[0]
-            rec['qa_child_uuid'] = this_kid.uuid
-            rec['qa_child_display_doi'] = this_kid.display_doi
-            rec['qa_child_data_type'] = this_kid.data_types[0]
-            rec['qa_child_status'] = this_kid.status
+        if include_all_children:
+            filtered_kids = list(self.kids.values())
+            uuid_hdr, doi_hdr, data_type_hdr, status_hdr, multi_note = ('child_uuid', 'child_display_doi',
+                                                                        'child_data_type', 'child_status',
+                                                                        'Multiple derived datasets')
+        else:
+            filtered_kids = [kid for kid in self.kids.values() if kid.status in ['QA', 'Published']]
+            uuid_hdr, doi_hdr, data_type_hdr, status_hdr, multi_note = ('qa_child_uuid', 'qa_child_display_doi',
+                                                                        'qa_child_data_type', 'qa_child_status',
+                                                                        'Multiple QA derived datasets')
+        if filtered_kids:
+            rec['note'] = multi_note if len(filtered_kids) > 1 else ''
+            this_kid = filtered_kids[0]
+            rec[uuid_hdr] = this_kid.uuid
+            rec[doi_hdr] = this_kid.display_doi
+            rec[data_type_hdr] = this_kid.data_types[0]
+            rec[status_hdr] = this_kid.status
         else:
-            rec['qa_child_uuid'] = None
-            rec['qa_child_display_doi'] = None
-            rec['qa_child_data_type'] = None
-            rec['qa_child_status'] = None
+            for key in [uuid_hdr, doi_hdr, data_type_hdr, status_hdr]:
+                rec[key] = None
             rec['note'] = ''
         return rec
@@ -299,8 +308,10 @@ def main():
     main
     """
     parser = argparse.ArgumentParser()
-    parser.add_argument("metadatatsv", help="input .tsv or .xlsx file")
+    parser.add_argument("metadatatsv", help="input .tsv or .xlsx file, or a list of uuids in a .txt file")
     parser.add_argument("--out", help="name of the output .tsv file", required=True)
+    parser.add_argument("--include_all_children", action="store_true",
+                        help="include all children, not just those in the QA or Published states")
     args = parser.parse_args()
     auth_tok = input('auth_tok: ')
     entity_factory = EntityFactory(auth_tok)
@@ -308,6 +319,13 @@ def main():
         in_df = pd.read_csv(args.metadatatsv, sep='\t')
     elif args.metadatatsv.endswith('.xlsx'):
         in_df = pd.read_excel(args.metadatatsv)
+    elif args.metadatatsv.endswith('.txt'):
+        # a list of bare uuids, one per line
+        recs = []
+        for line in open(args.metadatatsv):
+            assert is_uuid(line.strip()), f'text file {args.metadatatsv} contains non-uuid {line.strip()}'
+            recs.append({'data_path': line.strip()})
+        in_df = 
pd.DataFrame(recs) else: raise RuntimeError('Unrecognized input file format') in_df['uuid'] = in_df.apply(get_uuid, axis=1) @@ -319,7 +337,7 @@ def main(): ds = entity_factory.get(uuid) ds.describe() new_uuids = ds.all_uuids() - rec = ds.build_rec() + rec = ds.build_rec(include_all_children=args.include_all_children) if any([uuid in known_uuids for uuid in new_uuids]): old_note = rec['note'] if 'note' in rec else '' rec['note'] = 'UUID COLLISION! ' + old_note @@ -328,9 +346,13 @@ def main(): out_df = pd.DataFrame(out_recs).rename(columns={'sample_display_doi':'sample_doi', 'sample_hubmap_display_id':'sample_display_id', 'qa_child_uuid':'derived_uuid', + 'child_uuid':'derived_uuid', 'qa_child_display_doi':'derived_doi', + 'child_display_doi':'derived_doi', 'qa_child_data_type':'derived_data_type', - 'qa_child_status':'derived_status'}) + 'child_data_type':'derived_data_type', + 'qa_child_status':'derived_status', + 'child_status':'derived_status'}) out_df.to_csv(args.out, sep='\t', index=False)
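
One subtlety in the final rename above: it maps both the qa_child_* and child_* column families onto the same derived_* names. That is safe only because build_rec emits exactly one family per run; if both ever coexisted, pandas rename would quietly produce duplicate column labels rather than raising. A quick illustration with a hypothetical two-column frame:

    import pandas as pd

    df = pd.DataFrame([{'qa_child_uuid': 'a', 'child_uuid': 'b'}])
    out = df.rename(columns={'qa_child_uuid': 'derived_uuid',
                             'child_uuid': 'derived_uuid'})
    print(list(out.columns))  # ['derived_uuid', 'derived_uuid'] -- duplicated label
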