Skip to content

Commit

Permalink
Added ccf-releases migration code
Browse files Browse the repository at this point in the history
  • Loading branch information
bherr2 committed Oct 27, 2023
1 parent bda874c commit c708708
Show file tree
Hide file tree
Showing 9 changed files with 576 additions and 4 deletions.
15 changes: 12 additions & 3 deletions src/cli.js
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,8 @@ import { newDraft } from './drafting/new-draft.js';
import { enrich } from './enrichment/enrich.js';
import { finalize } from './finalizing/finalize.js';
import { list } from './list.js';
import { migrateLandmarks } from './migration/ccf-landmarks/migrate.js';
import { migrateCcfLandmarks } from './migration/ccf-landmarks/migrate.js';
import { migrateCcfReleases } from './migration/ccf-releases/migrate.js';
import { normalize } from './normalization/normalize.js';
import { getContext, getProcessorVersion, parseDirectory } from './utils/context.js';
import { error } from './utils/logging.js';
Expand Down Expand Up @@ -100,9 +101,17 @@ program

program
.command('migrate-ccf-landmarks')
.description('Migrate ccf landmarks to HRA Digital Object')
.description('Migrate ccf landmarks to HRA Digital Object format')
.action((_options, command) => {
migrateLandmarks(getContext(program, command));
migrateCcfLandmarks(getContext(program, command));
});

// Registers the `migrate-ccf-releases` subcommand. The <ccf-releases-path> argument is run
// through parseDirectory, and the action hands it to migrateCcfReleases as `ccfReleasesPath`,
// merged into the object returned by getContext(program, command).
program
.command('migrate-ccf-releases')
.description('Migrate ccf releases to HRA Digital Object format')
.argument('<ccf-releases-path>', 'Path to the ccf-releases repository checked out locally', parseDirectory)
.action((ccfReleasesPath, _options, command) => {
migrateCcfReleases({ ...getContext(program, command), ccfReleasesPath });
});

program
Expand Down
2 changes: 1 addition & 1 deletion src/migration/ccf-landmarks/migrate.js
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ import { getLandmarkMetadata } from './renaming.js';
const CROSSWALK_HEADER = ['extraction_set_for', 'extraction_set_id', 'extraction_set_label', 'node_name', 'label'];
const SOURCE_DATA_URL = 'https://raw.githubusercontent.com/hubmapconsortium/hubmap-ontology/master/source_data';

export async function migrateLandmarks(context) {
export async function migrateCcfLandmarks(context) {
const extractionSiteUrls = (await fetchCsv(`${SOURCE_DATA_URL}/extraction-site-config.csv`)).map((row) => row.object);
const fullCrosswalk = await fetchCsv(`${SOURCE_DATA_URL}/asct-b-3d-models-landmarks.csv`, 10);

Expand Down
26 changes: 26 additions & 0 deletions src/migration/ccf-releases/2d-ftu-lookup.csv
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
id,representation_of,label,old_id,organ_label,organ_id
kidney-ascending-thin-loop-of-henle,UBERON:0004193,loop of Henle ascending limb thin segment,#FTUAscendingThinLimb
kidney-cortical-collecting-duct,UBERON:0004203,Cortical Collecting Duct,#FTUCorticalCollectingDuct
kidney-descending-thin-loop-of-henle,UBERON:0001289,descending limb of loop of Henle,#FTUDescendingThinLimb
kidney-inner-medullary-collecting-duct,UBERON:0004205,inner medullary collecting duct,#FTUInnerMedullaryCollectingDuct
kidney-nephron,UBERON:0001285,nephron,#FTUNephron,Kidney,UBERON:0002113
kidney-outer-medullary-collecting-duct,UBERON:0004204,outer medullary collecting duct,#FTUOuterMedullaryCollectingDuct
kidney-renal-corpuscle,UBERON:0001229,renal corpuscle,#FTURenalCorpuscle,Kidney,UBERON:0002113
kidney-thick-ascending-loop-of-henle,UBERON:0001291,thick ascending limb of loop of Henle,#FTUThickAscendingLimb
large-intestine-crypt-lieberkuhn,UBERON:0001984,crypt of Lieberkuhn of large intestine,#FTUCryptOfLieberkuhn,Large Intestine,UBERON:0000059
liver-liver-lobule,UBERON:0004647,liver lobule,#FTULiverLobule_inset1,Liver,UBERON:0002107
liver-liver-lobule,UBERON:0004647,liver lobule,#FTULiverLobule_inset2,Liver,UBERON:0002107
lung-bronchial-submucosal-gland,UBERON:8410043,bronchus submucosal gland,#FTUBronchialSubmucosalGland
lung-pulmonary-alveolus,UBERON:0002299,alveolus of lung,#FTUAlveoli,Lung,UBERON:0002048
pancreas-intercalated-duct,UBERON:0014726,intercalated duct of pancreas,#FTUIntercalatedDuct
pancreas-islets-langerhans,UBERON:0000006,islet of Langerhans,#FTUIsletOfLangerhans,Pancreas,UBERON:0001264
pancreas-pancreatic-acinus,UBERON:0001263,pancreatic acinus,#FTUAcinus
prostate-prostate-glandular-acinus,UBERON:0004179,prostate glandular acinus,#FTUProstateGlandularAcinus,Prostate Gland,UBERON:0002367
skin-dermal-papilla,UBERON:0001992,papillary layer of dermis,#FTUDermalPapilla
skin-epidermal-ridge,UBERON:0013487,epidermal ridge of digit,#FTUEpidermalRidge
spleen-red-pulp,UBERON:0001250,red pulp of spleen,#FTURedPulp_Inset1
spleen-red-pulp,UBERON:0001250,red pulp of spleen,#FTURedPulp_Inset2
spleen-white-pulp,UBERON:0001959,white pulp of spleen,#FTUWhitePulp_Inset1
spleen-white-pulp,UBERON:0001959,white pulp of spleen,#FTUWhitePulp_Inset2
thymus-thymus-lobule,UBERON:0002125,thymus lobule,#FTUThymusLobule_Inset1,Thymus,UBERON:0002370
thymus-thymus-lobule,UBERON:0002125,thymus lobule,#FTUThymusLobule_Inset2,Thymus,UBERON:0002370
38 changes: 38 additions & 0 deletions src/migration/ccf-releases/hra-metadata.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
title: Human Reference Atlas (HRA)
description: 'Human Reference Atlas (HRA) <https://humanatlas.io>'
creators:
- fullName: Katy Börner
firstName: Katy
lastName: Börner
orcid: 0000-0002-3321-6137
project_leads:
- fullName: Katy Börner
firstName: Katy
lastName: Börner
orcid: 0000-0002-3321-6137
reviewers:
- fullName: Ellen M. Quardokus
firstName: Ellen
lastName: Quardokus
orcid: 0000-0001-7655-4833
externalReviewers: []
creation_date: '2022-05-06'
license: >-
Creative Commons Attribution 4.0 International ([CC BY
4.0](https://creativecommons.org/licenses/by/4.0/))
publisher: HuBMAP
funders:
- funder: National Institutes of Health
awardNumber: OT2OD026671
hubmapId: HBM248.CBJV.556
doi: https://doi.org/10.48539/HBM248.CBJV.556
citation: >-
Sanjay Jain; M. Todd Valerius; Yongqun He, HuBMAP ASCT+B Tables. Kidney v1.2
[https://doi.org/10.48539/HBM248.CBJV.556](https://doi.org/10.48539/HBM248.CBJV.556)
citationOverall: >-
Quardokus, Ellen, Bruce W. Herr II, Lisel Record, Katy Börner. 2022. [*HuBMAP
ASCT+B
Tables*](https://hubmapconsortium.github.io/ccf/pages/ccf-anatomical-structures.html).
Accessed on May 6, 2022.
datatable:
- digital-objects.yaml
179 changes: 179 additions & 0 deletions src/migration/ccf-releases/md-parser.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,179 @@
import { readFileSync } from 'fs';
import { basename } from 'path';

// Lookup table consulted at the end of HraMarkdownParser.getName(): when the name derived
// from the markdown file name appears here as a key, the mapped value is used instead;
// names without an entry are returned unchanged.
const NAME_REMAPPING = {
'asctb-3d-models-crosswalk': 'asct-b-3d-models-crosswalk',
'asctb-crosswalk': 'asct-b-2d-models-crosswalk',
'bone-marrow-pelvis': 'bonemarrow-pelvis',
'intestine-large': 'large-intestine',
'ln-ibex': '1-human-lymph-node-ibex',
'lymph-node-ibex': '1-human-lymph-node-ibex',
'intestines-codex': '2-intestine-codex',
'kidney-codex': '3-kidney-codex',
'skin-celldive': '4-skin-cell-dive',
'liver-sim': '5-liver-sims',
'pancreas-codex': '6-pancreas-codex',
'lung-celldive': '7-lung-cell-dive',
'intestine-large-male': 'large-intestine-male',
'intestine-large-female': 'large-intestine-female',
'vasculature-male': 'blood-vasculature-male',
'vasculature-female': 'blood-vasculature-female',
vasculature: 'blood-vasculature',
brain: 'allen-brain',
'bone-marrow-and-blood': 'bonemarrow-pelvis',
};

/**
 * Reads one release markdown file and exposes its contents as digital object metadata.
 *
 * Metadata is looked up in table rows of the form `| **Key:** | value |`; the first
 * line of the file supplies the title and version.
 */
export class HraMarkdownParser {
  /**
   * @param {string} inputFile - Path of the markdown file; it is read synchronously.
   */
  constructor(inputFile) {
    this.inputFile = inputFile;
    this.rawMd = readFileSync(inputFile)
      .toString()
      .replace(/&ouml;/g, 'ö')
      .trim()
      .split('\n');
  }

  /**
   * @param {string} key - Metadata label without the surrounding `**` and `:`.
   * @returns {boolean} True when some line contains `**key:**`.
   */
  hasKey(key) {
    return this.rawMd.some((line) => line.includes(`**${key}:**`));
  }

  /**
   * @param {string} key - Metadata label to look up.
   * @returns {string} The trimmed value cell of the first matching row, or '' when the
   *   key is absent or the row has no value cell.
   */
  getMetadata(key) {
    const row = this.rawMd.find((line) => line.includes(`**${key}:**`));
    if (row === undefined) {
      return '';
    }
    // `| **Key:** | value |` splits into ['', ' **Key:** ', ' value ', ''].
    return (row.split('|')[2] || '').trim();
  }

  /**
   * @param {string} key - Metadata label to look up.
   * @returns {string[]} The value split on `;` or `,`, each part trimmed.
   */
  getMultiValue(key) {
    return this.getMetadata(key)
      .replace(/&ouml;/g, 'ö')
      .split(/[;,] */)
      .map((value) => value.trim());
  }

  /**
   * Formats a date as `Mon D, YYYY` (three letter month, day without leading zero).
   *
   * Date-only ISO strings are parsed by `Date` as UTC midnight, so they are formatted
   * with the UTC getters; formatting them in local time would yield the previous day in
   * time zones west of UTC. Other inputs are parsed and formatted in local time.
   *
   * @param {string} dateStr - Date text, e.g. `2022-05-06`.
   * @returns {string} The formatted date, or '' when the text is not a valid date.
   */
  getAccessedDate(dateStr) {
    const months = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun', 'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec'];
    const text = String(dateStr == null ? '' : dateStr).trim();
    const date = new Date(text);
    if (Number.isNaN(date.getTime())) {
      return '';
    }
    const isDateOnlyIso = /^\d{4}(-\d{2}){0,2}$/.test(text);
    const [month, day, year] = isDateOnlyIso
      ? [date.getUTCMonth(), date.getUTCDate(), date.getUTCFullYear()]
      : [date.getMonth(), date.getDate(), date.getFullYear()];
    return `${months[month]} ${day}, ${year}`;
  }

  /**
   * Pairs the names listed under one key with the ORCIDs listed under another, by position.
   *
   * @param {string} nameKey - Label of the row listing the names.
   * @param {string} orcidKey - Label of the row listing the ORCIDs.
   * @returns {{fullName: string, firstName: string, lastName: string, orcid: (string|undefined)}[]}
   *   One entry per name; [] when either row is missing.
   */
  getAuthors(nameKey, orcidKey) {
    if (!this.hasKey(nameKey) || !this.hasKey(orcidKey)) {
      return [];
    }
    const names = this.getMultiValue(nameKey);
    const orcids = this.getMultiValue(orcidKey).map((entry) => {
      // Use the link text of `[id](url)`; a bare id is kept whole instead of being truncated.
      const linkText = /\[([^\]]*)\]/.exec(entry);
      return (linkText ? linkText[1] : entry).trim();
    });
    return names.map((fullName, index) => ({
      fullName,
      firstName: fullName.split(/ +/)[0],
      // A trailing " II" suffix is dropped before taking the last word.
      lastName: fullName.replace(/ II$/, '').split(/ +/).slice(-1)[0],
      orcid: orcids[index],
    }));
  }

  /**
   * Pairs the funders listed under one key with the award numbers listed under another.
   *
   * @param {string} funderKey - Label of the row listing the funders.
   * @param {string} awardKey - Label of the row listing the award numbers.
   * @returns {{funder: string, awardNumber: (string|undefined)}[]} One entry per funder;
   *   [] when the funder row is missing (mirrors getAuthors).
   */
  getFunders(funderKey, awardKey) {
    if (!this.hasKey(funderKey)) {
      return [];
    }
    const funders = this.getMultiValue(funderKey);
    const awards = this.getMultiValue(awardKey);

    return funders.map((funder, index) => ({
      funder,
      awardNumber: awards[index],
    }));
  }

  /**
   * Derives the digital object name from the markdown file name.
   *
   * @returns {string} The normalized name, after applying NAME_REMAPPING.
   */
  getName() {
    let name = basename(this.inputFile, '.md')
      .replace(this.getDoType() + '-', '')
      .replace(/^3d-/, '')
      .replace(/^vh-/, '')
      .replace(/^f-/, 'female-')
      .replace(/^m-/, 'male-')
      .replace(/-l$/, '-left')
      .replace(/-r$/, '-right')
      .replace(/-mapping$/, '-crosswalk');

    // 'female' has to be tested first because it contains 'male'.
    let sex;
    if (name.includes('female')) {
      sex = 'female';
    } else if (name.includes('male')) {
      sex = 'male';
    }
    if (sex) {
      const hasLaterality = name.endsWith('-left') || name.endsWith('-right');
      const elts = name.split('-').filter((s) => s !== sex);

      // Format for reference organs = ${organ}-${sex}-${laterality "optional"}
      if (hasLaterality) {
        name = `${elts.slice(0, -1).join('-')}-${sex}-${elts.slice(-1).join('-')}`;
      } else {
        name = `${elts.join('-')}-${sex}`;
      }
    }

    return NAME_REMAPPING[name] || name;
  }

  /**
   * @returns {string} The first line without its first character and without its last
   *   space separated token, with a trailing comma removed.
   */
  getTitle() {
    return this.rawMd[0].slice(1).trim().split(' ').slice(0, -1).join(' ').trim().replace(/,$/, '');
  }

  /**
   * @returns {string} The last space separated token of the first line.
   */
  getVersion() {
    return this.rawMd[0].slice(1).trim().split(' ').slice(-1)[0];
  }

  /**
   * @returns {string} The trimmed line directly after the `### Description` heading, or ''
   *   when the heading is missing or is the last line.
   */
  getDescription() {
    const headingIndex = this.rawMd.findIndex((line) => line.startsWith('### Description'));
    if (headingIndex === -1) {
      return '';
    }
    return (this.rawMd[headingIndex + 1] || '').trim();
  }

  /**
   * @returns {string} The label of the "How to Cite" row that is not the "Overall" one,
   *   stripped of `*` and `:`; '' when there is no such row.
   */
  getHowToCiteKey() {
    const row = this.rawMd.find((line) => line.includes('**How to Cite') && !line.includes('Overall:**'));
    if (row === undefined) {
      return '';
    }
    return (row.split('|')[1] || '').trim().replace(/\*/g, '').replace(/:/g, '');
  }

  /**
   * @returns {string} The label of the "How to Cite ... Overall" row, stripped of `*` and
   *   `:`; '' when there is no such row.
   */
  getHowToCiteOverallKey() {
    const row = this.rawMd.find((line) => line.includes('**How to Cite') && line.includes('Overall:**'));
    if (row === undefined) {
      return '';
    }
    return (row.split('|')[1] || '').trim().replace(/\*/g, '').replace(/:/g, '');
  }

  /**
   * @returns {string} The name of the directory holding the input file, with
   *   `ref-organs` rewritten to `ref-organ`.
   */
  getDoType() {
    return this.inputFile.split('/').slice(-2)[0].replace('ref-organs', 'ref-organ');
  }

  /**
   * @returns {string} `type/name/version` for this digital object.
   */
  getDoString() {
    return [this.getDoType(), this.getName(), this.getVersion()].join('/');
  }

  /**
   * Collects all parsed metadata into one plain object.
   *
   * @returns {object} The metadata record; values missing from the markdown are '' or [].
   */
  toJson() {
    const creationDate = this.getMetadata('Creation Date') || this.getMetadata('Date');

    // The DOI is normally a markdown link; fall back to the raw cell instead of throwing.
    const doiCell = this.getMetadata('DOI');
    const doiLink = /\[([^\]]*)\]/.exec(doiCell);

    const citeKey = this.getHowToCiteKey();
    const citeOverallKey = this.getHowToCiteOverallKey();

    return {
      title: this.getTitle(),
      description: this.getDescription(),

      // The ORCID rows are labelled inconsistently across files, so each variant is tried.
      creators: [
        ...this.getAuthors('Creator(s)', 'Creator ORCID(s)'),
        ...this.getAuthors('Creator(s)', 'Creator ORCID'),
      ],
      project_leads: this.getAuthors('Project Lead', 'Project Lead ORCID'),
      reviewers: [
        ...this.getAuthors('Reviewer(s)', 'Reviewers ORCID(s)'),
        ...this.getAuthors('Reviewer(s)', 'Reviewer ORCID(s)'),
        ...this.getAuthors('Internal Reviewer(s)', 'Internal Reviewer ORCID(s)'),
      ],
      externalReviewers: this.getAuthors('External Reviewer(s)', 'External Reviewer ORCID(s)'),

      creation_date: creationDate,
      creation_year: creationDate.split('-')[0],
      accessed_date: this.getAccessedDate(creationDate),

      license: this.getMetadata('License'),
      publisher: this.getMetadata('Publisher'),
      funders: this.getFunders('Funder', 'Award Number'),
      hubmapId: this.getMetadata('HuBMAP ID'),
      dataTable: this.getMetadata('Data Table') || this.getMetadata('3D Data') || this.getMetadata('2D Data'),
      doi: doiLink ? doiLink[1] : doiCell,
      citation: citeKey ? this.getMetadata(citeKey) : '',
      citationOverall: citeOverallKey ? this.getMetadata(citeOverallKey) : '',
    };
  }
}
103 changes: 103 additions & 0 deletions src/migration/ccf-releases/migrate.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,103 @@
import { existsSync, writeFileSync } from 'fs';
import { dump } from 'js-yaml';
import { resolve } from 'path';
import sh from 'shelljs';
import { HraMarkdownParser } from './md-parser.js';
import { split2dFtuCrosswalk } from './split-2d-ftu-crosswalk.js';
import { splitRefOrganCrosswalk } from './split-ref-organ-crosswalk.js';

/**
 * Wraps a value in single quotes for use as one argument of a POSIX shell command.
 *
 * @param {string} value - Text to quote.
 * @returns {string} The quoted text, with embedded single quotes escaped.
 */
function shellQuote(value) {
  return `'${String(value).replace(/'/g, "'\\''")}'`;
}

/**
 * Removes `suffix` from the end of `text`; text without that ending is returned unchanged.
 *
 * @param {string} text - Text to shorten.
 * @param {string} suffix - Ending to remove.
 * @returns {string} The text without the suffix.
 */
function stripSuffix(text, suffix) {
  return text.endsWith(suffix) ? text.slice(0, -suffix.length) : text;
}

/**
 * Writes one digital object into `<doHome>/<type>/<name>/<version>/raw`: copies (and, for
 * .zip/.bz2/.7z files, unpacks) the data files linked from the markdown and writes the
 * parsed metadata to `metadata.yaml`.
 *
 * @param {object} context - Provides `doHome` (output root) and `ccfReleasesPath` (input root).
 * @param {HraMarkdownParser} md - Parser for the digital object's markdown file.
 */
function writeDigitalObject(context, md) {
  const data = md.toJson();
  const doType = md.getDoType();
  const doName = md.getName();

  // Write out metadata.yaml
  const yamlDir = resolve(context.doHome, doType, doName, md.getVersion(), 'raw');
  sh.mkdir('-p', yamlDir);

  // match() returns null when the data table holds no https links, hence the fallback.
  // Each link is reduced to its last three path segments, relative to the releases checkout.
  const dataPaths = ((data.dataTable || '').match(/\(https:\/\/.*?\)/g) || []).map((u) =>
    u.slice(1, -1).split('/').slice(-3).join('/')
  );

  // Undefined values mark the keys that should not appear in the written metadata.
  Object.assign(data, {
    type: undefined,
    name: undefined,
    version: undefined,
    creation_year: undefined,
    accessed_date: undefined,
    dataTable: undefined,
    datatable: [],
  });

  for (const inputSrcPath of dataPaths) {
    let srcName = inputSrcPath.split('/').slice(-1)[0];
    const srcPath = resolve(context.ccfReleasesPath, inputSrcPath);
    let destPath = resolve(yamlDir, srcName);

    sh.cp(srcPath, destPath);

    // Only the trailing extension is removed, so a '.zip' (etc.) elsewhere in the absolute
    // path is left alone. Paths are quoted because they are interpolated into a shell command.
    if (srcPath.endsWith('.zip')) {
      srcName = stripSuffix(srcName, '.zip');
      destPath = stripSuffix(destPath, '.zip');
      sh.exec(`unzip -o ${shellQuote(srcPath)} -d ${shellQuote(yamlDir)} ${shellQuote(srcName)}`);
    } else if (srcPath.endsWith('.bz2')) {
      srcName = stripSuffix(srcName, '.bz2');
      destPath = stripSuffix(destPath, '.bz2');
      sh.exec(`bunzip2 -c ${shellQuote(srcPath)} > ${shellQuote(destPath)}`);
    } else if (srcPath.endsWith('.7z')) {
      srcName = stripSuffix(srcName, '.7z');
      destPath = stripSuffix(destPath, '.7z');
      sh.exec(`7z e -aoa ${shellQuote(srcPath)} -o${shellQuote(yamlDir)} ${shellQuote(srcName)}`);
    }

    data.datatable.push(srcName);
    // Report data files that are missing at the source or did not arrive at the destination.
    if (!existsSync(srcPath) || !existsSync(destPath)) {
      console.log(md.inputFile, doType, srcPath, destPath);
    }
  }

  if (!doName.includes('crosswalk') && (doType === 'ref-organ' || doType === '2d-ftu')) {
    data.datatable.push('crosswalk.csv');
  }

  writeFileSync(yamlDir + '/metadata.yaml', dump(data));
}

/**
 * Migrates every `v1.*` release found in the ccf-releases checkout.
 *
 * Each `markdown/<type>/<name>.md` file is written out as a digital object, then one
 * `collection/hra/<version>` collection is written per release and the 2D FTU and
 * reference organ crosswalks are split per digital object.
 *
 * @param {object} context - Provides `ccfReleasesPath`, `processorHome` and `doHome`.
 */
export function migrateCcfReleases(context) {
  const { ccfReleasesPath: inputDir } = context;
  const srcDir = resolve(context.processorHome, 'src/migration/ccf-releases');

  // Release version -> digital object strings, in the order the files are listed.
  const collections = {};
  sh.ls(resolve(inputDir, 'v1.*/markdown/*/*.md')).forEach((mdPath) => {
    // Trailing path segments: <parent>/<version>/markdown/<type>/<name>.md
    const segments = mdPath.split('/').slice(-5);
    const collectionVersion = segments[1];
    const type = segments[3];
    const name = segments[4].replace('.md', '');

    const parser = new HraMarkdownParser(resolve(inputDir, `${collectionVersion}/markdown/${type}/${name}.md`));
    writeDigitalObject(context, parser);

    if (!collections[collectionVersion]) {
      collections[collectionVersion] = [];
    }
    collections[collectionVersion].push(parser.getDoString());
  });

  // Digital object type prefix paired with the function that splits its crosswalk.
  const crosswalkSplitters = [
    ['2d-ftu/', split2dFtuCrosswalk],
    ['ref-organ/', splitRefOrganCrosswalk],
  ];

  Object.keys(collections).forEach((version) => {
    const digitalObjects = collections[version];
    const yamlDir = resolve(context.doHome, `collection/hra/${version}/raw`);
    sh.mkdir('-p', yamlDir);

    writeFileSync(yamlDir + '/digital-objects.yaml', dump({ 'digital-objects': digitalObjects }));
    sh.cp(resolve(srcDir, 'hra-metadata.yaml'), yamlDir + '/metadata.yaml');

    for (const [prefix, splitCrosswalk] of crosswalkSplitters) {
      const ofType = digitalObjects.filter((str) => str.startsWith(prefix));
      const crosswalk = ofType.find((str) => str.includes('crosswalk'));
      const targets = ofType.filter((str) => !str.includes('crosswalk'));
      targets.forEach((doString) => {
        splitCrosswalk(context, crosswalk, doString);
      });
    }
  });
}
Loading

0 comments on commit c708708

Please sign in to comment.