Skip to content

Commit

Permalink
Improvements to full mapping database build (#22)
Browse files Browse the repository at this point in the history
1. Add zenodo uploads
2. Write configuration to output directory
3. Demonstrate automated upload on protein complex landscape
4. Add automated upload to full database build
  • Loading branch information
cthoyt authored Apr 30, 2024
1 parent cc62889 commit 8888095
Show file tree
Hide file tree
Showing 23 changed files with 1,139 additions and 202 deletions.
3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,6 +40,9 @@ This software provides:
4. A confidence model granular at the curator-level,
mapping set-level, and community feedback-level

We also provide an accompanying raw semantic mapping database on Zenodo at
https://zenodo.org/records/11082039.

## 🚀 Installation

The most recent release can be installed from
Expand Down
14 changes: 7 additions & 7 deletions notebooks/landscape/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,10 +32,10 @@ resource, how many show up in all resources, and how many show up in a few

A summary chart over all landscapes can be generated with `landscape.py`.

| name | raw_term_count | unique_term_count | reduction |
|---------|---------------:|------------------:|----------:|
| disease | 410173 | 243730 | 0.405787 |
| anatomy | 37917 | 32108 | 0.153203 |
| complex | 15869 | 7775 | 0.510051 |
| gene | 4.94578e+07 | 4.87886e+07 | 0.013529 |
| cell | 207019 | 166274 | 0.196818 |
| name | raw_term_count | unique_term_count | reduction | download |
|---------|---------------:|------------------:|----------:|------------------------------------------------------------------------:|
| disease | 410,173 | 243,730 | 0.405787 | [zenodo.record:11091886](https://bioregistry.io/zenodo.record:11091886) |
| anatomy | 37,917 | 32,108 | 0.153203 | [zenodo.record:11091803](https://bioregistry.io/zenodo.record:11091803) |
| complex | 15,869 | 7,775 | 0.510051 | [zenodo.record:11091422](https://bioregistry.io/zenodo.record:11091422) |
| gene | 49,457,767 | ~48,788,600 | 0.013529 | [zenodo.record:11092013](https://bioregistry.io/zenodo.record:11092013) |
| cell | 207,019 | 166,274 | 0.196818 | [zenodo.record:11091581](https://bioregistry.io/zenodo.record:11091581) |
129 changes: 129 additions & 0 deletions notebooks/landscape/anatomy/configuration.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,129 @@
{
"name": "SeMRA Anatomy Mappings Database",
"description": "Supports the analysis of the landscape of anatomy nomenclature resources.",
"creators": [
{
"name": "Charles Tapley Hoyt",
"orcid": "0000-0003-4423-4370"
}
],
"inputs": [
{
"source": "biomappings"
},
{
"source": "gilda"
},
{
"source": "pyobo",
"prefix": "uberon",
"confidence": 0.99
},
{
"source": "pyobo",
"prefix": "bto",
"confidence": 0.99
},
{
"source": "pyobo",
"prefix": "caro",
"confidence": 0.99
},
{
"source": "pyobo",
"prefix": "mesh",
"confidence": 0.99
},
{
"source": "pyobo",
"prefix": "ncit",
"confidence": 0.99
},
{
"source": "pyobo",
"prefix": "umls",
"confidence": 0.99
}
],
"priority": [
"uberon",
"mesh",
"bto",
"caro",
"ncit",
"umls"
],
"mutations": [
{
"source": "uberon",
"confidence": 0.8
},
{
"source": "bto",
"confidence": 0.65
},
{
"source": "caro",
"confidence": 0.8
},
{
"source": "ncit",
"confidence": 0.7
},
{
"source": "umls",
"confidence": 0.7
}
],
"subsets": {
"mesh": [
"mesh:D001829",
"mesh:D009141",
"mesh:D004064",
"mesh:D012137",
"mesh:D014566",
"mesh:D004703",
"mesh:D002319",
"mesh:D009420",
"mesh:D012679",
"mesh:D014024",
"mesh:D005441",
"mesh:D000825",
"mesh:D013284",
"mesh:D006424",
"mesh:D004628",
"mesh:D034582",
"mesh:D018514",
"mesh:D056229",
"mesh:D056226",
"mesh:D056224"
],
"ncit": [
"ncit:C12219"
],
"umls": [
"sty:T024",
"sty:T017"
]
},
"keep_prefixes": [
"uberon",
"mesh",
"bto",
"caro",
"ncit",
"umls"
],
"remove_imprecise": false,
"raw_pickle_path": "/Users/cthoyt/.data/semra/case-studies/anatomy/raw.pkl",
"raw_sssom_path": "/Users/cthoyt/.data/semra/case-studies/anatomy/raw.sssom.tsv",
"processed_pickle_path": "/Users/cthoyt/.data/semra/case-studies/anatomy/processed.pkl",
"processed_sssom_path": "/Users/cthoyt/.data/semra/case-studies/anatomy/processed.sssom.tsv",
"processed_neo4j_path": "/Users/cthoyt/.data/semra/case-studies/anatomy/neo4j",
"processed_neo4j_name": "semra-anatomy",
"priority_pickle_path": "/Users/cthoyt/.data/semra/case-studies/anatomy/priority.pkl",
"priority_sssom_path": "/Users/cthoyt/.data/semra/case-studies/anatomy/priority.sssom.tsv",
"add_labels": false,
"configuration_path": "/Users/cthoyt/.data/semra/case-studies/anatomy/configuration.json",
"zenodo_record": 11091803
}
159 changes: 159 additions & 0 deletions notebooks/landscape/cell/configuration.json
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
{
"name": "SeMRA Cell and Cell Line Mappings Database",
"description": "Originally a reproduction of the EFO/Cellosaurus/DepMap/CCLE scenario posed in the Biomappings paper, this configuration imports several different cell and cell line resources and identifies mappings between them.",
"creators": [
{
"name": "Charles Tapley Hoyt",
"orcid": "0000-0003-4423-4370"
}
],
"inputs": [
{
"source": "biomappings"
},
{
"source": "gilda"
},
{
"source": "pyobo",
"prefix": "cellosaurus",
"confidence": 0.99
},
{
"source": "bioontologies",
"prefix": "bto",
"confidence": 0.99
},
{
"source": "bioontologies",
"prefix": "cl",
"confidence": 0.99
},
{
"source": "custom",
"prefix": "clo",
"confidence": 0.65
},
{
"source": "pyobo",
"prefix": "efo",
"confidence": 0.99
},
{
"source": "pyobo",
"prefix": "depmap",
"confidence": 0.99,
"extras": {
"version": "22Q4",
"standardize": true,
"license": "CC-BY-4.0"
}
},
{
"source": "pyobo",
"prefix": "ccle",
"confidence": 0.99,
"extras": {
"version": "2019"
}
},
{
"source": "pyobo",
"prefix": "ncit",
"confidence": 0.99
},
{
"source": "pyobo",
"prefix": "umls",
"confidence": 0.99
}
],
"priority": [
"mesh",
"efo",
"cellosaurus",
"ccle",
"depmap",
"bto",
"cl",
"clo",
"ncit",
"umls"
],
"mutations": [
{
"source": "efo",
"confidence": 0.7
},
{
"source": "bto",
"confidence": 0.7
},
{
"source": "cl",
"confidence": 0.7
},
{
"source": "clo",
"confidence": 0.7
},
{
"source": "depmap",
"confidence": 0.7
},
{
"source": "ccle",
"confidence": 0.7
},
{
"source": "cellosaurus",
"confidence": 0.7
},
{
"source": "ncit",
"confidence": 0.7
},
{
"source": "umls",
"confidence": 0.7
}
],
"subsets": {
"mesh": [
"mesh:D002477"
],
"efo": [
"efo:0000324"
],
"ncit": [
"ncit:C12508"
],
"umls": [
"sty:T025"
]
},
"keep_prefixes": [
"mesh",
"efo",
"cellosaurus",
"ccle",
"depmap",
"bto",
"cl",
"clo",
"ncit",
"umls"
],
"remove_imprecise": false,
"raw_pickle_path": "/Users/cthoyt/.data/semra/case-studies/cells/raw.pkl",
"raw_sssom_path": "/Users/cthoyt/.data/semra/case-studies/cells/raw.sssom.tsv",
"processed_pickle_path": "/Users/cthoyt/.data/semra/case-studies/cells/processed.pkl",
"processed_sssom_path": "/Users/cthoyt/.data/semra/case-studies/cells/processed.sssom.tsv",
"processed_neo4j_path": "/Users/cthoyt/.data/semra/case-studies/cells/neo4j",
"processed_neo4j_name": "semra-cell",
"priority_pickle_path": "/Users/cthoyt/.data/semra/case-studies/cells/priority.pkl",
"priority_sssom_path": "/Users/cthoyt/.data/semra/case-studies/cells/priority.sssom.tsv",
"add_labels": true,
"configuration_path": "/Users/cthoyt/.data/semra/case-studies/cells/configuration.json",
"zenodo_record": 11091581
}
Loading

0 comments on commit 8888095

Please sign in to comment.