diff --git a/README.md b/README.md index ea942ef..decefdf 100644 --- a/README.md +++ b/README.md @@ -40,6 +40,9 @@ This software provides: 4. A confidence model granular at the curator-level, mapping set-level, and community feedback-level +We also provide an accompanying raw semantic mapping database on Zenodo at +https://zenodo.org/records/11082039. + ## 🚀 Installation The most recent release can be installed from diff --git a/notebooks/landscape/README.md b/notebooks/landscape/README.md index 7b18a4e..10f9ea0 100644 --- a/notebooks/landscape/README.md +++ b/notebooks/landscape/README.md @@ -32,10 +32,10 @@ resource, how many show up in all resources, and how many show up in a few A summary chart over all landscapes can be generated with `landscape.py`. -| name | raw_term_count | unique_term_count | reduction | -|---------|---------------:|------------------:|----------:| -| disease | 410173 | 243730 | 0.405787 | -| anatomy | 37917 | 32108 | 0.153203 | -| complex | 15869 | 7775 | 0.510051 | -| gene | 4.94578e+07 | 4.87886e+07 | 0.013529 | -| cell | 207019 | 166274 | 0.196818 | \ No newline at end of file +| name | raw_term_count | unique_term_count | reduction | download | +|---------|---------------:|------------------:|----------:|------------------------------------------------------------------------:| +| disease | 410,173 | 243,730 | 0.405787 | [zenodo.record:11091886](https://bioregistry.io/zenodo.record:11091886) | +| anatomy | 37,917 | 32,108 | 0.153203 | [zenodo.record:11091803](https://bioregistry.io/zenodo.record:11091803) | +| complex | 15,869 | 7,775 | 0.510051 | [zenodo.record:11091422](https://bioregistry.io/zenodo.record:11091422) | +| gene | 49,457,767 | 48,788,600 | 0.013529 | [zenodo.record:11092013](https://bioregistry.io/zenodo.record:11092013) | +| cell | 207,019 | 166,274 | 0.196818 | [zenodo.record:11091581](https://bioregistry.io/zenodo.record:11091581) | diff --git a/notebooks/landscape/anatomy/configuration.json b/notebooks/landscape/anatomy/configuration.json new file mode 100644 index 0000000..f002bcd --- /dev/null +++ b/notebooks/landscape/anatomy/configuration.json @@ -0,0 +1,129 @@ +{ + "name": "SeMRA Anatomy Mappings Database", + "description": "Supports the analysis of the landscape of anatomy nomenclature resources.", + "creators": [ + { + "name": "Charles Tapley Hoyt", + "orcid": "0000-0003-4423-4370" + } + ], + "inputs": [ + { + "source": "biomappings" + }, + { + "source": "gilda" + }, + { + "source": "pyobo", + "prefix": "uberon", + "confidence": 0.99 + }, + { + "source": "pyobo", + "prefix": "bto", + "confidence": 0.99 + }, + { + "source": "pyobo", + "prefix": "caro", + "confidence": 0.99 + }, + { + "source": "pyobo", + "prefix": "mesh", + "confidence": 0.99 + }, + { + "source": "pyobo", + "prefix": "ncit", + "confidence": 0.99 + }, + { + "source": "pyobo", + "prefix": "umls", + "confidence": 0.99 + } + ], + "priority": [ + "uberon", + "mesh", + "bto", + "caro", + "ncit", + "umls" + ], + "mutations": [ + { + "source": "uberon", + "confidence": 0.8 + }, + { + "source": "bto", + "confidence": 0.65 + }, + { + "source": "caro", + "confidence": 0.8 + }, + { + "source": "ncit", + "confidence": 0.7 + }, + { + "source": "umls", + "confidence": 0.7 + } + ], + "subsets": { + "mesh": [ + "mesh:D001829", + "mesh:D009141", + "mesh:D004064", + "mesh:D012137", + "mesh:D014566", + "mesh:D004703", + "mesh:D002319", + "mesh:D009420", + "mesh:D012679", + "mesh:D014024", + "mesh:D005441", + "mesh:D000825", + "mesh:D013284", + "mesh:D006424", + "mesh:D004628", +
"mesh:D034582", + "mesh:D018514", + "mesh:D056229", + "mesh:D056226", + "mesh:D056224" + ], + "ncit": [ + "ncit:C12219" + ], + "umls": [ + "sty:T024", + "sty:T017" + ] + }, + "keep_prefixes": [ + "uberon", + "mesh", + "bto", + "caro", + "ncit", + "umls" + ], + "remove_imprecise": false, + "raw_pickle_path": "/Users/cthoyt/.data/semra/case-studies/anatomy/raw.pkl", + "raw_sssom_path": "/Users/cthoyt/.data/semra/case-studies/anatomy/raw.sssom.tsv", + "processed_pickle_path": "/Users/cthoyt/.data/semra/case-studies/anatomy/processed.pkl", + "processed_sssom_path": "/Users/cthoyt/.data/semra/case-studies/anatomy/processed.sssom.tsv", + "processed_neo4j_path": "/Users/cthoyt/.data/semra/case-studies/anatomy/neo4j", + "processed_neo4j_name": "semra-anatomy", + "priority_pickle_path": "/Users/cthoyt/.data/semra/case-studies/anatomy/priority.pkl", + "priority_sssom_path": "/Users/cthoyt/.data/semra/case-studies/anatomy/priority.sssom.tsv", + "add_labels": false, + "configuration_path": "/Users/cthoyt/.data/semra/case-studies/anatomy/configuration.json", + "zenodo_record": 11091803 +} \ No newline at end of file diff --git a/notebooks/landscape/cell/configuration.json b/notebooks/landscape/cell/configuration.json new file mode 100644 index 0000000..2e719b4 --- /dev/null +++ b/notebooks/landscape/cell/configuration.json @@ -0,0 +1,159 @@ +{ + "name": "SeMRA Cell and Cell Line Mappings Database", + "description": "Originally a reproduction of the EFO/Cellosaurus/DepMap/CCLE scenario posed in the Biomappings paper, this configuration imports several different cell and cell line resources and identifies mappings between them.", + "creators": [ + { + "name": "Charles Tapley Hoyt", + "orcid": "0000-0003-4423-4370" + } + ], + "inputs": [ + { + "source": "biomappings" + }, + { + "source": "gilda" + }, + { + "source": "pyobo", + "prefix": "cellosaurus", + "confidence": 0.99 + }, + { + "source": "bioontologies", + "prefix": "bto", + "confidence": 0.99 + }, + { + "source": "bioontologies", + "prefix": "cl", + "confidence": 0.99 + }, + { + "source": "custom", + "prefix": "clo", + "confidence": 0.65 + }, + { + "source": "pyobo", + "prefix": "efo", + "confidence": 0.99 + }, + { + "source": "pyobo", + "prefix": "depmap", + "confidence": 0.99, + "extras": { + "version": "22Q4", + "standardize": true, + "license": "CC-BY-4.0" + } + }, + { + "source": "pyobo", + "prefix": "ccle", + "confidence": 0.99, + "extras": { + "version": "2019" + } + }, + { + "source": "pyobo", + "prefix": "ncit", + "confidence": 0.99 + }, + { + "source": "pyobo", + "prefix": "umls", + "confidence": 0.99 + } + ], + "priority": [ + "mesh", + "efo", + "cellosaurus", + "ccle", + "depmap", + "bto", + "cl", + "clo", + "ncit", + "umls" + ], + "mutations": [ + { + "source": "efo", + "confidence": 0.7 + }, + { + "source": "bto", + "confidence": 0.7 + }, + { + "source": "cl", + "confidence": 0.7 + }, + { + "source": "clo", + "confidence": 0.7 + }, + { + "source": "depmap", + "confidence": 0.7 + }, + { + "source": "ccle", + "confidence": 0.7 + }, + { + "source": "cellosaurus", + "confidence": 0.7 + }, + { + "source": "ncit", + "confidence": 0.7 + }, + { + "source": "umls", + "confidence": 0.7 + } + ], + "subsets": { + "mesh": [ + "mesh:D002477" + ], + "efo": [ + "efo:0000324" + ], + "ncit": [ + "ncit:C12508" + ], + "umls": [ + "sty:T025" + ] + }, + "keep_prefixes": [ + "mesh", + "efo", + "cellosaurus", + "ccle", + "depmap", + "bto", + "cl", + "clo", + "ncit", + "umls" + ], + "remove_imprecise": false, + "raw_pickle_path": 
"/Users/cthoyt/.data/semra/case-studies/cells/raw.pkl", + "raw_sssom_path": "/Users/cthoyt/.data/semra/case-studies/cells/raw.sssom.tsv", + "processed_pickle_path": "/Users/cthoyt/.data/semra/case-studies/cells/processed.pkl", + "processed_sssom_path": "/Users/cthoyt/.data/semra/case-studies/cells/processed.sssom.tsv", + "processed_neo4j_path": "/Users/cthoyt/.data/semra/case-studies/cells/neo4j", + "processed_neo4j_name": "semra-cell", + "priority_pickle_path": "/Users/cthoyt/.data/semra/case-studies/cells/priority.pkl", + "priority_sssom_path": "/Users/cthoyt/.data/semra/case-studies/cells/priority.sssom.tsv", + "add_labels": true, + "configuration_path": "/Users/cthoyt/.data/semra/case-studies/cells/configuration.json", + "zenodo_record": 11091581 +} \ No newline at end of file diff --git a/notebooks/landscape/complex/configuration.json b/notebooks/landscape/complex/configuration.json new file mode 100644 index 0000000..607e877 --- /dev/null +++ b/notebooks/landscape/complex/configuration.json @@ -0,0 +1,98 @@ +{ + "name": "SeMRA Protein Complex Landscape Analysis", + "description": "Analyze the landscape of protein complex nomenclature resources, species-agnostic.", + "creators": [ + { + "name": "Charles Tapley Hoyt", + "orcid": "0000-0003-4423-4370" + } + ], + "inputs": [ + { + "source": "gilda" + }, + { + "source": "biomappings" + }, + { + "source": "pyobo", + "prefix": "fplx", + "confidence": 0.99 + }, + { + "source": "custom", + "prefix": "fplx", + "confidence": 0.99 + }, + { + "source": "custom", + "prefix": "intact_complexportal", + "confidence": 0.99 + }, + { + "source": "pyobo", + "prefix": "complexportal", + "confidence": 0.99 + }, + { + "source": "pyobo", + "prefix": "go", + "confidence": 0.99 + }, + { + "source": "wikidata", + "prefix": "complexportal", + "confidence": 0.99 + }, + { + "source": "wikidata", + "prefix": "reactome", + "confidence": 0.99 + } + ], + "priority": [ + "complexportal", + "fplx", + "go", + "chembl.target", + "wikidata", + "scomp", + "signor", + "intact" + ], + "mutations": [ + { + "source": "go", + "confidence": 0.95 + } + ], + "subsets": { + "go": [ + "go:0032991" + ] + }, + "post_keep_prefixes": [ + "complexportal", + "fplx", + "go", + "chembl.target", + "wikidata", + "scomp", + "signor", + "intact" + ], + "remove_imprecise": false, + "raw_pickle_path": "/Users/cthoyt/.data/semra/case-studies/complex/raw.pkl", + "raw_sssom_path": "/Users/cthoyt/.data/semra/case-studies/complex/raw.sssom.tsv", + "raw_neo4j_path": "/Users/cthoyt/.data/semra/case-studies/complex/neo4j_raw", + "raw_neo4j_name": "semra-complex", + "processed_pickle_path": "/Users/cthoyt/.data/semra/case-studies/complex/processed.pkl", + "processed_sssom_path": "/Users/cthoyt/.data/semra/case-studies/complex/processed.sssom.tsv", + "processed_neo4j_path": "/Users/cthoyt/.data/semra/case-studies/complex/neo4j", + "processed_neo4j_name": "semra-complex", + "priority_pickle_path": "/Users/cthoyt/.data/semra/case-studies/complex/priority.pkl", + "priority_sssom_path": "/Users/cthoyt/.data/semra/case-studies/complex/priority.sssom.tsv", + "add_labels": true, + "configuration_path": "/Users/cthoyt/.data/semra/case-studies/complex/configuration.json", + "zenodo_record": 11091422 +} \ No newline at end of file diff --git a/notebooks/landscape/disease/configuration.json b/notebooks/landscape/disease/configuration.json new file mode 100644 index 0000000..f796fe5 --- /dev/null +++ b/notebooks/landscape/disease/configuration.json @@ -0,0 +1,157 @@ +{ + "name": "SeMRA Disease Mappings Database", + 
"description": "Supports the analysis of the landscape of disease nomenclature resources.", + "creators": [ + { + "name": "Charles Tapley Hoyt", + "orcid": "0000-0003-4423-4370" + } + ], + "inputs": [ + { + "source": "biomappings" + }, + { + "source": "gilda" + }, + { + "source": "bioontologies", + "prefix": "doid", + "confidence": 0.99 + }, + { + "source": "bioontologies", + "prefix": "mondo", + "confidence": 0.99 + }, + { + "source": "bioontologies", + "prefix": "efo", + "confidence": 0.99 + }, + { + "source": "pyobo", + "prefix": "mesh", + "confidence": 0.99 + }, + { + "source": "bioontologies", + "prefix": "ncit", + "confidence": 0.85 + }, + { + "source": "pyobo", + "prefix": "umls", + "confidence": 0.9 + }, + { + "source": "bioontologies", + "prefix": "orphanet.ordo", + "confidence": 0.9 + } + ], + "priority": [ + "doid", + "mondo", + "efo", + "mesh", + "ncit", + "orphanet", + "orphanet.ordo", + "umls", + "omim", + "omim.ps", + "gard", + "icd10", + "icd10cm", + "icd10pcs", + "icd11", + "icd9", + "icd9cm", + "icdo" + ], + "mutations": [ + { + "source": "doid", + "confidence": 0.95 + }, + { + "source": "mondo", + "confidence": 0.95 + }, + { + "source": "efo", + "confidence": 0.9 + }, + { + "source": "ncit", + "confidence": 0.7 + }, + { + "source": "umls", + "confidence": 0.7 + }, + { + "source": "orphanet.ordo", + "confidence": 0.7 + }, + { + "source": "orphanet", + "confidence": 0.7 + } + ], + "subsets": { + "mesh": [ + "mesh:D007239", + "mesh:D001520", + "mesh:D011579", + "mesh:D001523", + "mesh:D004191" + ], + "efo": [ + "efo:0000408" + ], + "ncit": [ + "ncit:C2991" + ], + "umls": [ + "sty:T049", + "sty:T047", + "sty:T191", + "sty:T050", + "sty:T048" + ] + }, + "keep_prefixes": [ + "doid", + "mondo", + "efo", + "mesh", + "ncit", + "orphanet", + "orphanet.ordo", + "umls", + "omim", + "omim.ps", + "gard", + "icd10", + "icd10cm", + "icd10pcs", + "icd11", + "icd9", + "icd9cm", + "icdo" + ], + "remove_imprecise": false, + "raw_pickle_path": "/Users/cthoyt/.data/semra/case-studies/disease/raw.pkl", + "raw_sssom_path": "/Users/cthoyt/.data/semra/case-studies/disease/raw.sssom.tsv", + "processed_pickle_path": "/Users/cthoyt/.data/semra/case-studies/disease/processed.pkl", + "processed_sssom_path": "/Users/cthoyt/.data/semra/case-studies/disease/processed.sssom.tsv", + "processed_neo4j_path": "/Users/cthoyt/.data/semra/case-studies/disease/neo4j", + "processed_neo4j_name": "semra-disease", + "priority_pickle_path": "/Users/cthoyt/.data/semra/case-studies/disease/priority.pkl", + "priority_sssom_path": "/Users/cthoyt/.data/semra/case-studies/disease/priority.sssom.tsv", + "add_labels": true, + "configuration_path": "/Users/cthoyt/.data/semra/case-studies/disease/configuration.json", + "zenodo_record": 11091886 +} \ No newline at end of file diff --git a/notebooks/landscape/gene/configuration.json b/notebooks/landscape/gene/configuration.json new file mode 100644 index 0000000..c76639c --- /dev/null +++ b/notebooks/landscape/gene/configuration.json @@ -0,0 +1,132 @@ +{ + "name": "SeMRA Gene Mapping Database", + "description": "Analyze the landscape of gene nomenclature resources, species-agnostic.", + "creators": [ + { + "name": "Charles Tapley Hoyt", + "orcid": "0000-0003-4423-4370" + } + ], + "inputs": [ + { + "source": "pyobo", + "prefix": "hgnc", + "confidence": 0.99 + }, + { + "source": "pyobo", + "prefix": "mgi", + "confidence": 0.99 + }, + { + "source": "pyobo", + "prefix": "rgd", + "confidence": 0.99 + }, + { + "source": "pyobo", + "prefix": "cgnc", + "confidence": 0.99 + }, + { + 
"source": "pyobo", + "prefix": "sgd", + "confidence": 0.99 + }, + { + "source": "pyobo", + "prefix": "civic.gid", + "confidence": 0.99 + }, + { + "source": "pyobo", + "prefix": "flybase", + "confidence": 0.99 + }, + { + "source": "custom", + "prefix": "ncit_hgnc", + "confidence": 0.99 + }, + { + "source": "custom", + "prefix": "omim_gene", + "confidence": 0.99 + }, + { + "source": "wikidata", + "prefix": "ncbigene", + "confidence": 0.99 + }, + { + "source": "wikidata", + "prefix": "civic.gid", + "confidence": 0.99 + }, + { + "source": "wikidata", + "prefix": "ensembl", + "confidence": 0.99 + }, + { + "source": "wikidata", + "prefix": "hgnc", + "confidence": 0.99 + }, + { + "source": "wikidata", + "prefix": "omim", + "confidence": 0.99 + }, + { + "source": "wikidata", + "prefix": "umls", + "confidence": 0.99 + } + ], + "priority": [ + "ncbigene", + "hgnc", + "mgi", + "rgd", + "cgnc", + "wormbase", + "flybase", + "sgd", + "omim", + "civic.gid", + "umls", + "ncit", + "wikidata" + ], + "mutations": [ + { + "source": "umls", + "confidence": 0.8 + }, + { + "source": "ncit", + "confidence": 0.8 + } + ], + "subsets": { + "umls": [ + "umls:C0017337" + ], + "ncit": [ + "ncit:C16612" + ] + }, + "remove_imprecise": false, + "raw_pickle_path": "/Users/cthoyt/.data/semra/case-studies/gene/raw.pkl.gz", + "raw_sssom_path": "/Users/cthoyt/.data/semra/case-studies/gene/raw.sssom.tsv.gz", + "processed_pickle_path": "/Users/cthoyt/.data/semra/case-studies/gene/processed.pkl.gz", + "processed_sssom_path": "/Users/cthoyt/.data/semra/case-studies/gene/processed.sssom.tsv.gz", + "processed_neo4j_path": "/Users/cthoyt/.data/semra/case-studies/gene/neo4j", + "processed_neo4j_name": "semra-gene", + "priority_pickle_path": "/Users/cthoyt/.data/semra/case-studies/gene/priority.pkl.gz", + "priority_sssom_path": "/Users/cthoyt/.data/semra/case-studies/gene/priority.sssom.tsv.gz", + "add_labels": true, + "configuration_path": "/Users/cthoyt/.data/semra/case-studies/gene/configuration.json", + "zenodo_record": 11092013 +} \ No newline at end of file diff --git a/notebooks/landscape/landscape.py b/notebooks/landscape/landscape.py index 1283810..9caccc5 100644 --- a/notebooks/landscape/landscape.py +++ b/notebooks/landscape/landscape.py @@ -5,6 +5,8 @@ import pandas as pd +from semra import Configuration + HERE = Path(__file__).parent.resolve() @@ -14,15 +16,23 @@ def main() -> None: for directory in HERE.iterdir(): if not directory.is_dir(): continue - path = directory.joinpath("stats.json") - if not path.is_file(): + + row = {"name": directory.name} + + statistics_path = directory.joinpath("stats.json") + if not statistics_path.is_file(): continue - row = json.loads(path.read_text()) - row["name"] = directory.name + row.update(json.loads(statistics_path.read_text())) + + configuration_path = directory.joinpath("configuration.json") + configuration = Configuration.parse_file(configuration_path) + row["zenodo"] = configuration.zenodo_url() rows.append(row) df = pd.DataFrame(rows).set_index("name") - df = df[["raw_term_count", "unique_term_count", "reduction"]] - print(df.to_markdown(tablefmt="github")) + df = df[["raw_term_count", "unique_term_count", "reduction", "zenodo"]] + df["reduction"] = df["reduction"].map(lambda r: f"{r:.1%}") + df = df.astype(str) + print(df.to_latex(label="landscape-summary-table", caption="")) if __name__ == "__main__": diff --git a/setup.cfg b/setup.cfg index 98b319e..c0d66a2 100644 --- a/setup.cfg +++ b/setup.cfg @@ -65,6 +65,7 @@ install_requires = bioontologies pyobo typing_extensions + 
zenodo_client # Random options zip_safe = false diff --git a/src/semra/database.py b/src/semra/database.py index d684f64..6b0e3c3 100644 --- a/src/semra/database.py +++ b/src/semra/database.py @@ -1,58 +1,75 @@ """Assemble a database.""" -import pickle +import csv import time +import typing as t import bioregistry import click import pyobo import pystow +import requests from bioontologies.obograph import write_warned from bioontologies.robot import write_getter_warnings from tqdm.auto import tqdm from tqdm.contrib.logging import logging_redirect_tqdm - -from semra.io import from_bioontologies, from_pyobo, write_neo4j, write_pickle, write_sssom +from zenodo_client import Creator, Metadata, ensure_zenodo + +from semra import Mapping +from semra.io import ( + from_bioontologies, + from_pickle, + from_pyobo, + write_neo4j, + write_pickle, + write_sssom, +) +from semra.rules import CHARLIE_NAME, CHARLIE_ORCID from semra.sources import SOURCE_RESOLVER +from semra.sources.wikidata import get_wikidata_mappings_by_prefix MODULE = pystow.module("semra", "database") SOURCES = MODULE.module("sources") -DATABASE_PATH = MODULE.join(name="sssom.tsv") -WARNINGS_PATH = MODULE.join("logs", name="warnings.tsv") -ERRORS_PATH = MODULE.join("logs", name="errors.tsv") -SUMMARY_PATH = MODULE.join("logs", name="summary.tsv") -EMPTY_PATH = MODULE.join("logs", name="empty.txt") +LOGS = MODULE.module("logs") +SSSOM_PATH = MODULE.join(name="mappings.sssom.tsv.gz") +PICKLE_PATH = MODULE.join(name="mappings.pkl.gz") +WARNINGS_PATH = LOGS.join(name="warnings.tsv") +ERRORS_PATH = LOGS.join(name="errors.tsv") +SUMMARY_PATH = LOGS.join(name="summary.tsv") +EMPTY_PATH = LOGS.join(name="empty.txt") NEO4J_DIR = MODULE.join("neo4j") EMPTY = [] - summaries = [] +skip = { + "ado", # trash + "epio", # trash + "chebi", # too big + "pr", # too big + "ncbitaxon", # too big + "ncit", # too big + "ncbigene", # too big + # duplicates of EDAM + "edam.data", + "edam.format", + "edam.operation", + "edam.topic", + "gwascentral.phenotype", # added on 2024-04-24, service down + "gwascentral.study", # added on 2024-04-24, service down +} +#: A set of prefixes whose obo files need to be parsed without ROBOT checks +loose = { + "caloha", + "foodon", + "cellosaurus", +} + @click.command() -def main(): +@click.option("--include-wikidata", is_flag=True) +def main(include_wikidata: bool): """Construct the full SeMRA database.""" - skip = { - "ado", # trash - "epio", # trash - "chebi", # too big - "pr", # too big - "ncbitaxon", # too big - "ncit", # too big - "ncbigene", # too big - # duplicates of EDAM - "edam.data", - "edam.format", - "edam.operation", - "edam.topic", - } - #: A set of prefixes whose obo files need to be parsed without ROBOT checks - loose = { - "caloha", - "foodon", - "cellosaurus", - } - ontology_resources = [] pyobo_resources = [] for resource in bioregistry.resources(): @@ -79,27 +96,53 @@ def main(): continue _write_source(resource_mappings, resource.prefix) mappings.extend(resource_mappings) - summaries.append((resource.prefix, len(resource_mappings), time.time() - start)) + summaries.append((resource.prefix, len(resource_mappings), time.time() - start, "pyobo")) _write_summary() it = tqdm(list(SOURCE_RESOLVER), unit="source", desc="Custom sources") for func in it: start = time.time() resource_name = func.__name__.removeprefix("get_").removesuffix("_mappings") + if resource_name == "wikidata": + # this source needs extra information, so it is handled separately below + continue it.set_postfix(source=resource_name) with logging_redirect_tqdm(): resource_mappings = func()
_write_source(resource_mappings, resource_name) mappings.extend(resource_mappings) - summaries.append((resource_name, len(resource_mappings), time.time() - start)) + summaries.append((resource_name, len(resource_mappings), time.time() - start, "custom")) _write_summary() + skip_wikidata_prefixes = { + "pubmed", # too big! need paging? + "doi", # too big! need paging? + "inchi", # too many funny characters + "smiles", # too many funny characters + } + if include_wikidata: + it = tqdm(bioregistry.get_registry_map("wikidata"), unit="property", desc="Wikidata") + for prefix in it: + it.set_postfix(prefix=prefix) + if prefix in skip_wikidata_prefixes: + continue + start = time.time() + resource_name = f"wikidata_{prefix}" + try: + resource_mappings = get_wikidata_mappings_by_prefix(prefix) + except requests.exceptions.JSONDecodeError as e: + tqdm.write(f"[{resource_name}] failed to get mappings from wikidata: {e}") + continue + _write_source(resource_mappings, resource_name) + mappings.extend(resource_mappings) + summaries.append((resource_name, len(resource_mappings), time.time() - start, "wikidata")) + _write_summary() + it = tqdm(ontology_resources, unit="ontology", desc="Ontology sources") for resource in it: it.set_postfix(prefix=resource.prefix) - path = SOURCES.join(name=f"{resource.prefix}.pkl") + path = SOURCES.join(name=f"{resource.prefix}.pkl.gz") if path.is_file(): - resource_mappings = pickle.loads(path.read_bytes()) + resource_mappings = from_pickle(path) else: start = time.time() try: @@ -112,28 +155,59 @@ def main(): # this outputs on each iteration to get faster insight write_warned(WARNINGS_PATH) write_getter_warnings(ERRORS_PATH) - summaries.append((resource.prefix, len(resource_mappings), time.time() - start)) + summaries.append((resource.prefix, len(resource_mappings), time.time() - start, "bioontologies")) _write_summary() mappings.extend(resource_mappings) - click.echo(f"Writing SSSOM to {DATABASE_PATH}") - write_sssom(mappings, DATABASE_PATH) - click.echo(f"Writing Neo4j folder to {DATABASE_PATH}") + click.echo(f"Writing SSSOM to {SSSOM_PATH}") + write_sssom(mappings, SSSOM_PATH) + click.echo(f"Writing Pickle to {PICKLE_PATH}") + write_pickle(mappings, PICKLE_PATH) + click.echo(f"Writing Neo4j folder to {NEO4J_DIR}") write_neo4j(mappings, NEO4J_DIR) - -def _write_source(mappings, key): - write_pickle(mappings, SOURCES.join(name=f"{key}.pkl")) + # Define the metadata that will be used on initial upload + zenodo_metadata = Metadata( + title="SeMRA Mapping Database", + upload_type="dataset", + description=f"A compendium of mappings extracted from {len(summaries)} databases and ontologies. " + f"Note that primary mappings are marked with the license of their source (when available). 
" + f"Inferred mappings are distributed under the CC0 license.", + creators=[ + Creator(name=CHARLIE_NAME, orcid=CHARLIE_ORCID.identifier), + ], + ) + res = ensure_zenodo( + key="semra-database-test-1", + data=zenodo_metadata, + paths=[ + SSSOM_PATH, + WARNINGS_PATH, + ERRORS_PATH, + SUMMARY_PATH, + *NEO4J_DIR.iterdir(), + ], + sandbox=True, + ) + click.echo(res.json()["links"]["html"]) + + +def _write_source(mappings: t.List[Mapping], key: str) -> None: if mappings: - write_sssom(mappings, SOURCES.join(name=f"{key}.sssom.tsv")) + write_pickle(mappings, SOURCES.join(name=f"{key}.pkl.gz")) + write_sssom(mappings, SOURCES.join(name=f"{key}.sssom.tsv"), add_labels=True) else: EMPTY.append(key) EMPTY_PATH.write_text("\n".join(EMPTY)) -def _write_summary(): - SUMMARY_PATH.write_text("\n".join(f"{p}\t{n:,}\t{round(delta, 3)}" for p, n, delta in summaries)) +def _write_summary() -> None: + with SUMMARY_PATH.open("w") as file: + writer = csv.writer(file, delimiter="\t") + writer.writerow(("prefix", "mappings", "seconds", "source_type")) + for prefix, n_mappings, time_delta, source_type in summaries: + writer.writerow((prefix, n_mappings, round(time_delta, 2), source_type)) if __name__ == "__main__": diff --git a/src/semra/io.py b/src/semra/io.py index a00cc71..3b5209b 100644 --- a/src/semra/io.py +++ b/src/semra/io.py @@ -2,6 +2,7 @@ from __future__ import annotations +import csv import gzip import logging import pickle @@ -21,6 +22,7 @@ import requests from bioregistry import Collection from tqdm.autonotebook import tqdm +from tqdm.contrib.logging import logging_redirect_tqdm from semra.rules import DB_XREF, UNSPECIFIED_MAPPING from semra.struct import Evidence, Mapping, MappingSet, ReasonedEvidence, Reference, SimpleEvidence @@ -50,6 +52,11 @@ #: node to the mapping node(s) from which it was derived DERIVED_PREDICATE = "derivedFromMapping" +HAS_AUTHOR_PREDICATE = "hasAuthor" + +#: The default confidence for ontology-based mappings +DEFAULT_ONTOLOGY_CONFIDENCE = 0.9 + def _safe_get_version(prefix: str) -> str | None: """Get a version from Bioversions, or return None if not possible.""" @@ -99,7 +106,7 @@ def _from_pyobo_pair( :param source_prefix: The prefix of the ontology :param target_prefix: The prefix of the target :param predicate: The predicate of the mappings. Defaults to :data:`DB_XREF`. - :param confidence: The confidence level for the mappings. Defaults to 0.9 + :param confidence: The confidence level for the mappings. Defaults to :data:`DEFAULT_ONTOLOGY_CONFIDENCE`. :param standardize: Should the local unique identifiers in the first and third columns be standardized using :func:`bioregistry.standardize_identifier`? Defaults to false. @@ -151,7 +158,7 @@ def from_cache_df( :param source_prefix: The prefix of the ontology :param prefixes: A set of prefixes to subset the second column of cross-reference targets :param predicate: The predicate of the mappings. Defaults to :data:`DB_XREF`. - :param confidence: The confidence level for the mappings. Defaults to 0.9 + :param confidence: The confidence level for the mappings. Defaults to :data:`DEFAULT_ONTOLOGY_CONFIDENCE` :param standardize: Should the local unique identifiers in the first and third columns be standardized using :func:`bioregistry.standardize_identifier`? Defaults to false. @@ -205,7 +212,7 @@ def _from_pyobo_df( :param source_prefix: The prefix of the ontology :param prefixes: A set of prefixes to subset the second column of cross-reference targets :param predicate: The predicate of the mappings. 
Defaults to :data:`DB_XREF`. - :param confidence: The confidence level for the mappings. Defaults to 0.9 + :param confidence: The confidence level for the mappings. Defaults to :data:`DEFAULT_ONTOLOGY_CONFIDENCE` :param standardize: Should the local unique identifiers in the first and third columns be standardized using :func:`bioregistry.standardize_identifier`? Defaults to false. @@ -226,7 +233,7 @@ def _from_pyobo_df( if justification is None: justification = UNSPECIFIED_MAPPING if confidence is None: - confidence = 0.9 + confidence = DEFAULT_ONTOLOGY_CONFIDENCE if license is None: license = bioregistry.get_license(source_prefix) if isinstance(prefixes, str): @@ -283,8 +290,10 @@ def from_pyobo( return _from_pyobo_prefix(prefix, standardize=standardize, **kwargs) -def from_bioontologies(prefix: str, confidence=None, **kwargs) -> list[Mapping]: +def from_bioontologies(prefix: str, confidence: float | None = None, **kwargs) -> list[Mapping]: """Get mappings from a given ontology via :mod:`bioontologies`.""" + if confidence is None: + confidence = DEFAULT_ONTOLOGY_CONFIDENCE o = bioontologies.get_obograph_by_prefix(prefix, **kwargs) g = o.guess(prefix) # note that we don't extract stuff from edges so just node standardization is good enough @@ -350,6 +359,8 @@ def _parse_sssom_row( author = None if "mapping_set_name" in row and pd.notna(row["mapping_set_name"]): n = row["mapping_set_name"] + elif "mapping_set" in row and pd.notna(row["mapping_set"]): + n = row["mapping_set"] elif mapping_set_name is None: raise KeyError("need a mapping set name") else: @@ -417,8 +428,9 @@ def get_sssom_df(mappings: list[Mapping], *, add_labels: bool = False) -> pd.Dat ] df = pd.DataFrame(rows, columns=columns) if add_labels: - for label_column, id_column in [("subject_label", "subject_id"), ("object_label", "object_id")]: - df[label_column] = df[id_column].map(_get_name_by_curie) # type:ignore + with logging_redirect_tqdm(): + for label_column, id_column in [("subject_label", "subject_id"), ("object_label", "object_id")]: + df[label_column] = df[id_column].map(_get_name_by_curie) # type:ignore df = df[ [ "subject_id", @@ -469,8 +481,11 @@ def get_orcid_name(orcid: str) -> Optional[str]: if orcid.startswith("orcid:"): orcid = orcid[len("orcid:") :] - res = requests.get(f"https://orcid.org/{orcid}", headers={"Accept": "application/json"}, timeout=5).json() - name = res["person"]["name"] + try: + res = requests.get(f"https://orcid.org/{orcid}", headers={"Accept": "application/json"}, timeout=5).json() + except IOError: # e.g., ReadTimeout + return None + name = res.get("person", {}).get("name") if name is None: return None if credit_name := name.get("credit-name"): @@ -543,7 +558,7 @@ def _neo4j_bool(b: bool, /) -> Literal["true", "false"]: # noqa:FBT001 return "true" if b else "false" # type:ignore -def _safe_confidence(x) -> str: +def _safe_confidence(x: Evidence) -> str: confidence = x.get_confidence() if confidence is None: return "" @@ -559,6 +574,7 @@ def write_neo4j( add_labels: bool = False, startup_script_name: str = "startup.sh", run_script_name: str = "run_on_docker.sh", + sort: bool = False, ) -> None: """Write all files needed to construct a Neo4j graph database from a set of mappings. @@ -582,6 +598,7 @@ def write_neo4j( :param startup_script_name: The name of the startup script that the Dockerfile calls :param run_script_name: The name of the run script that you as the user should call to wrap building and running the Docker image + :param sort: Should the output nodes files be sorted? 
:raises NotADirectoryError: If the directory given does not already exist. It's suggested to use :mod:`pystow` to create deterministic directories. @@ -612,16 +629,15 @@ def write_neo4j( run_path = directory.joinpath(run_script_name) docker_path = directory.joinpath("Dockerfile") - concept_nodes_path = directory.joinpath("concept_nodes.tsv") + concept_nodes_path = directory.joinpath("concept_nodes.tsv.gz") concepts: set[Reference] = set() - concept_nodes_header = ["curie:ID", ":LABEL", "prefix", "name", "priority:boolean"] + concept_nodes_header = ["curie:ID", "prefix", "name", "priority:boolean"] if equivalence_classes is None: equivalence_classes = {} - mapping_nodes_path = directory.joinpath("mapping_nodes.tsv") + mapping_nodes_path = directory.joinpath("mapping_nodes.tsv.gz") mapping_nodes_header = [ "curie:ID", - ":LABEL", "prefix", "predicate", "confidence", @@ -630,22 +646,20 @@ def write_neo4j( "tertiary:boolean", ] - evidence_nodes_path = directory.joinpath("evidence_nodes.tsv") + evidence_nodes_path = directory.joinpath("evidence_nodes.tsv.gz") evidences = {} evidence_nodes_header = [ "curie:ID", - ":LABEL", "prefix", "type", "mapping_justification", "confidence:float", ] - mapping_set_nodes_path = directory.joinpath("mapping_set_nodes.tsv") + mapping_set_nodes_path = directory.joinpath("mapping_set_nodes.tsv.gz") mapping_sets = {} mapping_set_nodes_header = [ "curie:ID", - ":LABEL", "prefix", "name", "license", @@ -653,8 +667,7 @@ def write_neo4j( "confidence:float", ] - edges_path = directory.joinpath("edges.tsv") - edges: list[tuple[str, str, str, str | float, str, str, str, str]] = [] + mapping_edges_path = directory.joinpath("mapping_edges.tsv.gz") edges_header = [ ":START_ID", ":TYPE", @@ -665,65 +678,80 @@ def write_neo4j( "tertiary:boolean", "mapping_sets:string[]", ] - - for mapping in tqdm(mappings, unit="mapping", unit_scale=True, desc="Preparing Neo4j"): - concepts.add(mapping.s) - concepts.add(mapping.o) - - edges.append( - ( - mapping.s.curie, - mapping.p.curie, - mapping.o.curie, - _safe_confidence(mapping), - _neo4j_bool(mapping.has_primary), - _neo4j_bool(mapping.has_secondary), - _neo4j_bool(mapping.has_tertiary), - "|".join(sorted({evidence.mapping_set.name for evidence in mapping.evidence if evidence.mapping_set})), + edges_path = directory.joinpath("edges.tsv.gz") + edges_supp_header = [ + ":START_ID", + ":TYPE", + ":END_ID", + ] + with gzip.open(mapping_edges_path, "wt") as file1, gzip.open(edges_path, "wt") as file2: + mapping_writer = csv.writer(file1, delimiter="\t") + mapping_writer.writerow(edges_header) + + edge_writer = csv.writer(file2, delimiter="\t") + edge_writer.writerow(edges_supp_header) + + for mapping in tqdm(mappings, unit="mapping", unit_scale=True, desc="Preparing Neo4j"): + concepts.add(mapping.s) + concepts.add(mapping.o) + + mapping_writer.writerow( + ( + mapping.s.curie, + mapping.p.curie, + mapping.o.curie, + _safe_confidence(mapping), + _neo4j_bool(mapping.has_primary), + _neo4j_bool(mapping.has_secondary), + _neo4j_bool(mapping.has_tertiary), + "|".join( + sorted({evidence.mapping_set.name for evidence in mapping.evidence if evidence.mapping_set}) + ), + ) ) - ) - edges.append((mapping.curie, ANNOTATED_SOURCE.curie, mapping.s.curie, "", "", "", "", "")) - edges.append((mapping.curie, ANNOTATED_TARGET.curie, mapping.o.curie, "", "", "", "", "")) - for evidence in mapping.evidence: - edges.append((mapping.curie, HAS_EVIDENCE_PREDICATE, evidence.curie, "", "", "", "", "")) - evidences[evidence.key()] = evidence - if 
evidence.mapping_set: - mapping_sets[evidence.mapping_set.name] = evidence.mapping_set - edges.append((evidence.curie, FROM_SET_PREDICATE, evidence.mapping_set.curie, "", "", "", "", "")) - elif isinstance(evidence, ReasonedEvidence): - for mmm in evidence.mappings: - edges.append((evidence.curie, DERIVED_PREDICATE, mmm.curie, "", "", "", "", "")) - # elif isinstance(evidence, SimpleEvidence): - # pass - # else: - # raise TypeError - - # Add authorship information for the evidence, if available - if evidence.author: - concepts.add(evidence.author) - edges.append((evidence.curie, "hasAuthor", evidence.author.curie, "", "", "", "", "")) - - _write_tsv( + edge_writer.writerow((mapping.curie, ANNOTATED_SOURCE.curie, mapping.s.curie)) + edge_writer.writerow((mapping.curie, ANNOTATED_TARGET.curie, mapping.o.curie)) + for evidence in mapping.evidence: + edge_writer.writerow((mapping.curie, HAS_EVIDENCE_PREDICATE, evidence.curie)) + evidences[evidence.key()] = evidence + if evidence.mapping_set: + mapping_sets[evidence.mapping_set.name] = evidence.mapping_set + edge_writer.writerow((evidence.curie, FROM_SET_PREDICATE, evidence.mapping_set.curie)) + elif isinstance(evidence, ReasonedEvidence): + for mmm in evidence.mappings: + edge_writer.writerow((evidence.curie, DERIVED_PREDICATE, mmm.curie)) + # elif isinstance(evidence, SimpleEvidence): + # pass + # else: + # raise TypeError + + # Add authorship information for the evidence, if available + if evidence.author: + concepts.add(evidence.author) + edge_writer.writerow((evidence.curie, HAS_AUTHOR_PREDICATE, evidence.author.curie)) + + sorted_concepts = sorted(concepts, key=lambda n: n.curie) if sort else list(concepts) + _write_tsv_gz( concept_nodes_path, concept_nodes_header, ( ( concept.curie, - "concept", concept.prefix, _get_name_by_curie(concept.curie) or "" if add_labels else "", _neo4j_bool(equivalence_classes.get(concept, False)), ) - for concept in sorted(concepts, key=lambda n: n.curie) + for concept in tqdm(sorted_concepts, desc="writing concept nodes", unit_scale=True, unit="concept") ), ) - _write_tsv( + + sorted_mappings = sorted(mappings, key=lambda n: n.curie) if sort else mappings + _write_tsv_gz( mapping_nodes_path, mapping_nodes_header, ( ( mapping.curie, - "mapping", "semra.mapping", mapping.p.curie, _safe_confidence(mapping), @@ -731,41 +759,44 @@ def write_neo4j( _neo4j_bool(mapping.has_secondary), _neo4j_bool(mapping.has_tertiary), ) - for mapping in sorted(mappings, key=lambda n: n.curie) + for mapping in tqdm(sorted_mappings, desc="writing mapping nodes", unit_scale=True, unit="mapping") ), ) - _write_tsv( + + sorted_mapping_sets = sorted(mapping_sets.values(), key=lambda n: n.curie) if sort else list(mapping_sets.values()) + _write_tsv_gz( mapping_set_nodes_path, mapping_set_nodes_header, ( ( mapping_set.curie, - "mappingset", "semra.mappingset", mapping_set.name, mapping_set.license or "", mapping_set.version or "", _safe_confidence(mapping_set), ) - for mapping_set in sorted(mapping_sets.values(), key=lambda n: n.curie) + for mapping_set in sorted_mapping_sets ), ) - _write_tsv( + + sorted_evidences = sorted(evidences.values(), key=lambda row: row.curie) if sort else list(evidences.values()) + _write_tsv_gz( evidence_nodes_path, evidence_nodes_header, ( ( evidence.curie, - "evidence", "semra.evidence", evidence.evidence_type, evidence.justification.curie, _safe_confidence(evidence), ) - for evidence in sorted(evidences.values(), key=lambda row: row.curie) + for evidence in tqdm( + sorted_evidences, desc="Writing evidence 
nodes", leave=False, unit_scale=True, unit="evidence" + ) ), ) - _write_tsv(edges_path, edges_header, sorted(set(edges), key=_edge_key)) startup_commands = dedent( """\ @@ -773,9 +804,7 @@ def write_neo4j( neo4j start # Get the port - until [ \ - "$(curl -s -w '%{http_code}' -o /dev/null "http://localhost:7474")" \ - -eq 200 ] + until [ "$(curl -s -w '%{http_code}' -o /dev/null "http://localhost:7474")" -eq 200 ] do sleep 5 done @@ -805,26 +834,33 @@ def write_neo4j( apt-get install -y git zip unzip bzip2 gcc pkg-config python3.11 && \\ curl -sS https://bootstrap.pypa.io/get-pip.py | python3.11 - ARG twiddle1=dee RUN python3.11 -m pip install "semra[web] @ git+https://github.com/biopragmatics/semra.git" # Add graph content - ARG twiddle2=dee - COPY concept_nodes.tsv /sw/concept_nodes.tsv - COPY mapping_nodes.tsv /sw/mapping_nodes.tsv - COPY evidence_nodes.tsv /sw/evidence_nodes.tsv - COPY mapping_set_nodes.tsv /sw/mapping_set_nodes.tsv - COPY edges.tsv /sw/edges.tsv + COPY concept_nodes.tsv.gz /sw/concept_nodes.tsv.gz + COPY mapping_nodes.tsv.gz /sw/mapping_nodes.tsv.gz + COPY evidence_nodes.tsv.gz /sw/evidence_nodes.tsv.gz + COPY mapping_set_nodes.tsv.gz /sw/mapping_set_nodes.tsv.gz + COPY mapping_edges.tsv.gz /sw/mapping_edges.tsv.gz + COPY edges.tsv.gz /sw/edges.tsv.gz # Ingest graph content into neo4j RUN sed -i 's/#dbms.default_listen_address/dbms.default_listen_address/' /etc/neo4j/neo4j.conf && \\ sed -i 's/#dbms.security.auth_enabled/dbms.security.auth_enabled/' /etc/neo4j/neo4j.conf && \\ neo4j-admin import --delimiter='TAB' --skip-duplicate-nodes=true --skip-bad-relationships=true \\ + --relationships /sw/mapping_edges.tsv \\ --relationships /sw/edges.tsv \\ - --nodes /sw/concept_nodes.tsv \\ - --nodes /sw/mapping_nodes.tsv \\ - --nodes /sw/mapping_set_nodes.tsv \\ - --nodes /sw/evidence_nodes.tsv + --nodes=concept=/sw/concept_nodes.tsv \\ + --nodes=mapping=/sw/mapping_nodes.tsv \\ + --nodes=mappingset=/sw/mapping_set_nodes.tsv \\ + --nodes=evidence=/sw/evidence_nodes.tsv + + RUN rm /sw/concept_nodes.tsv.gz + RUN rm /sw/mapping_nodes.tsv.gz + RUN rm /sw/evidence_nodes.tsv.gz + RUN rm /sw/mapping_set_nodes.tsv.gz + RUN rm /sw/edges.tsv.gz + RUN rm /sw/mapping_edges.tsv.gz COPY startup.sh startup.sh ENTRYPOINT ["/bin/bash", "/sw/startup.sh"] @@ -860,9 +896,9 @@ def write_neo4j( # command_path.write_text(shell_command) -def _write_tsv(path, header, rows) -> None: +def _write_tsv_gz(path, header, rows) -> None: click.echo(f"writing to {path}") - with path.open("w") as file: - print(*header, sep="\t", file=file) # noqa:T201 - for row in rows: - print(*row, sep="\t", file=file) # noqa:T201 + with gzip.open(path, "wt") as file: + writer = csv.writer(file, delimiter="\t") + writer.writerow(header) + writer.writerows(rows) diff --git a/src/semra/landscape/anatomy.py b/src/semra/landscape/anatomy.py index 47f889f..5cdbe53 100644 --- a/src/semra/landscape/anatomy.py +++ b/src/semra/landscape/anatomy.py @@ -5,6 +5,7 @@ from pyobo.sources.mesh import get_mesh_category_curies import semra +from semra.pipeline import CREATOR_CHARLIE __all__ = [ "MODULE", @@ -32,7 +33,9 @@ } CONFIGURATION = semra.Configuration( - name="Anatomy mappings", + name="SeMRA Anatomy Mappings Database", + description="Supports the analysis of the landscape of anatomy nomenclature resources.", + creators=[CREATOR_CHARLIE], inputs=[ semra.Input(source="biomappings"), semra.Input(source="gilda"), @@ -64,6 +67,8 @@ processed_neo4j_name="semra-anatomy", priority_pickle_path=MODULE.join(name="priority.pkl"), 
priority_sssom_path=MODULE.join(name="priority.sssom.tsv"), + configuration_path=MODULE.join(name="configuration.json"), + zenodo_record=11091803, ) @@ -71,6 +76,7 @@ def main(): """Build the mapping database for anatomical terms.""" CONFIGURATION.get_mappings(refresh_raw=True, refresh_processed=True) + CONFIGURATION.upload_zenodo() if __name__ == "__main__": diff --git a/src/semra/landscape/cells.py b/src/semra/landscape/cells.py index d0fef7f..8c48738 100644 --- a/src/semra/landscape/cells.py +++ b/src/semra/landscape/cells.py @@ -20,7 +20,7 @@ from semra.api import project, str_source_target_counts from semra.io import write_sssom -from semra.pipeline import Configuration, Input, Mutation, get_mappings_from_config +from semra.pipeline import CREATOR_CHARLIE, Configuration, Input, Mutation, get_mappings_from_config __all__ = [ "MODULE", @@ -50,9 +50,10 @@ } CONFIGURATION = Configuration( - name="Cell and Cell Line Mappings", + name="SeMRA Cell and Cell Line Mappings Database", description="Originally a reproduction of the EFO/Cellosaurus/DepMap/CCLE scenario posed in the Biomappings paper, " "this configuration imports several different cell and cell line resources and identifies mappings between them.", + creators=[CREATOR_CHARLIE], inputs=[ Input(source="biomappings"), Input(source="gilda"), @@ -96,6 +97,8 @@ processed_neo4j_name="semra-cell", priority_pickle_path=MODULE.join(name="priority.pkl"), priority_sssom_path=MODULE.join(name="priority.sssom.tsv"), + configuration_path=MODULE.join(name="configuration.json"), + zenodo_record=11091581, ) @@ -103,6 +106,7 @@ def main(): """Build the mapping database for cell and cell line terms.""" mappings = get_mappings_from_config(CONFIGURATION, refresh_raw=True, refresh_processed=True) + CONFIGURATION.upload_zenodo() click.echo(f"Processing returned {len(mappings):,} mappings") click.echo(str_source_target_counts(mappings)) diff --git a/src/semra/landscape/complexes.py b/src/semra/landscape/complexes.py index 2724b52..0977ddd 100644 --- a/src/semra/landscape/complexes.py +++ b/src/semra/landscape/complexes.py @@ -3,7 +3,8 @@ import click import pystow -from semra.pipeline import Configuration, Input, Mutation +from semra.pipeline import Configuration, Creator, Input, Mutation +from semra.rules import CHARLIE_NAME, CHARLIE_ORCID __all__ = [ "MODULE", @@ -27,8 +28,9 @@ } CONFIGURATION = Configuration( - name="Protein Complex Landscape Analysis", + name="SeMRA Protein Complex Landscape Analysis", description="Analyze the landscape of protein complex nomenclature resources, species-agnostic.", + creators=[Creator(orcid=CHARLIE_ORCID.identifier, name=CHARLIE_NAME)], inputs=[ Input(source="gilda"), Input(source="biomappings"), @@ -51,20 +53,25 @@ ], raw_pickle_path=MODULE.join(name="raw.pkl"), raw_sssom_path=MODULE.join(name="raw.sssom.tsv"), - # raw_neo4j_path=MODULE.join("neo4j_raw"), + raw_neo4j_path=MODULE.join("neo4j_raw"), + raw_neo4j_name="semra-complex", processed_pickle_path=MODULE.join(name="processed.pkl"), processed_sssom_path=MODULE.join(name="processed.sssom.tsv"), processed_neo4j_path=MODULE.join("neo4j"), processed_neo4j_name="semra-complex", priority_pickle_path=MODULE.join(name="priority.pkl"), priority_sssom_path=MODULE.join(name="priority.sssom.tsv"), + configuration_path=MODULE.join(name="configuration.json"), + zenodo_record=11091422, ) @click.command() def main(): """Build the mapping database for protein complex terms.""" - CONFIGURATION.get_mappings(refresh_raw=True, refresh_processed=True) + 
CONFIGURATION.get_mappings(refresh_raw=False, refresh_processed=False) + res = CONFIGURATION.upload_zenodo() + click.echo(res.json()["links"]["html"]) if __name__ == "__main__": diff --git a/src/semra/landscape/diseases.py b/src/semra/landscape/diseases.py index 33ae4d5..642cd1d 100644 --- a/src/semra/landscape/diseases.py +++ b/src/semra/landscape/diseases.py @@ -5,7 +5,7 @@ import pystow from pyobo.sources.mesh import get_mesh_category_curies -from semra.pipeline import Configuration, Input, Mutation +from semra.pipeline import CREATOR_CHARLIE, Configuration, Input, Mutation __all__ = [ "MODULE", @@ -45,8 +45,9 @@ } CONFIGURATION = Configuration( - name="Disease Landscape Analysis", - description="", + name="SeMRA Disease Mappings Database", + description="Supports the analysis of the landscape of disease nomenclature resources.", + creators=[CREATOR_CHARLIE], inputs=[ Input(source="biomappings"), Input(source="gilda"), @@ -84,6 +85,8 @@ processed_neo4j_name="semra-disease", priority_pickle_path=MODULE.join(name="priority.pkl"), priority_sssom_path=MODULE.join(name="priority.sssom.tsv"), + configuration_path=MODULE.join(name="configuration.json"), + zenodo_record=11091886, ) @@ -92,6 +95,7 @@ def main(): """Build the mapping database for disease terms.""" # Takes about 2 hours CONFIGURATION.get_mappings(refresh_raw=True, refresh_processed=True) + CONFIGURATION.upload_zenodo() if __name__ == "__main__": diff --git a/src/semra/landscape/genes.py b/src/semra/landscape/genes.py index 6e75cc8..0b4e3f0 100644 --- a/src/semra/landscape/genes.py +++ b/src/semra/landscape/genes.py @@ -3,7 +3,7 @@ import click import pystow -from semra.pipeline import Configuration, Input, Mutation +from semra.pipeline import CREATOR_CHARLIE, Configuration, Input, Mutation __all__ = [ "MODULE", @@ -30,8 +30,9 @@ ] CONFIGURATION = Configuration( - name="Gene Landscape Analysis", + name="SeMRA Gene Mapping Database", description="Analyze the landscape of gene nomenclature resources, species-agnostic.", + creators=[CREATOR_CHARLIE], inputs=[ Input(prefix="hgnc", source="pyobo", confidence=0.99), Input(prefix="mgi", source="pyobo", confidence=0.99), @@ -58,15 +59,17 @@ Mutation(source="umls", confidence=0.8), Mutation(source="ncit", confidence=0.8), ], - raw_pickle_path=MODULE.join(name="raw.pkl"), - raw_sssom_path=MODULE.join(name="raw.sssom.tsv"), + raw_pickle_path=MODULE.join(name="raw.pkl.gz"), + raw_sssom_path=MODULE.join(name="raw.sssom.tsv.gz"), # raw_neo4j_path=MODULE.join("neo4j_raw"), - processed_pickle_path=MODULE.join(name="processed.pkl"), - processed_sssom_path=MODULE.join(name="processed.sssom.tsv"), + processed_pickle_path=MODULE.join(name="processed.pkl.gz"), + processed_sssom_path=MODULE.join(name="processed.sssom.tsv.gz"), processed_neo4j_path=MODULE.join("neo4j"), processed_neo4j_name="semra-gene", - priority_pickle_path=MODULE.join(name="priority.pkl"), - priority_sssom_path=MODULE.join(name="priority.sssom.tsv"), + priority_pickle_path=MODULE.join(name="priority.pkl.gz"), + priority_sssom_path=MODULE.join(name="priority.sssom.tsv.gz"), + configuration_path=MODULE.join(name="configuration.json"), + zenodo_record=11092013, ) @@ -74,6 +77,7 @@ def main(): """Build the mapping database for gene terms.""" CONFIGURATION.get_mappings(refresh_raw=True, refresh_processed=True) + CONFIGURATION.upload_zenodo() if __name__ == "__main__": diff --git a/src/semra/landscape/utils.py b/src/semra/landscape/utils.py index 4127d2f..9889a39 100644 --- a/src/semra/landscape/utils.py +++ b/src/semra/landscape/utils.py 
@@ -68,6 +68,9 @@ def notebook( if output_directory is None: output_directory = configuration.raw_pickle_path.parent output_directory = Path(output_directory).expanduser().resolve() + configuration_path = output_directory.joinpath("configuration.json") + configuration_path.write_text(configuration.model_dump_json(indent=2, exclude_none=True, exclude_unset=True)) + terms = get_terms(configuration.priority, configuration.subsets) hydrated_subsets = configuration.get_hydrated_subsets() diff --git a/src/semra/pipeline.py b/src/semra/pipeline.py index 7c9514d..3990e9c 100644 --- a/src/semra/pipeline.py +++ b/src/semra/pipeline.py @@ -8,6 +8,7 @@ from pathlib import Path from typing import Any, Literal, Optional +import requests from pydantic import BaseModel, Field, root_validator from tqdm.auto import tqdm @@ -35,7 +36,7 @@ write_pickle, write_sssom, ) -from semra.rules import DB_XREF, EXACT_MATCH, IMPRECISE +from semra.rules import CHARLIE_NAME, CHARLIE_ORCID, DB_XREF, EXACT_MATCH, IMPRECISE from semra.sources import SOURCE_RESOLVER from semra.sources.biopragmatics import ( from_biomappings_negative, @@ -46,9 +47,13 @@ from semra.sources.wikidata import get_wikidata_mappings_by_prefix from semra.struct import Mapping, Reference +if t.TYPE_CHECKING: + import zenodo_client + __all__ = [ # Configuration model "Configuration", + "Creator", "SubsetConfiguration", "Input", "Mutation", @@ -82,6 +87,16 @@ class Mutation(BaseModel): SubsetConfiguration = t.Mapping[str, t.Collection[str]] +class Creator(BaseModel): + """A model describing a creator.""" + + name: str + orcid: str + + +CREATOR_CHARLIE = Creator(name=CHARLIE_NAME, orcid=CHARLIE_ORCID.identifier) + + class Configuration(BaseModel): """Represents the steps taken during mapping assembly.""" @@ -89,6 +104,7 @@ class Configuration(BaseModel): description: Optional[str] = Field( None, description="An explanation of the purpose of the mapping set configuration" ) + creators: t.List[Creator] = Field(default_factory=list, description="A list of the ORCID identifiers for creators") inputs: t.List[Input] = Field(..., description="A list of sources of mappings") negative_inputs: t.List[Input] = Field(default=[Input(source="biomappings", prefix="negative")]) priority: t.List[str] = Field( @@ -97,9 +113,9 @@ class Configuration(BaseModel): mutations: t.List[Mutation] = Field(default_factory=list) subsets: t.Optional[t.Mapping[str, t.List[str]]] = Field( None, - description="A field to put restrictions on the subhierarchies from each resource. For example, if " + description="A field to put restrictions on the sub-hierarchies from each resource. For example, if " "you want to assemble cell mappings from MeSH, you don't need all possible mesh mappings, but only " - "ones that have to do with terms in the cell hierchy under the mesh:D002477 term. Therefore, this " + "ones that have to do with terms in the cell hierarchy under the mesh:D002477 term. 
Therefore, this " "dictionary allows for specifying such restrictions", examples=[ {"mesh": ["mesh:D002477"]}, @@ -138,6 +154,10 @@ class Configuration(BaseModel): add_labels: bool = Field(default=False, description="Should PyOBO be used to look up labels for SSSOM output?") + configuration_path: Optional[Path] = Field(None, description="The path where this configuration should be written.") + + zenodo_record: Optional[int] = Field(None, description="The Zenodo record identifier") + @root_validator(skip_on_failure=True) def infer_priority(cls, values): # noqa:N805 """Infer the priority from the input list of not given.""" @@ -146,6 +166,12 @@ def infer_priority(cls, values): # noqa:N805 values["priority"] = [inp.prefix for inp in values["inputs"].inputs if inp.prefix is not None] return values + def zenodo_url(self) -> t.Optional[str]: + """Get the zenodo URL, if available.""" + if self.zenodo_record is None: + return None + return f"https://bioregistry.io/zenodo.record:{self.zenodo_record}" + @classmethod def from_prefixes( cls, *, name: str, prefixes: t.Iterable[str], include_biomappings: bool = True, include_gilda: bool = True @@ -189,6 +215,80 @@ def get_hydrated_subsets(self) -> t.Mapping[str, t.Collection[str]]: return {} return hydrate_subsets(self.subsets) + def _get_zenodo_metadata(self) -> "zenodo_client.Metadata": + if not self.creators: + raise ValueError("Creating a Zenodo record requires annotating the creators field") + import zenodo_client + + if self.name is None: + raise ValueError("name must be given to upload to zenodo") + if self.description is None: + raise ValueError("description must be given to upload to zenodo") + if not self.creators: + raise ValueError("at least one creator must be given to upload to zenodo") + + return zenodo_client.Metadata( + upload_type="dataset", + title=self.name, + description=self.description, + creators=[zenodo_client.Creator(name=creator.name, orcid=creator.orcid) for creator in self.creators], + ) + + def _get_zenodo_paths(self, *, processed: bool = True) -> t.List[Path]: + if self.configuration_path is not None and not self.configuration_path.is_file(): + self.configuration_path.write_text(self.model_dump_json(indent=2, exclude_none=True, exclude_unset=True)) + paths = [ + self.configuration_path, + self.raw_sssom_path, + self.raw_pickle_path, + self.processed_sssom_path, + self.processed_pickle_path, + self.priority_sssom_path, + self.processed_pickle_path, + ] + for path in paths: + if path is None: + raise ValueError("Can't upload to Zenodo if not all output paths are configured") + if not path.is_file(): + raise FileNotFoundError(path) + if processed and self.processed_neo4j_path is not None and self.processed_neo4j_path.is_dir(): + paths.extend(self.processed_neo4j_path.iterdir()) + elif self.raw_neo4j_path is not None and self.raw_neo4j_path.is_dir(): + paths.extend(self.raw_neo4j_path.iterdir()) + else: + logger.debug("Not uploading neo4j") + return t.cast(t.List[Path], paths) + + def ensure_zenodo( + self, key: str, *, metadata: t.Optional["zenodo_client.Metadata"] = None, processed: bool = True, **kwargs + ) -> requests.Response: + """Ensure a zenodo record.""" + if self.zenodo_record is not None: + raise ValueError( + f"Refusing to create new Zenodo record since it already exists: " + f"https://bioregistry.io/zenodo.record:{self.zenodo_record}.\n\n" + f"Use `Configuration.upload_zenodo(processed={processed})` instead." 
+ ) + + from zenodo_client import ensure_zenodo + + paths = self._get_zenodo_paths(processed=processed) + res = ensure_zenodo(key=key, data=metadata or self._get_zenodo_metadata(), paths=paths, **kwargs) + return res + + def upload_zenodo(self, processed: bool = True, **kwargs) -> requests.Response: + """Upload a Zenodo record.""" + if not self.zenodo_record: + raise ValueError( + "Can not upload to zenodo if no record is configured.\n\n" + f"Use `Configuration.ensure_zenodo(key=..., processed={processed})` instead." + ) + from zenodo_client import update_zenodo + + paths = self._get_zenodo_paths(processed=processed) + res = update_zenodo(str(self.zenodo_record), paths=paths, **kwargs) + return res + def get_mappings_from_config( configuration: Configuration, @@ -214,6 +314,11 @@ def get_mappings_from_config( "loaded cached raw mappings from %s in %.2f seconds", configuration.raw_pickle_path, time.time() - start ) else: + if configuration.configuration_path is not None: + configuration.configuration_path.write_text( + configuration.model_dump_json(exclude_none=True, exclude_unset=True, indent=2) + ) + raw_mappings = get_raw_mappings(configuration) if configuration.validate_raw: validate_mappings(raw_mappings) @@ -374,7 +479,7 @@ def process( mappings = infer_mutual_dbxref_mutations(mappings, upgrade_prefixes, confidence=0.95) _log_diff(before, mappings, verb="Inferred upgrades", elapsed=time.time() - start) - # remove dbxrefs + # remove database cross-references if remove_imprecise: logger.info("Removing unqualified database xrefs") before = len(mappings) @@ -382,7 +487,7 @@ def process( mappings = [m for m in mappings if m.p not in IMPRECISE] _log_diff(before, mappings, verb="Filtered non-precise", elapsed=time.time() - start) - # 3. Inference based on adding reverse relations then doing multi-chain hopping + # 3. 
Inference based on adding reverse relations then doing multichain hopping logger.info("Inferring reverse mappings") before = len(mappings) start = time.time() diff --git a/src/semra/rules.py b/src/semra/rules.py index 1d44c18..49d94fe 100644 --- a/src/semra/rules.py +++ b/src/semra/rules.py @@ -2,7 +2,7 @@ from __future__ import annotations -from semra.struct import Reference +from curies import Reference EXACT_MATCH = Reference(prefix="skos", identifier="exactMatch") BROAD_MATCH = Reference(prefix="skos", identifier="broadMatch") @@ -44,4 +44,5 @@ KNOWLEDGE_MAPPING = Reference.from_curie("semapv:BackgroundKnowledgeBasedMatching") CHARLIE_ORCID = Reference.from_curie("orcid:0000-0003-4423-4370") +CHARLIE_NAME = "Charles Tapley Hoyt" BEN_ORCID = Reference.from_curie("orcid:0000-0001-9439-5346") diff --git a/src/semra/sources/chembl.py b/src/semra/sources/chembl.py index 13ebd4e..6ed6f86 100644 --- a/src/semra/sources/chembl.py +++ b/src/semra/sources/chembl.py @@ -54,9 +54,9 @@ def get_chembl_protein_mappings(version: Optional[str] = None) -> list[Mapping]: df = chembl_downloader.get_uniprot_mapping_df(version=version) return [ Mapping( - s=Reference(prefix="uniprot", identifier=uniprot), + s=Reference(prefix="chembl.target", identifier=chembl_id), p=EXACT_MATCH, - o=Reference(prefix="chembl.target", identifier=chembl_id), + o=Reference(prefix="uniprot", identifier=uniprot), evidence=[ SimpleEvidence( justification=UNSPECIFIED_MAPPING, diff --git a/src/semra/sources/famplex.py b/src/semra/sources/famplex.py index c4512c5..f9701f7 100644 --- a/src/semra/sources/famplex.py +++ b/src/semra/sources/famplex.py @@ -42,7 +42,7 @@ def get_fplx_mappings() -> list[Mapping]: and not (target_prefix == "NXP" and target_id.startswith("FA:")) # is this a problem? ) ] - validate_mappings(rv) + validate_mappings(rv, progress=False) return rv diff --git a/src/semra/sources/pubchem.py b/src/semra/sources/pubchem.py index 551644a..e93beec 100644 --- a/src/semra/sources/pubchem.py +++ b/src/semra/sources/pubchem.py @@ -3,11 +3,11 @@ from __future__ import annotations import logging -from typing import Optional +from typing import Optional, Set import bioversions -import pandas as pd import pyobo +import requests from curies import Reference from semra.rules import EXACT_MATCH, UNSPECIFIED_MAPPING @@ -25,29 +25,26 @@ def get_pubchem_mesh_mappings(version: Optional[str] = None) -> list[Mapping]: """Get a mapping from PubChem compound identifiers to their equivalent MeSH terms.""" if version is None: version = bioversions.get_version("pubchem") - url = f"ftp://ftp.ncbi.nlm.nih.gov/pubchem/Compound/Monthly/{version}/Extras/CID-MeSH" - df = pd.read_csv( - url, - dtype=str, - header=None, - names=["pubchem", "mesh"], - ) + mesh_name_to_id = pyobo.get_name_id_mapping("mesh") - needs_curation = set() - mesh_ids = [] - for name in df["mesh"]: - mesh_id = mesh_name_to_id.get(name) - if mesh_id is None and name not in needs_curation: - needs_curation.add(name) - logger.debug("[mesh] needs curating: %s", name) - mesh_ids.append(mesh_id) - logger.info("[mesh] %d/%d need updating", len(needs_curation), len(mesh_ids)) - df["mesh"] = mesh_ids - - return [ - Mapping( + needs_curation: Set[str] = set() + + url = f"https://ftp.ncbi.nlm.nih.gov/pubchem/Compound/Monthly/{version}/Extras/CID-MeSH" + res = requests.get(url, stream=True) + + rv = [] + for line in res.iter_lines(): + # on a small number of entries, there are multiple names. 
their impact is negligible + pubchem, mesh_name, *_ = line.decode("utf8").strip().split("\t") + mesh_id = mesh_name_to_id.get(mesh_name) + if mesh_id is None: + if mesh_name not in needs_curation: + needs_curation.add(mesh_name) + logger.debug("[mesh] needs curating: %s", mesh_name) + continue + mapping = Mapping( s=Reference(prefix="pubchem.compound", identifier=pubchem), - o=Reference(prefix="mesh", identifier=mesh), + o=Reference(prefix="mesh", identifier=mesh_id), p=EXACT_MATCH, evidence=[ SimpleEvidence( @@ -57,6 +54,7 @@ def get_pubchem_mesh_mappings(version: Optional[str] = None) -> list[Mapping]: ) ], ) - for pubchem, mesh in df.values - if mesh is not None - ] + rv.append(mapping) + + logger.warning("[pubchem-mesh] %d MeSH names need manual curation", len(needs_curation)) + return rv diff --git a/src/semra/sources/wikidata.py b/src/semra/sources/wikidata.py index 9f18994..b71cd62 100644 --- a/src/semra/sources/wikidata.py +++ b/src/semra/sources/wikidata.py @@ -42,13 +42,19 @@ def _help( if predicate is None: predicate = EXACT_MATCH - mapping_set = MappingSet(name="Wikidata", license="CC0", confidence=0.99) + mapping_set = MappingSet(name="wikidata", license="CC0", confidence=0.99) return [ Mapping( s=Reference(prefix="wikidata", identifier=wikidata_id), p=predicate, - o=Reference(prefix=target_prefix, identifier=xref_id), + o=Reference(prefix=target_prefix, identifier=_clean_xref_id(target_prefix, xref_id)), evidence=[SimpleEvidence(justification=UNSPECIFIED_MAPPING, mapping_set=mapping_set)], ) for wikidata_id, xref_id in iter_wikidata_mappings(prop, cache=cache) ] + + +def _clean_xref_id(prefix: str, identifier: str) -> str: + if identifier.lower().startswith(f"{prefix}_"): + identifier = identifier[len(prefix) + 1 :] + return identifier
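For clarity, the new `_clean_xref_id` helper strips a redundant `PREFIX_` banana that some Wikidata cross-reference values carry before the identifier proper. A minimal usage sketch under that assumption; the example values are illustrative, not taken from a real Wikidata dump:

```python
# _clean_xref_id removes a leading "{prefix}_" (matched case-insensitively), if present
assert _clean_xref_id("fplx", "FPLX_RAS") == "RAS"
assert _clean_xref_id("mesh", "D001829") == "D001829"  # already clean: returned unchanged
```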
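A few editorial notes with illustrative sketches on other parts of this patch follow. First, on the landscape summary table in `notebooks/landscape/README.md`: the `reduction` column is the fraction of raw terms collapsed by mapping-based merging, i.e. 1 − unique_term_count / raw_term_count. A minimal check in Python, using the values from the disease row:

```python
def reduction(raw_term_count: int, unique_term_count: int) -> float:
    """Fraction of raw terms collapsed by merging equivalent terms."""
    return 1 - unique_term_count / raw_term_count

# disease row: 410,173 raw terms reduce to 243,730 unique terms
assert round(reduction(410_173, 243_730), 6) == 0.405787

# the gene row's unique count must therefore be about 48.8 million
# (consistent with the 4.87886e+07 in the table this patch replaces),
# not 207,019, which is the cell row's raw count
```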
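The per-landscape `configuration.json` files added above are serialized `semra.Configuration` models, so they can be loaded back with the same pydantic v1-style `parse_file` call that `landscape.py` uses in this patch. A minimal sketch, assuming a local checkout with the repository-relative path shown in the diff:

```python
from semra import Configuration

config = Configuration.parse_file("notebooks/landscape/anatomy/configuration.json")
print(config.name)          # SeMRA Anatomy Mappings Database
print(config.priority)      # ['uberon', 'mesh', 'bto', 'caro', 'ncit', 'umls']
print(config.zenodo_url())  # https://bioregistry.io/zenodo.record:11091803
```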
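The new Wikidata pass in `database.py` iterates over every Bioregistry prefix that has a mapped Wikidata property. A sketch of what that lookup yields, assuming `bioregistry.get_registry_map` behaves as it is used in the diff:

```python
import bioregistry

# maps Bioregistry prefixes to Wikidata property identifiers,
# e.g., "mesh" -> "P486" (the MeSH descriptor ID property)
wikidata_properties = bioregistry.get_registry_map("wikidata")
print(wikidata_properties.get("mesh"))
```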
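The Zenodo integration in `database.py` and `pipeline.py` reduces to two `zenodo_client` calls: `ensure_zenodo` for a first deposition (keyed locally so that reruns reuse the same record) and `update_zenodo` for pushing new versions of an existing record. A minimal standalone sketch; the key and file name here are hypothetical:

```python
from zenodo_client import Creator, Metadata, ensure_zenodo

metadata = Metadata(
    title="SeMRA Mapping Database",
    upload_type="dataset",
    description="A compendium of semantic mappings.",
    creators=[Creator(name="Charles Tapley Hoyt", orcid="0000-0003-4423-4370")],
)
# first deposition; sandbox=True targets sandbox.zenodo.org instead of production
res = ensure_zenodo(
    key="semra-demo",
    data=metadata,
    paths=["mappings.sssom.tsv.gz"],
    sandbox=True,
)
print(res.json()["links"]["html"])

# later, to attach updated files as a new version of an existing record:
# from zenodo_client import update_zenodo
# update_zenodo("<record-id>", paths=["mappings.sssom.tsv.gz"])
```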
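Finally, the `write_neo4j` changes switch to gzipped, tab-separated bulk-import files and move node labels from a `:LABEL` column into the `--nodes=<label>=<file>` import syntax. A minimal sketch of the resulting file shape, mirroring the `_write_tsv_gz` helper; the example row is illustrative:

```python
import csv
import gzip

# gzipped, tab-delimited, header row first -- the same pattern as _write_tsv_gz
with gzip.open("concept_nodes.tsv.gz", "wt") as file:
    writer = csv.writer(file, delimiter="\t")
    writer.writerow(["curie:ID", "prefix", "name", "priority:boolean"])
    writer.writerow(["mesh:D001829", "mesh", "Body Regions", "false"])

# neo4j-admin import reads the gzipped file directly, e.g.:
#   neo4j-admin import --delimiter='TAB' --nodes=concept=concept_nodes.tsv.gz ...
```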