Skip to content

Commit 22aecdf

Browse files
committed
Fix #69 - improve TSV import
1 parent 367811b commit 22aecdf

10 files changed

+157
-16
lines changed

deps.edn

+3-2
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
{org.clojure/clojure {:mvn/version "1.12.0"}
66
org.clojure/core.async {:mvn/version "1.6.681"}
77
org.clojure/core.match {:mvn/version "1.1.0"}
8+
org.clojure/data.csv {:mvn/version "1.1.0"}
89
org.clojure/data.xml {:mvn/version "0.2.0-alpha9"}
910
org.clojure/data.zip {:mvn/version "1.1.0"}
1011
org.clojure/tools.logging {:mvn/version "1.3.0"}
@@ -37,7 +38,7 @@
3738
:main-opts ["-m" "com.eldrix.hermes.cmd.core"]}
3839

3940
:dev
40-
{:extra-paths ["cmd" "test"]
41+
{:extra-paths ["cmd" "test/src" "test/resources"]
4142
:extra-deps {org.clojure/tools.cli {:mvn/version "1.1.230"}
4243
io.pedestal/pedestal.service {:mvn/version "0.7.1"}
4344
io.pedestal/pedestal.error {:mvn/version "0.7.1"}
@@ -54,7 +55,7 @@
5455
org.apache.lucene/lucene-backward-codecs {:mvn/version "10.0.0"}}}
5556

5657
:test
57-
{:extra-paths ["cmd" "test" "test/resources"]
58+
{:extra-paths ["cmd" "test/src" "test/resources"]
5859
:extra-deps {org.clojure/test.check {:mvn/version "1.1.1"}
5960
com.wsscode/pathom3 {:git/url "https://github.com/wilkerlucio/pathom3.git"
6061
:git/sha "2d9d1cf8ccfeee83566c31e776a5ef105b2a1626"}

src/com/eldrix/hermes/importer.clj

+30-14
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99
(ns com.eldrix.hermes.importer
1010
"Provides import functionality for processing directories of files"
1111
(:require [clojure.core.async :as a]
12+
[clojure.data.csv :as csv]
1213
[clojure.data.json :as json]
1314
[clojure.java.io :as io]
1415
[clojure.spec.alpha :as s]
@@ -73,7 +74,7 @@
7374
(merge (json/read-str (slurp f) :key-fn keyword :value-fn read-metadata-value)) ;; read in metadata
7475
(update :modules update-keys (fn [x] (-> x name parse-long)))) ;; return all module identifiers as longs
7576
(catch Throwable e (log/warn e "Invalid metadata in distribution file" (:name default))
76-
(assoc default :error "Invalid metadata in distribution file")))))
77+
(assoc default :error "Invalid metadata in distribution file")))))
7778

7879
(defn metadata-files
7980
"Returns a list of release package information files from the directory.
@@ -90,20 +91,35 @@
9091
(doall (->> (metadata-files dir)
9192
(map read-metadata))))
9293

93-
(defn- process-file
94+
(defprotocol SnomedFile
95+
(parse-filename [this] "Returns structured data about a SNOMED file"))
96+
97+
(extend-protocol SnomedFile
98+
String
99+
(parse-filename [s] (snomed/parse-snomed-filename s))
100+
File
101+
(parse-filename [f] (snomed/parse-snomed-filename (.getName f)))
102+
java.net.URL
103+
(parse-filename [url] (snomed/parse-snomed-filename (.getPath url)))
104+
nil
105+
(parse-filename [_] nil))
106+
107+
(defn process-file
94108
"Process the specified file, streaming batched results to the channel
95-
specified, blocking if channel not being drained.
109+
specified, blocking if channel not being drained.
110+
Parameters:
111+
- f : anything coercible using clojure.java.io/reader
96112
97113
Each batch is a map with keys
98114
- :type : a type of SNOMED component
99115
- :parser : a parser that can take each row and give you data
100116
- :headings : a sequence of headings from the original file
101117
- :data : a sequence of vectors, one per row, each containing that row's column values."
102-
[filename out-c & {:keys [batch-size] :or {batch-size 1000}}]
103-
(with-open [reader (io/reader filename)]
104-
(let [{:keys [identifier parser filename component]} (snomed/parse-snomed-filename filename)]
105-
(when parser
106-
(let [csv-data (map #(str/split % #"\t") (line-seq reader))
118+
[f out-c & {:keys [batch-size] :or {batch-size 1000}}]
119+
(let [{:keys [identifier parser filename component]} (parse-filename f)]
120+
(when parser
121+
(with-open [reader (io/reader f)]
122+
(let [csv-data (csv/read-csv reader :separator \tab)
107123
headings (first csv-data)
108124
data (rest csv-data)
109125
batches (->> data
@@ -141,12 +157,12 @@
141157
(a/>!! processed-c e)))
142158
(a/close! raw-c))
143159
(a/pipeline
144-
nthreads
145-
processed-c
146-
(map snomed/parse-batch)
147-
raw-c
148-
true
149-
(fn ex-handler [err] (log/debug "Error during import pipeline: " (ex-data err)) err))
160+
nthreads
161+
processed-c
162+
(map snomed/parse-batch)
163+
raw-c
164+
true
165+
(fn ex-handler [err] (log/debug "Error during import pipeline: " (ex-data err)) err))
150166
processed-c))
151167

152168
(defn load-snomed
+19
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
This is an example distribution with fragments of different release files for testing
2+
3+
Core terminology files
4+
======================
5+
6+
These are 'correct' versions of Concept, Description and Relationship files
7+
- ./Terminology/sct2_Concept_Snapshot_INT_20230131.txt
8+
- ./Terminology/sct2_Description_Snapshot-en_INT_20230131.txt
9+
- ./Terminology/sct2_RelationshipConcreteValues_Snapshot_INT_20230131.txt
10+
- ./Terminology/sct2_Relationship_Snapshot_INT_20230131.txt
11+
12+
Reference set files
13+
===================
14+
15+
- ./Refset/Map/der2_iisssccRefset_ExtendedMapSnapshot_INT_20230131.txt
16+
- This is a 'correct' International release version of an extended map
17+
18+
- ./Refset/Map/der2_ssRefset_SimpleMapWithDescriptionSnapshot_12345_20241021.txt
19+
- This is a 'custom' simple map reference set with null values for one of the columns
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
id effectiveTime active moduleId refsetId referencedComponentId mapGroup mapPriority mapRule mapAdvice mapTarget correlationId mapCategoryId
2+
00005b30-d2ad-5891-ae82-060d2e20a9fc 20150731 1 449080006 447562003 211339002 1 1 TRUE ALWAYS S80.8 | POSSIBLE REQUIREMENT FOR AN EXTERNAL CAUSE CODE S80.8 447561005 447637006
3+
00006f25-3157-5132-b658-25708c9f1290 20150731 1 449080006 447562003 37535007 1 1 TRUE ALWAYS Q99.9 Q99.9 447561005 447637006
4+
00009ee5-904c-5e9f-a67a-ffe16d847782 20150731 1 449080006 447562003 299741008 1 1 TRUE MAP SOURCE CONCEPT CANNOT BE CLASSIFIED WITH AVAILABLE DATA 447561005 447638001
5+
0000b488-4995-58e1-a542-a8fc22593548 20220331 1 449080006 447562003 1208339007 1 1 TRUE ALWAYS G11.1 | POSSIBLE REQUIREMENT FOR ADDITIONAL CODE TO FULLY DESCRIBE DISEASE OR CONDITION G11.1 447561005 447637006
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,3 @@
1+
id effectiveTime active moduleId refsetId referencedComponentId mapTarget mapTargetDescription
2+
000d91ce-aae4-4f9e-ad06-51be576becd6 20241021 1 195941000112101 22671000001102 1434181000001106 ABC01256Q
3+
00280f1a-4c71-4683-8cb4-3e19e97ff9d9 20241021 1 195941000112101 22671000001102 1291061000001109 ABC08795Q
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
id effectiveTime active moduleId definitionStatusId
2+
100005 20020131 0 900000000000207008 900000000000074008
3+
101009 20020131 1 900000000000207008 900000000000074008
4+
102002 20020131 1 900000000000207008 900000000000074008
5+
103007 20020131 1 900000000000207008 900000000000074008
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
id effectiveTime active moduleId conceptId languageCode typeId term caseSignificanceId
2+
101013 20170731 1 900000000000207008 126813005 en 900000000000013009 Neoplasm of anterior aspect of epiglottis 900000000000448009
3+
102018 20170731 1 900000000000207008 126814004 en 900000000000013009 Neoplasm of junctional region of epiglottis 900000000000448009
4+
103011 20170731 1 900000000000207008 126815003 en 900000000000013009 Neoplasm of lateral wall of oropharynx 900000000000448009
5+
104017 20170731 1 900000000000207008 126816002 en 900000000000013009 Neoplasm of posterior wall of oropharynx 900000000000448009
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
id effectiveTime active moduleId sourceId value relationshipGroup typeId characteristicTypeId modifierId
2+
13830203029 20210731 1 900000000000207008 830045007 #3 0 1142139005 900000000000011006 900000000000451002
3+
13830204024 20210731 1 900000000000207008 830064001 #3 0 1142139005 900000000000011006 900000000000451002
4+
13830205020 20210731 1 900000000000207008 830066004 #3 0 1142139005 900000000000011006 900000000000451002
5+
13830206021 20210731 1 900000000000207008 830108003 #1 0 1142139005 900000000000011006 900000000000451002
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,5 @@
1+
id effectiveTime active moduleId sourceId destinationId relationshipGroup typeId characteristicTypeId modifierId
2+
100022 20090731 0 900000000000207008 100000000 102272007 0 116680003 900000000000011006 900000000000451002
3+
101021 20020131 1 900000000000207008 10000006 29857009 0 116680003 900000000000011006 900000000000451002
4+
102025 20020131 1 900000000000207008 10000006 9972008 0 116680003 900000000000011006 900000000000451002
5+
103024 20030131 0 900000000000207008 1000004 19130008 0 116680003 900000000000011006 900000000000451002
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
(ns com.eldrix.hermes.importer-test
2+
(:require
3+
[clojure.core.async :as async]
4+
[clojure.java.io :as io]
5+
[clojure.spec.gen.alpha :as gen]
6+
[clojure.test :refer [deftest is testing]]
7+
[com.eldrix.hermes.importer :as importer]
8+
[com.eldrix.hermes.rf2 :as rf2])
9+
(:import (java.time LocalDate)))
10+
11+
(deftest parse-filename
12+
(testing "nil filename"
13+
(is (nil? (importer/parse-filename nil))))
14+
(testing "concept filename as string"
15+
(let [{:keys [format version-date content-subtype type country-code identifier component]} (importer/parse-filename "sct2_Concept_Snapshot_INT_20230131.txt")]
16+
(is (= "Concept" component))
17+
(is (= "INT" country-code))
18+
(is (= :info.snomed/Concept identifier))
19+
(is (= "2" format))
20+
(is (= "sct" type))
21+
(is (= (LocalDate/of 2023 1 31) version-date))))
22+
(testing "description filename as URL"
23+
(let [{:keys [identifier]} (importer/parse-filename (java.net.URL. "file://Terminology/sct2_Description_Snapshot-en_INT_20230131.txt"))]
24+
(is (= :info.snomed/Description identifier))))
25+
(testing "relationship concrete values filename as file"
26+
(let [{:keys [identifier]} (importer/parse-filename (io/file "./Terminology/sct2_RelationshipConcreteValues_Snapshot_INT_20230131.txt"))]
27+
(is (= :info.snomed/RelationshipConcreteValues identifier)))))
28+
29+
(defn import-file
30+
"Import a SNOMED file"
31+
[f]
32+
(let [ch (async/chan)]
33+
(async/thread
34+
(importer/process-file f ch)
35+
(async/close! ch))
36+
(async/<!! ch)))
37+
38+
(deftest import-concepts
39+
(let [{:keys [type parser headings data]} (import-file (io/resource "example-snapshot/Terminology/sct2_Concept_Snapshot_INT_20230131.txt"))]
40+
(is (= :info.snomed/Concept type))))
41+
42+
(deftest import-refset
43+
(let [{:keys [type parser headings data] :as f} (import-file (io/resource "example-snapshot/Refset/Map/der2_iisssccRefset_ExtendedMapSnapshot_INT_20230131.txt"))]
44+
(is (= :info.snomed/ExtendedMapRefset type))
45+
(is (= ["id" "effectiveTime" "active" "moduleId" "refsetId" "referencedComponentId"
46+
"mapGroup" "mapPriority" "mapRule" "mapAdvice" "mapTarget"
47+
"correlationId" "mapCategoryId"] headings))))
48+
49+
(deftest import-custom-refset-nil-values
50+
(let [{:keys [type parser headings data] :as f} (import-file (io/resource "example-snapshot/Refset/Map/der2_ssRefset_SimpleMapWithDescriptionSnapshot_12345_20241021.txt"))]
51+
(is (= :info.snomed/SimpleMapRefset type))
52+
(is (= ["id" "effectiveTime" "active" "moduleId" "refsetId" "referencedComponentId"
53+
"mapTarget" "mapTargetDescription"] headings))
54+
(is (= ["000d91ce-aae4-4f9e-ad06-51be576becd6" "20241021"
55+
"1" "195941000112101" "22671000001102"
56+
"1434181000001106" "ABC01256Q" ""] (first data))
57+
"Empty last column should be returned as empty string")))
58+
59+
(comment
60+
(require '[clojure.data.csv :as csv])
61+
(csv/read-csv "hi\tthere\tand\thow\tare\tyou?\t" :separator \tab)
62+
(def f (io/resource "example-snapshot/Terminology/sct2_Concept_Snapshot_INT_20230131.txt"))
63+
(type f)
64+
(io/as-file f)
65+
(importer/parse-filename "sct2_Concept.txt")
66+
(importer/parse-filename (java.net.URL. "https://wibble.com/sct_Concept_Snapshot_INT_20230131.txt"))
67+
(importer/parse-filename f)
68+
(importer/parse-filename nil)
69+
(def ch (async/chan))
70+
(async/thread
71+
(importer/process-file f ch)
72+
(async/close! ch))
73+
(def ch (importer/load-snomed (io/resource "example-snapshot/")))
74+
(async/<!! ch)
75+
76+
(gen/sample (rf2/gen-simple-map-refset {:fields [""]})))
77+

0 commit comments

Comments
 (0)