Commit 4692e55

Merge pull request #20 from kbase/schema_updates
Updating schema to break out experiments/protocols and bioentities
2 parents 15c6d8f + ab08eed

16 files changed, with 2932 additions and 1230 deletions

.github/workflows/linkml_tasks.yaml

Lines changed: 7 additions & 1 deletion
```diff
@@ -33,7 +33,7 @@ jobs:
       - name: Lint linkml file
         id: lint_linkml
         run: |
-          make lint
+          make lint-no-warn
         continue-on-error: true

       # - name: Validate sample data against the schema
@@ -54,6 +54,12 @@ jobs:
           make gendoc
         continue-on-error: true

+      # - name: Test pyspark code generation
+      #   id: test_pyspark
+      #   run: |
+      #     make pyspark
+      #   continue-on-error: true
+
       - name: outcome failure
         if: steps.lint_linkml.outcome != 'success' || steps.validate_linkml.outcome != 'success' || steps.test_docgen.outcome != 'success'
           # steps.test_sample_data.outcome != 'success' || steps.test_sample_data_jsonschema.outcome != 'success'
```

README.md

Lines changed: 2 additions & 3 deletions
```diff
@@ -8,6 +8,8 @@ KBase CDM schema in linkml format.

 ## Repository Structure

+Please note that the [Pyspark data structures](src/cdm_schema/kbase_cdm_pyspark.py) are the recommended way to create schema-compliant CDM classes.
+
 * [src/](src/) - source files
   * [cdm_schema](src/cdm_schema)
     * [kbase_cdm_pydantic.py](src/cdm_schema/kbase_cdm_pydantic.py) -- CDM schema as Pydantic classes
@@ -17,15 +19,12 @@ KBase CDM schema in linkml format.

 ## Developer Documentation

-<details>
 Use the `make` command to generate project artefacts:

 * `make gen-artefacts`: generate the Pydantic, python, and JSONschema versions of the schema

 * `make deploy`: deploys site

-</details>
-
 ## Credits

 This project was made with
```
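The note added above recommends the generated PySpark structures for building schema-compliant data. A minimal usage sketch: the module path comes from the README, while the exported mapping name (`spark_schemas`) and the table name (`Protocol`) are assumptions.

```python
# Hedged sketch: the module path is from the README; the exported mapping
# name ("spark_schemas") and the table name ("Protocol") are assumptions.
from pyspark.sql import SparkSession

from cdm_schema import kbase_cdm_pyspark

spark = SparkSession.builder.getOrCreate()

# An empty DataFrame whose columns, types, and nullability follow the CDM schema.
df = spark.createDataFrame([], schema=kbase_cdm_pyspark.spark_schemas["Protocol"])
df.printSchema()
```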

linkml_to_pyspark.py

Lines changed: 70 additions & 46 deletions
```diff
@@ -19,34 +19,58 @@

 logger = logging.getLogger(__name__)

+STRING = "STRING"
+BOOL = "BOOLEAN"
+FLOAT = "FLOAT"
+INT = "INTEGER"
+DATE = "DATE"
+TS = "TIMESTAMP"
+
 # Map LinkML types to PySpark types
 TYPE_MAP = {
-    "boolean": BooleanType(),
-    "xsd:boolean": BooleanType(),
+    "boolean": BOOL,
+    "xsd:boolean": BOOL,
     # numerical
-    "decimal": FloatType(),
-    "double": FloatType(),
-    "float": FloatType(),
-    "integer": IntegerType(),
-    "long": FloatType(),
+    "decimal": FLOAT,
+    "double": FLOAT,
+    "float": FLOAT,
+    "integer": INT,
+    "long": FLOAT,
     # dates and times
-    "date": DateType(),
-    "dateTime": DateType(),
-    "time": TimestampType(),
-    "xsd:date": DateType(),
-    "xsd:dateTime": DateType(),
-    "xsd:time": TimestampType(),
-    "linkml:DateOrDatetime": DateType(),
+    "date": DATE,
+    "dateTime": DATE,
+    "time": TS,
+    "xsd:date": DATE,
+    "xsd:dateTime": DATE,
+    "xsd:time": TS,
+    "linkml:DateOrDatetime": DATE,
     # string-like
-    "anyURI": StringType(),
-    "language": StringType(),
-    "string": StringType(),
-    "shex:nonLiteral": StringType(),
-    "shex:iri": StringType(),
+    "anyURI": STRING,
+    "language": STRING,
+    "str": STRING,
+    "string": STRING,
+    "shex:nonLiteral": STRING,
+    "shex:iri": STRING,
+}
+
+remap = {
+    STRING: StringType(),
+    BOOL: BooleanType(),
+    FLOAT: FloatType(),
+    INT: IntegerType(),
+    DATE: DateType(),
+    TS: TimestampType(),
 }


-def resolve_slot_range_class_relational(sv: SchemaView, class_name: str) -> set[DataType]:
+class SchemaViewWithProcessed(SchemaView):
+    def __init__(self, *args, **kwargs) -> None:
+        self.PROCESSED = {}
+        self.RANGE_TO_TYPE = {}
+        super().__init__(*args, **kwargs)
+
+
+def resolve_slot_range_class_relational(sv: SchemaViewWithProcessed, class_name: str) -> set[str]:
     """Generate the appropriate slot range for a given class.

     :param sv: the schema, via SchemaView
```
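The hunk above swaps the `DataType` instances in `TYPE_MAP` for string constants, and the new `remap` dict converts those constants back to PySpark types only when the output file is written. A minimal sketch of the two-stage lookup, assuming the `TYPE_MAP`, `remap`, and constant definitions from the diff:

```python
# Two-stage lookup sketch (assumes TYPE_MAP, remap, and STRING from the diff above).
# Stage 1: LinkML range name -> string constant; unknown ranges fall back to STRING.
dtype_name = TYPE_MAP.get("integer", STRING)  # "INTEGER"

# Stage 2: string constant -> PySpark DataType, deferred until the schema is written out.
spark_type = remap[dtype_name]  # IntegerType()
```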
```diff
@@ -66,19 +90,19 @@ def resolve_slot_range_class_relational(sv: SchemaView, class_name: str) -> set[
     if not class_id_slot.range:
         msg = f"Class {class_name} identifier {class_id_slot.name} has no range: defaulting to string"
         logger.warning(msg)
-        sv.PROCESSED[class_name] = StringType()
-        return {StringType()}
+        sv.RANGE_TO_TYPE[class_name] = STRING
+        return {STRING}

     if class_id_slot.range in sv.all_classes():
         msg = f"Class {class_id_slot.range} used as range for identifier slot of class {class_name}"
         logger.warning(msg)
-        sv.PROCESSED[class_name] = StringType()
-        return {StringType()}
+        sv.RANGE_TO_TYPE[class_name] = STRING
+        return {STRING}

     return resolve_slot_range(sv, class_name=class_name, slot_name=class_id_slot.name, slot_range=class_id_slot.range)


-def resolve_slot_range_type(sv: SchemaView, type_name: str) -> set[DataType]:
+def resolve_slot_range_type(sv: SchemaViewWithProcessed, type_name: str) -> set[str]:
     """Generate the appropriate slot range for a given type.

     :param sv: the schema, via SchemaView
@@ -94,14 +118,14 @@ def resolve_slot_range_type(sv: SchemaView, type_name: str) -> set[DataType]:
         msg = f"type {type_name} lacks base and uri fields"
         logger.warning(msg)
         # add it to the mapping
-        sv.PROCESSED[type_name] = StringType()
-        return {StringType()}
+        sv.RANGE_TO_TYPE[type_name] = STRING
+        return {STRING}

     type_uri = type_uri.removeprefix("xsd:")
-    return {TYPE_MAP.get(type_uri, StringType())}
+    return {TYPE_MAP.get(type_uri, STRING)}


-def resolve_slot_range(sv: SchemaView, class_name: str, slot_name: str, slot_range: str) -> set[DataType]:
+def resolve_slot_range(sv: SchemaViewWithProcessed, class_name: str, slot_name: str, slot_range: str) -> set[str]:
     """Generate the appropriate spark datatype for a given slot_range.

     :param sv: the schema, via SchemaView
@@ -115,8 +139,8 @@ def resolve_slot_range(sv: SchemaView, class_name: str, slot_name: str, slot_ran
     :return: set of spark datatype(s) to use
     :rtype: set[DataType]
     """
-    if slot_range in sv.PROCESSED:
-        return {sv.PROCESSED[slot_range]}
+    if slot_range in sv.RANGE_TO_TYPE:
+        return {sv.RANGE_TO_TYPE[slot_range]}

     if slot_range in sv.all_classes():
         return resolve_slot_range_class_relational(sv, slot_range)
@@ -126,20 +150,22 @@ def resolve_slot_range(sv: SchemaView, class_name: str, slot_name: str, slot_ran

     # resolve enums as strings for now
     if slot_range in sv.all_enums():
-        sv.PROCESSED[slot_range] = StringType()
-        return {StringType()}
+        sv.RANGE_TO_TYPE[slot_range] = STRING
+        return {STRING}

     if slot_range not in TYPE_MAP:
         msg = f"{class_name}.{slot_name} range {slot_range}: no type mapping found; using StringType()"
         logger.warning(msg)
         # add it to the mapping
-        sv.PROCESSED[slot_range] = StringType()
-        return {StringType()}
+        sv.RANGE_TO_TYPE[slot_range] = STRING
+        return {STRING}

     return {TYPE_MAP[slot_range]}


-def build_struct_for_class(sv: SchemaView, class_name: str) -> dict[str, tuple[str, DataType, bool]] | None:
+def build_struct_for_class(
+    sv: SchemaViewWithProcessed, class_name: str
+) -> dict[str, tuple[str, DataType, bool]] | None:
     """Generate the appropriate Spark schema for a class in a LinkML schema.

     :param sv: the schema, via SchemaView
@@ -184,7 +210,7 @@ def build_struct_for_class(sv: SchemaView, class_name: str) -> dict[str, tuple[s
         if len(slot_range_resolved) > 1:
             msg = f"WARNING: {class_name}.{slot.name}: more than one possible slot range: {', '.join(slot_range_resolved)}"
             logger.warning(msg)
-            slot_range_resolved = {StringType()}
+            slot_range_resolved = {STRING}

         if len(slot_range_resolved) == 0:
             msg = f"ERROR: {class_name}.{slot.name} slot_range_set length is 0"
```
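A side benefit of string-valued ranges shows up in the warning above: `', '.join(slot_range_resolved)` requires `str` elements, so it works on the new constants but would fail on a set of `DataType` instances. For illustration:

```python
from pyspark.sql.types import IntegerType, StringType

print(", ".join({"STRING", "INTEGER"}))  # OK (set order varies): INTEGER, STRING
# ", ".join({StringType(), IntegerType()})  # TypeError: sequence item 0: expected str instance
```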
```diff
@@ -204,7 +230,7 @@ def build_struct_for_class(sv: SchemaView, class_name: str) -> dict[str, tuple[s


 def generate_pyspark_from_sv(
-    sv: SchemaView, classes: list[str] | None = None
+    sv: SchemaViewWithProcessed, classes: list[str] | None = None
 ) -> dict[str, dict[str, tuple[DataType, bool]]]:
     """Generate pyspark tables from a LinkML schema.

@@ -215,7 +241,7 @@
     :param classes: list of class names to parse; defaults to None
     :type classes: list[str] | None
     :return: dictionary containing annotations for each field in each class of the schema, excluding abstract classes and mixins
-    :rtype: ddict[str, dict[str, tuple[DataType, bool]]]
+    :rtype: dict[str, dict[str, tuple[DataType, bool]]]
     """
     spark_schemas = {}

@@ -227,10 +253,12 @@ def generate_pyspark_from_sv(
     return spark_schemas


-def write_output(sv: SchemaView, output_path: Path, spark_schemas: dict[str, dict[str, tuple[DataType, bool]]]) -> None:
+def write_output(
+    sv: SchemaViewWithProcessed, output_path: Path, spark_schemas: dict[str, dict[str, tuple[DataType, bool]]]
+) -> None:
     indent = " " * 4
     # extract all the types from the StructFields
-    all_types = {dt for table_fields in spark_schemas.values() for dt, _ in table_fields.values()}
+    all_types = {remap[dt] for table_fields in spark_schemas.values() for dt, _ in table_fields.values()}
     header_material = [
         f'"""Automated conversion of {sv.schema.name} to PySpark."""',
         "",
@@ -247,7 +275,7 @@ def write_output(sv: SchemaView, output_path: Path, spark_schemas: dict[str, dic
         [
             f'{indent}"{table_name}": StructType([',
             *[
-                f'{indent}{indent}StructField("{name}", {dtype}, nullable={nullable}),'
+                f'{indent}{indent}StructField("{name}", {remap[dtype]}, nullable={nullable}),'
                 for name, (dtype, nullable) in table.items()
             ],
             f"{indent}" + "]),\n",
```
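In the f-string above, `remap[dtype]` turns each string constant back into a `DataType` whose repr (e.g. `StringType()`) is valid PySpark source, so the generated module contains entries shaped roughly like the sketch below; the table and field names here are hypothetical.

```python
# Illustrative shape of one generated entry; table and field names are hypothetical.
from pyspark.sql.types import StringType, StructField, StructType

schemas = {
    "Protocol": StructType([
        StructField("protocol_id", StringType(), nullable=False),
        StructField("description", StringType(), nullable=True),
    ]),
}
```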
```diff
@@ -259,10 +287,6 @@ def write_output(sv: SchemaView, output_path: Path, spark_schemas: dict[str, dic
     print(f"PySpark schema written to {output_path}")


-class SchemaViewWithProcessed(SchemaView):
-    PROCESSED = {}
-
-
 if __name__ == "__main__":
     sv = SchemaViewWithProcessed("./src/linkml/cdm_schema.yaml")

```
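The hunk above removes the old `SchemaViewWithProcessed`, which declared `PROCESSED = {}` as a class attribute; its replacement (first hunk of this file) builds the caches in `__init__` instead. The distinction matters because a mutable class attribute is shared by every instance, so cached ranges from one schema view would leak into the next. A self-contained illustration (not the project's code):

```python
class Shared:
    CACHE = {}  # a single dict shared by all instances


class PerInstance:
    def __init__(self) -> None:
        self.cache = {}  # a fresh dict for each instance


a, b = Shared(), Shared()
a.CACHE["x"] = 1
print(b.CACHE)  # {'x': 1}: state leaked between instances

c, d = PerInstance(), PerInstance()
c.cache["x"] = 1
print(d.cache)  # {}: isolated
```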
