diff --git a/index.md b/index.md new file mode 100644 index 0000000..2945620 --- /dev/null +++ b/index.md @@ -0,0 +1,123 @@ +# kitchen_sink + +Kitchen Sink Schema + +This schema does not do anything useful. It exists to test all features of linkml. + +This particular text field exists to demonstrate markdown within a text field: + +Lists: + + * a + * b + * c + +And links, e.g to [Person](Person.md) + +URI: https://w3id.org/linkml/tests/kitchen_sink + +Name: kitchen_sink + + + +## Classes + +| Class | Description | +| --- | --- | +| [Activity](Activity.md) | a provence-generating activity | +| [Address](Address.md) | None | +| [Agent](Agent.md) | a provence-generating agent | +| [Dataset](Dataset.md) | None | +| [Event](Event.md) | None | +|         [BirthEvent](BirthEvent.md) | None | +|         [EmploymentEvent](EmploymentEvent.md) | None | +|         [MarriageEvent](MarriageEvent.md) | None | +|         [MedicalEvent](MedicalEvent.md) | None | +| [HasAliases](HasAliases.md) | None | +| [Organization](Organization.md) | None | +|         [Company](Company.md) | None | +| [Person](Person.md) | None | +| [Place](Place.md) | None | +| [Relationship](Relationship.md) | None | +|         [FamilialRelationship](FamilialRelationship.md) | None | +| [WithLocation](WithLocation.md) | None | + + + +## Slots + +| Slot | Description | +| --- | --- | +| [acted_on_behalf_of](acted_on_behalf_of.md) | | +| [activities](activities.md) | | +| [activity_set](activity_set.md) | | +| [addresses](addresses.md) | | +| [age_in_years](age_in_years.md) | | +| [agent_set](agent_set.md) | | +| [aliases](aliases.md) | | +| [ceo](ceo.md) | | +| [city](city.md) | | +| [companies](companies.md) | | +| [description](description.md) | | +| [employed_at](employed_at.md) | | +| [ended_at_time](ended_at_time.md) | | +| [has_birth_event](has_birth_event.md) | | +| [has_employment_history](has_employment_history.md) | | +| [has_familial_relationships](has_familial_relationships.md) | | +| [has_marriage_history](has_marriage_history.md) | | +| [has_medical_history](has_medical_history.md) | | +| [id](id.md) | | +| [in_location](in_location.md) | | +| [is_current](is_current.md) | | +| [married_to](married_to.md) | | +| [name](name.md) | | +| [persons](persons.md) | | +| [related_to](related_to.md) | | +| [started_at_time](started_at_time.md) | | +| [street](street.md) | | +| [type](type.md) | | +| [used](used.md) | | +| [was_associated_with](was_associated_with.md) | | +| [was_generated_by](was_generated_by.md) | | +| [was_informed_by](was_informed_by.md) | | + + +## Enumerations + +| Enumeration | Description | +| --- | --- | +| [DiagnosisType](DiagnosisType.md) | | +| [FamilialRelationshipType](FamilialRelationshipType.md) | | + + +## Types + +| Type | Description | +| --- | --- | +| [Boolean](Boolean.md) | A binary (true or false) value | +| [Curie](Curie.md) | a compact URI | +| [Date](Date.md) | a date (year, month and day) in an idealized calendar | +| [DateOrDatetime](DateOrDatetime.md) | Either a date or a datetime | +| [Datetime](Datetime.md) | The combination of a date and time | +| [Decimal](Decimal.md) | A real number with arbitrary precision that conforms to the xsd:decimal speci... 
| +| [Double](Double.md) | A real number that conforms to the xsd:double specification | +| [Float](Float.md) | A real number that conforms to the xsd:float specification | +| [Integer](Integer.md) | An integer | +| [Jsonpath](Jsonpath.md) | A string encoding a JSON Path | +| [Jsonpointer](Jsonpointer.md) | A string encoding a JSON Pointer | +| [Ncname](Ncname.md) | Prefix part of CURIE | +| [Nodeidentifier](Nodeidentifier.md) | A URI, CURIE or BNODE that represents a node in a model | +| [Objectidentifier](Objectidentifier.md) | A URI or CURIE that represents an object in the model | +| [Sparqlpath](Sparqlpath.md) | A string encoding a SPARQL Property Path | +| [String](String.md) | A character string | +| [Time](Time.md) | A time object represents a (local) time of day, independent of any particular... | +| [Uri](Uri.md) | a complete URI | +| [Uriorcurie](Uriorcurie.md) | a URI or a CURIE | + + +## Subsets + +| Subset | Description | +| --- | --- | +| [SubsetA](SubsetA.md) | test subset A | +| [SubsetB](SubsetB.md) | test subset B | diff --git a/kitchen_sink.md b/kitchen_sink.md new file mode 100644 index 0000000..be6229b --- /dev/null +++ b/kitchen_sink.md @@ -0,0 +1,18 @@ +# kitchen_sink + +Kitchen Sink Schema + +This schema does not do anything useful. It exists to test all features of linkml. + +This particular text field exists to demonstrate markdown within a text field: + +Lists: + + * a + * b + * c + +And links, e.g to [Person](Person.md) + +URI: https://w3id.org/linkml/tests/kitchen_sink + diff --git a/linkml_solr/cli.py b/linkml_solr/cli.py index afff7d1..10f4bab 100644 --- a/linkml_solr/cli.py +++ b/linkml_solr/cli.py @@ -8,7 +8,9 @@ from linkml_runtime.linkml_model import SchemaDefinition from linkml_runtime.loaders import yaml_loader from linkml_solr import SolrQueryEngine, SolrEndpoint, DEFAULT_CORE, DEFAULT_SOLR_URL -from linkml_solr.utils.solr_bulkload import bulkload_file +from linkml_solr.utils.solr_bulkload import bulkload_file, bulkload_chunked, bulkload_duckdb +import requests +import time @click.group() @@ -44,6 +46,7 @@ def main(verbose: int, quiet: bool): show_default=True, help='solr core.') @click.option('--format', '-f', + type=click.Choice(['csv', 'json']), default='csv', show_default=True, help='input format.') @@ -52,17 +55,193 @@ def main(verbose: int, quiet: bool): help='solr url.') @click.option('--processor', '-p', help='Processor argument to pass when bulk loading to Solr') +@click.option('--chunk-size', '-c', + default=100000, + show_default=True, + help='Number of rows per chunk for large files') +@click.option('--parallel-workers', '-w', + default=None, + type=int, + help='Number of parallel workers (default: auto-detect based on CPU cores)') +@click.option('--chunked/--no-chunked', + default=False, + show_default=True, + help='Use chunked parallel loading for large files') +@click.option('--auto-configure/--no-auto-configure', + default=True, + show_default=True, + help='Automatically configure Solr for optimal performance') +@click.option('--ram-buffer', + default=2048, + show_default=True, + help='RAM buffer size in MB (used with auto-configure)') @click.argument('files', nargs=-1) -def bulkload(files, format, schema, url, core, processor=None): +def bulkload(files, format, schema, url, core, processor, chunk_size, parallel_workers, chunked, auto_configure, ram_buffer): + """ + Bulk load files into Solr with optional chunking and performance optimization + """ + if schema is not None: + with open(schema) as stream: + schema_obj = 
yaml_loader.load(stream, target_class=SchemaDefinition) + else: + schema_obj = None + + # Auto-configure Solr for performance if requested + if auto_configure: + print("Configuring Solr for optimal bulk loading performance...") + configure_solr_performance(url, core, ram_buffer, disable_autocommit=True) + + total_loaded = 0 + start_time = time.time() + + try: + for f in files: + print(f"Processing file: {f}") + + if chunked and format in ['csv', 'json']: + # Use chunked loading for large files + docs_loaded = bulkload_chunked( + csv_file=f, + base_url=url, + core=core, + schema=schema_obj, + chunk_size=chunk_size, + max_workers=parallel_workers, + format=format, + processor=processor + ) + total_loaded += docs_loaded + print(f"Loaded {docs_loaded} documents from {f}") + else: + # Use standard bulkload + bulkload_file(f, format=format, schema=schema_obj, core=core, base_url=url, processor=processor, commit=False) + print(f"Loaded file {f}") + + # Commit all changes at the end + print("Committing all changes...") + commit_start = time.time() + if commit_solr(url, core): + commit_time = time.time() - commit_start + total_time = time.time() - start_time + overall_docs_per_sec = total_loaded / total_time if total_time > 0 else 0 + print(f"Successfully committed {total_loaded} documents to Solr") + print(f"Total time: {total_time:.2f}s (commit: {commit_time:.2f}s)") + print(f"Overall throughput: {overall_docs_per_sec:,.0f} docs/sec") + else: + total_time = time.time() - start_time + overall_docs_per_sec = total_loaded / total_time if total_time > 0 else 0 + print("Warning: Commit may have failed") + print(f"Total time: {total_time:.2f}s") + print(f"Overall throughput: {overall_docs_per_sec:,.0f} docs/sec") + + except Exception as e: + total_time = time.time() - start_time + print(f"Error during bulk loading: {e}") + print(f"Total time before error: {total_time:.2f}s") + # Try to commit what we have + commit_solr(url, core) + raise + + +@main.command() +@click.option('--core', '-C', + default=DEFAULT_CORE, + show_default=True, + help='solr core.') +@click.option('--url', '-u', + default=DEFAULT_SOLR_URL, + help='solr url.') +@click.option('--schema', '-s', + help='Path to schema.') +@click.option('--chunk-size', '-c', + default=100000, + show_default=True, + help='Number of rows per chunk') +@click.option('--parallel-workers', '-w', + default=None, + type=int, + help='Number of parallel workers (default: auto-detect based on CPU cores)') +@click.option('--where', + help='SQL WHERE clause for filtering data') +@click.option('--columns', + help='Comma-separated list of columns to export (default: all)') +@click.option('--order-by', + help='SQL ORDER BY clause for consistent chunking') +@click.option('--auto-configure/--no-auto-configure', + default=True, + show_default=True, + help='Automatically configure Solr for optimal performance') +@click.option('--ram-buffer', + default=2048, + show_default=True, + help='RAM buffer size in MB (used with auto-configure)') +@click.option('--processor', '-p', + help='Processor argument to pass when bulk loading to Solr') +@click.argument('db_path') +@click.argument('table_name') +def bulkload_db(db_path, table_name, core, url, schema, chunk_size, parallel_workers, + where, columns, order_by, auto_configure, ram_buffer, processor): """ - Convert multiple golr yaml schemas to linkml + Bulk load data from DuckDB database to Solr with high-performance parallel processing + + DB_PATH: Path to the DuckDB database file + TABLE_NAME: Name of the table to export """ - 
inputs = {} if schema is not None: with open(schema) as stream: schema_obj = yaml_loader.load(stream, target_class=SchemaDefinition) - for f in files: - bulkload_file(f, format=format, schema=schema_obj, core=core, base_url=url, processor=processor) + else: + schema_obj = None + + # Auto-configure Solr for performance if requested + if auto_configure: + print("Configuring Solr for optimal bulk loading performance...") + configure_solr_performance(url, core, ram_buffer, disable_autocommit=True) + + start_time = time.time() + + try: + print(f"Loading from DuckDB: {db_path} → table: {table_name}") + + total_loaded = bulkload_duckdb( + db_path=db_path, + table_name=table_name, + base_url=url, + core=core, + schema=schema_obj, + chunk_size=chunk_size, + max_workers=parallel_workers, + where_clause=where, + columns=columns, + order_by=order_by, + processor=processor + ) + + # Commit all changes at the end + print("Committing all changes...") + commit_start = time.time() + if commit_solr(url, core): + commit_time = time.time() - commit_start + total_time = time.time() - start_time + overall_docs_per_sec = total_loaded / total_time if total_time > 0 else 0 + print(f"Successfully committed {total_loaded} documents to Solr") + print(f"Total time: {total_time:.2f}s (commit: {commit_time:.2f}s)") + print(f"Overall throughput: {overall_docs_per_sec:,.0f} docs/sec") + else: + total_time = time.time() - start_time + overall_docs_per_sec = total_loaded / total_time if total_time > 0 else 0 + print("Warning: Commit may have failed") + print(f"Total time: {total_time:.2f}s") + print(f"Overall throughput: {overall_docs_per_sec:,.0f} docs/sec") + + except Exception as e: + total_time = time.time() - start_time + print(f"Error during bulk loading: {e}") + print(f"Total time before error: {total_time:.2f}s") + # Try to commit what we have + commit_solr(url, core) + raise + @main.command() @click.option('--schema', '-s', @@ -95,7 +274,19 @@ def bulkload(files, format, schema, url, core, processor=None): default=DEFAULT_SOLR_URL, show_default=True, help='solr url.') -def start_server(schema, kill, container, url, core, port, sleep: int, create_schema): +@click.option('--memory', '-m', + default='4g', + show_default=True, + help='Docker memory limit (e.g., 4g, 8g)') +@click.option('--heap-size', '-j', + default='3g', + show_default=True, + help='JVM heap size (e.g., 2g, 4g)') +@click.option('--ram-buffer-mb', + default=512, + show_default=True, + help='Solr RAM buffer size in MB for bulk loading') +def start_server(schema, kill, container, url, core, port, sleep: int, create_schema, memory, heap_size, ram_buffer_mb): """ Starts a solr server (via Docker) """ @@ -109,6 +300,12 @@ def start_server(schema, kill, container, url, core, port, sleep: int, create_sc container, '-p', f'{port}:{port}', + '-m', + memory, + '-e', + f'SOLR_JAVA_MEM=-Xms{heap_size} -Xmx{heap_size}', + '-e', + f'SOLR_OPTS=-Dsolr.ramBufferSizeMB={ram_buffer_mb} -Dsolr.jetty.request.header.size=65535', 'solr:8', 'solr-precreate', core] @@ -193,5 +390,61 @@ def create_schema(schema, url, core, debug, dry_run, top_class): print(gen.serialize()) +def configure_solr_performance(url, core, ram_buffer_mb=2048, disable_autocommit=True): + """ + Configure Solr for optimal bulk loading performance + """ + config_url = f"{url}/{core}/config" + + if disable_autocommit: + # Disable autocommit + response = requests.post(config_url, + headers={'Content-type': 'application/json'}, + json={ + "set-property": { + "updateHandler.autoCommit.maxTime": -1, + 
"updateHandler.autoSoftCommit.maxTime": -1 + } + }) + if response.status_code == 200: + print(f"Disabled autocommit: {response.status_code}") + else: + print(f"Failed to disable autocommit: {response.status_code}") + print(f"Error response: {response.text}") + + # Note: RAM buffer size must be set via environment variables when starting Solr + print(f"Note: RAM buffer ({ram_buffer_mb}MB) should be set via SOLR_OPTS environment variable when starting Solr") + + +def commit_solr(url, core): + """ + Commit changes to Solr + """ + commit_url = f"{url}/{core}/update?commit=true" + response = requests.post(commit_url, headers={'Content-Type': 'application/json'}, json={}) + print(f"Committed changes: {response.status_code}") + return response.status_code == 200 + + +@main.command() +@click.option('--url', '-u', + default=DEFAULT_SOLR_URL, + help='solr url.') +@click.option('--core', '-C', + default=DEFAULT_CORE, + help='solr core.') +@click.option('--ram-buffer', + default=2048, + help='RAM buffer size in MB.') +@click.option('--enable/--disable', + default=False, + help='Enable or disable autocommit.') +def configure_performance(url, core, ram_buffer, enable): + """ + Configure Solr performance settings for bulk loading + """ + configure_solr_performance(url, core, ram_buffer, not enable) + + if __name__ == '__main__': main() diff --git a/linkml_solr/utils/solr_bulkload.py b/linkml_solr/utils/solr_bulkload.py index 8a087ff..7ccda5b 100644 --- a/linkml_solr/utils/solr_bulkload.py +++ b/linkml_solr/utils/solr_bulkload.py @@ -1,18 +1,69 @@ -from typing import List +from typing import List, Optional, Iterator import logging import subprocess +import duckdb +import tempfile +import json +import os +from concurrent.futures import ThreadPoolExecutor, as_completed +import threading +import time +import os from linkml_runtime.linkml_model.meta import SchemaDefinition, SlotDefinitionName +import requests + +# Global session for connection pooling +_session_lock = threading.Lock() +_global_session = None + +def get_http_session(): + """Get a shared HTTP session with connection pooling""" + global _global_session + with _session_lock: + if _global_session is None: + _global_session = requests.Session() + # Configure connection pooling for parallel uploads + adapter = requests.adapters.HTTPAdapter( + pool_connections=20, + pool_maxsize=20, + pool_block=True + ) + _global_session.mount('http://', adapter) + _global_session.mount('https://', adapter) + return _global_session def _get_multivalued_slots(schema: SchemaDefinition) -> List[SlotDefinitionName]: return [s.name for s in schema.slots.values() if s.multivalued] +def get_optimal_worker_count(max_workers: Optional[int] = None) -> int: + """ + Determine optimal number of parallel workers based on system capabilities + + :param max_workers: Maximum workers to use, None for auto-detection + :return: Optimal number of workers + """ + if max_workers is not None: + return max_workers + + # Get CPU count + cpu_count = os.cpu_count() or 4 # Fallback to 4 if detection fails + + # For I/O bound HTTP uploads, use CPU count + 50% but cap at reasonable limit + # This balances parallelism with system resource usage without being too aggressive + optimal = min(int(cpu_count * 1.5), 12) + + # Minimum of 2 workers for any benefit + return max(optimal, 2) + + def bulkload_file(f, format='csv', base_url=None, core=None, schema: SchemaDefinition = None, processor: str = None, + commit: bool = False, ): """ Bulkload a file using solr bulkload API @@ -30,7 +81,8 @@ def 
bulkload_file(f, separator = '%09' internal_separator = '%7C' parts = [f'f.{s}.split=true&f.{s}.separator={internal_separator}' for s in mvslots] - url = f'{base_url}/{core}/update?{"&".join(parts)}&commit=true&separator={separator}' + commit_param = 'true' if commit else 'false' + url = f'{base_url}/{core}/update?{"&".join(parts)}&commit={commit_param}&separator={separator}' if (processor is not None): url = f'{url}&processor={processor}' if format == 'csv': @@ -39,7 +91,430 @@ def bulkload_file(f, ct = 'application/json' else: raise Exception(f'Unknown format {format}') - command = ['curl', url, '-T', f'{f}', '-X', 'POST', '-H', f'Content-type:{ct}'] - print(command) - subprocess.run(command) + # Use direct HTTP with connection pooling for better performance + try: + session = get_http_session() + with open(f, 'rb') as file_data: + response = session.post(url, data=file_data, headers={'Content-Type': ct}, timeout=300) + print(f"Uploaded {f}: {response.status_code}") + if response.status_code != 200: + print(f"Error response: {response.text}") + return response.status_code == 200 + except Exception as e: + print(f"Error uploading {f}: {e}") + return False + + +def csv_to_json_chunk(csv_file: str, chunk_start: int, chunk_size: int, output_file: str) -> int: + """ + Convert a chunk of CSV/TSV data to JSON format using DuckDB + + :param csv_file: Input CSV/TSV file path + :param chunk_start: Starting row (0-based) + :param chunk_size: Number of rows to process + :param output_file: Output JSON file path + :return: Number of documents processed + """ + try: + conn = duckdb.connect() + + # Auto-detect separator and read chunk with DuckDB + sep = '\t' if csv_file.endswith('.tsv') else ',' + query = f""" + SELECT * FROM read_csv_auto('{csv_file}', + delim='{sep}', + ignore_errors=true, + header=true) + LIMIT {chunk_size} OFFSET {chunk_start} + """ + + result = conn.execute(query).fetchall() + columns = [desc[0] for desc in conn.description] + + # Convert to list of dicts for JSON + docs = [dict(zip(columns, row)) for row in result] + + with open(output_file, 'w') as f: + json.dump(docs, f) + + conn.close() + return len(docs) + except Exception as e: + print(f"Error processing chunk starting at {chunk_start}: {e}") + return 0 + + +def process_and_upload_chunk(csv_file: str, chunk_start: int, chunk_size: int, + base_url: str, core: str, schema: SchemaDefinition, + format: str, processor: str) -> int: + """ + Process a chunk (create temp file) and upload it to Solr in one parallel task + """ + temp_file = None + try: + if format == 'json': + temp_file = tempfile.NamedTemporaryFile(suffix='.json', delete=False) + temp_file.close() + docs_processed = csv_to_json_chunk(csv_file, chunk_start, chunk_size, temp_file.name) + else: + temp_file = tempfile.NamedTemporaryFile(suffix='.csv', delete=False) + temp_file.close() + docs_processed = _create_csv_chunk(csv_file, chunk_start, chunk_size, temp_file.name) + + if docs_processed > 0: + success = bulkload_file( + temp_file.name, + format=format, + base_url=base_url, + core=core, + schema=schema, + processor=processor, + commit=False + ) + if success: + print(f"Chunk {chunk_start}-{chunk_start+chunk_size}: {docs_processed} docs uploaded") + return docs_processed + else: + print(f"Failed to upload chunk {chunk_start}-{chunk_start+chunk_size}") + return 0 + else: + return 0 + + except Exception as e: + print(f"Error processing chunk {chunk_start}: {e}") + return 0 + finally: + # Clean up temp file + if temp_file and os.path.exists(temp_file.name): + try: + 
os.unlink(temp_file.name)
+            except:
+                pass
+
+
+def bulkload_chunked(csv_file: str,
+                     base_url: str,
+                     core: str,
+                     schema: SchemaDefinition,
+                     chunk_size: int = 100000,
+                     max_workers: Optional[int] = None,
+                     format: str = 'csv',
+                     processor: str = None) -> int:
+    """
+    Load a large CSV file in chunks with parallel processing
+
+    :param csv_file: Path to the CSV file
+    :param base_url: Solr base URL
+    :param core: Solr core name
+    :param schema: LinkML schema definition
+    :param chunk_size: Number of rows per chunk
+    :param max_workers: Number of parallel workers
+    :param format: Output format ('csv' or 'json')
+    :param processor: Solr processor to use
+    :return: Total number of documents loaded
+    """
+    # Get total row count using DuckDB (more reliable for large files)
+    conn = duckdb.connect()
+    sep = '\t' if csv_file.endswith('.tsv') else ','
+    count_query = f"""
+        SELECT COUNT(*) FROM read_csv_auto('{csv_file}',
+                                           delim='{sep}',
+                                           ignore_errors=true,
+                                           header=true)
+    """
+    total_rows = conn.execute(count_query).fetchone()[0]
+    conn.close()
+
+    # Auto-detect optimal worker count
+    actual_workers = get_optimal_worker_count(max_workers)
+    cpu_count = os.cpu_count() or 'unknown'
+
+    if max_workers is None:
+        print(f"Auto-detected {actual_workers} workers (CPU cores: {cpu_count})")
+
+    print(f"Processing {total_rows} rows in chunks of {chunk_size} with {actual_workers} parallel workers")
+
+    total_loaded = 0
+    preprocessing_start = time.time()
+
+    # Submit all chunk processing and upload tasks in parallel
+    with ThreadPoolExecutor(max_workers=actual_workers) as executor:
+        futures = []
+
+        for chunk_start in range(0, total_rows, chunk_size):
+            actual_chunk_size = min(chunk_size, total_rows - chunk_start)
+
+            # Submit the entire process+upload as one parallel task
+            future = executor.submit(
+                process_and_upload_chunk,
+                csv_file,
+                chunk_start,
+                actual_chunk_size,
+                base_url,
+                core,
+                schema,
+                format,
+                processor
+            )
+            futures.append(future)
+
+        preprocessing_time = time.time() - preprocessing_start
+        print(f"Preprocessing complete ({preprocessing_time:.2f}s) - starting parallel uploads...")
+        upload_start = time.time()
+
+        # Process results as they complete (truly parallel!)
+        for future in as_completed(futures):
+            try:
+                docs_loaded = future.result()
+                total_loaded += docs_loaded
+                print(f"Progress: {total_loaded}/{total_rows} documents loaded")
+            except Exception as e:
+                print(f"Error in parallel chunk processing: {e}")
+
+    upload_time = time.time() - upload_start
+    total_processing_time = time.time() - preprocessing_start
+
+    # Calculate throughput metrics
+    docs_per_sec = total_loaded / total_processing_time if total_processing_time > 0 else 0
+    upload_docs_per_sec = total_loaded / upload_time if upload_time > 0 else 0
+
+    print(f"Upload complete! 
Processing: {preprocessing_time:.2f}s, Upload: {upload_time:.2f}s, Total: {total_processing_time:.2f}s") + print(f"Throughput: {docs_per_sec:,.0f} docs/sec overall, {upload_docs_per_sec:,.0f} docs/sec upload") + + return total_loaded + + +def query_duckdb_chunk(db_path: str, query: str, offset: int, chunk_size: int) -> tuple: + """ + Query a chunk of data from DuckDB with read-only connection + + :param db_path: Path to DuckDB database + :param query: Base SQL query (without LIMIT/OFFSET) + :param offset: Starting row offset + :param chunk_size: Number of rows to fetch + :return: (results, columns) tuple + """ + try: + # Open read-only connection for safety + conn = duckdb.connect(db_path, read_only=True) + + # Add LIMIT/OFFSET to the query + chunk_query = f"{query} LIMIT {chunk_size} OFFSET {offset}" + + result = conn.execute(chunk_query) + rows = result.fetchall() + columns = [desc[0] for desc in result.description] + + conn.close() + return rows, columns + except Exception as e: + print(f"Error querying DuckDB chunk at offset {offset}: {e}") + return [], [] + + +def upload_duckdb_chunk(db_path: str, query: str, offset: int, chunk_size: int, + base_url: str, core: str, schema: SchemaDefinition, processor: Optional[str] = None) -> int: + """ + Query DuckDB chunk and upload directly to Solr + + :param db_path: Path to DuckDB database + :param query: Base SQL query + :param offset: Starting row offset + :param chunk_size: Number of rows to process + :param base_url: Solr base URL + :param core: Solr core name + :param schema: LinkML schema definition + :return: Number of documents uploaded + """ + try: + # Query the chunk + rows, columns = query_duckdb_chunk(db_path, query, offset, chunk_size) + + if not rows: + return 0 + + # Convert to JSON for Solr + docs = [dict(zip(columns, row)) for row in rows] + json_data = json.dumps(docs) + + # Upload directly to Solr + session = get_http_session() + url = f'{base_url}/{core}/update/json/docs?commit=false' + if processor is not None: + url = f'{url}&processor={processor}' + + response = session.post(url, data=json_data, + headers={'Content-Type': 'application/json'}, + timeout=300) + + if response.status_code == 200: + print(f"Uploaded chunk offset {offset}: {len(docs)} docs") + return len(docs) + else: + print(f"Error uploading chunk offset {offset}: {response.status_code}") + print(f"Error response: {response.text}") + return 0 + + except Exception as e: + print(f"Error processing DuckDB chunk at offset {offset}: {e}") + return 0 + + +def bulkload_duckdb(db_path: str, + table_name: str, + base_url: str, + core: str, + schema: SchemaDefinition, + chunk_size: int = 100000, + max_workers: Optional[int] = None, + where_clause: Optional[str] = None, + columns: Optional[str] = None, + order_by: Optional[str] = None, + processor: Optional[str] = None) -> int: + """ + Load data from DuckDB database to Solr with parallel processing + + :param db_path: Path to DuckDB database file + :param table_name: Name of table to export + :param base_url: Solr base URL + :param core: Solr core name + :param schema: LinkML schema definition + :param chunk_size: Number of rows per chunk + :param max_workers: Number of parallel workers (None for auto-detect) + :param where_clause: Optional SQL WHERE clause + :param columns: Optional comma-separated column list + :param order_by: Optional SQL ORDER BY clause + :return: Total number of documents loaded + """ + try: + # Build SQL query + column_list = columns if columns else "*" + base_query = f"SELECT {column_list} FROM 
{table_name}" + + if where_clause: + base_query += f" WHERE {where_clause}" + + if order_by: + base_query += f" ORDER BY {order_by}" + + # Get total row count with read-only connection + conn = duckdb.connect(db_path, read_only=True) + count_query = f"SELECT COUNT(*) FROM {table_name}" + if where_clause: + count_query += f" WHERE {where_clause}" + + total_rows = conn.execute(count_query).fetchone()[0] + conn.close() + + # Auto-detect optimal worker count + actual_workers = get_optimal_worker_count(max_workers) + cpu_count = os.cpu_count() or 'unknown' + + if max_workers is None: + print(f"Auto-detected {actual_workers} workers (CPU cores: {cpu_count})") + + print(f"Processing {total_rows} rows from DuckDB in chunks of {chunk_size} with {actual_workers} parallel workers") + print(f"Query: {base_query}") + + total_loaded = 0 + processing_start = time.time() + + # Process chunks in batches to control memory usage + with ThreadPoolExecutor(max_workers=actual_workers) as executor: + # Create list of all chunk offsets + chunk_offsets = list(range(0, total_rows, chunk_size)) + total_chunks = len(chunk_offsets) + + print(f"Processing {total_chunks} chunks in batches of {actual_workers} workers") + upload_start = time.time() + + # Process chunks in batches of worker count + for batch_start in range(0, len(chunk_offsets), actual_workers): + batch_end = min(batch_start + actual_workers, len(chunk_offsets)) + batch_offsets = chunk_offsets[batch_start:batch_end] + + print(f"Processing batch {batch_start//actual_workers + 1}/{(total_chunks + actual_workers - 1)//actual_workers}: chunks {batch_start + 1}-{batch_end}") + + # Submit current batch + futures = [] + for offset in batch_offsets: + future = executor.submit( + upload_duckdb_chunk, + db_path, + base_query, + offset, + chunk_size, + base_url, + core, + schema, + processor + ) + futures.append(future) + + # Wait for current batch to complete + for future in as_completed(futures): + try: + docs_loaded = future.result() + total_loaded += docs_loaded + print(f"Progress: {total_loaded:,}/{total_rows:,} documents loaded") + except Exception as e: + print(f"Error in parallel DuckDB chunk processing: {e}") + + upload_time = time.time() - upload_start + total_processing_time = time.time() - processing_start + + # Calculate throughput metrics + docs_per_sec = total_loaded / total_processing_time if total_processing_time > 0 else 0 + upload_docs_per_sec = total_loaded / upload_time if upload_time > 0 else 0 + + print(f"DuckDB upload complete! 
Upload: {upload_time:.2f}s, Total: {total_processing_time:.2f}s") + print(f"Throughput: {docs_per_sec:,.0f} docs/sec overall, {upload_docs_per_sec:,.0f} docs/sec upload") + + return total_loaded + + except Exception as e: + print(f"Error in DuckDB bulk loading: {e}") + return 0 + + +def _create_csv_chunk(csv_file: str, chunk_start: int, chunk_size: int, output_file: str) -> int: + """ + Create a CSV/TSV chunk file with header using DuckDB + + :param csv_file: Input CSV/TSV file path + :param chunk_start: Starting row (0-based) + :param chunk_size: Number of rows to process + :param output_file: Output CSV file path + :return: Number of rows processed + """ + try: + conn = duckdb.connect() + + # Auto-detect separator + sep = '\t' if csv_file.endswith('.tsv') else ',' + + # Export chunk directly to CSV + query = f""" + COPY ( + SELECT * FROM read_csv_auto('{csv_file}', + delim='{sep}', + ignore_errors=true, + header=true) + LIMIT {chunk_size} OFFSET {chunk_start} + ) TO '{output_file}' (FORMAT CSV, DELIMITER '\t', HEADER true) + """ + + conn.execute(query) + + # Count rows to return + count_query = f""" + SELECT COUNT(*) FROM read_csv_auto('{output_file}', header=true, ignore_errors=true) + """ + count = conn.execute(count_query).fetchone()[0] + + conn.close() + return count + except Exception as e: + print(f"Error creating CSV chunk starting at {chunk_start}: {e}") + return 0 diff --git a/model.yaml b/model.yaml new file mode 100644 index 0000000..b66b48b --- /dev/null +++ b/model.yaml @@ -0,0 +1,781 @@ +id: https://w3id.org/monarch/monarch-py +name: monarch-py +description: Data models for the Monarch Initiative data access library +prefixes: + linkml: https://w3id.org/linkml/ + biolink: https://w3id.org/biolink/vocab/ +imports: + - linkml:types + - similarity +default_range: string + +enums: + AssociationDirectionEnum: + description: >- + The directionality of an association as it relates to a specified entity, with edges being categorized + as incoming or outgoing + permissible_values: + incoming: + description: >- + An association for which a specified entity is the object or part of the object closure + outgoing: + description: >- + An association for which a specified entity is the subject or part of the subject closure + +classes: + Association: + slots: + - id + - category + - subject + - original_subject + - subject_namespace + - subject_category + - subject_closure + - subject_label + - subject_closure_label + - subject_taxon + - subject_taxon_label + - predicate + - object + - original_object + - object_namespace + - object_category + - object_closure + - object_label + - object_closure_label + - object_taxon + - object_taxon_label + - primary_knowledge_source + - aggregator_knowledge_source + - negated + - pathway + - evidence_count + - knowledge_level + - agent_type + - has_evidence + - has_evidence_links + - has_count + - has_total + - has_percentage + - has_quotient + - grouping_key + - provided_by + - provided_by_link + - publications + - publications_links + - frequency_qualifier + - onset_qualifier + - sex_qualifier + - stage_qualifier + - qualifiers + - qualifiers_label + - qualifiers_namespace + - qualifiers_category + - qualifiers_closure + - qualifiers_closure_label + - qualifier + - qualifier_label + - qualifier_namespace + - qualifier_category + - qualifier_closure + - qualifier_closure_label + - frequency_qualifier_label + - frequency_qualifier_namespace + - frequency_qualifier_category + - frequency_qualifier_closure + - frequency_qualifier_closure_label + - 
onset_qualifier_label + - onset_qualifier_namespace + - onset_qualifier_category + - onset_qualifier_closure + - onset_qualifier_closure_label + - sex_qualifier_label + - sex_qualifier_namespace + - sex_qualifier_category + - sex_qualifier_closure + - sex_qualifier_closure_label + - stage_qualifier_label + - stage_qualifier_namespace + - stage_qualifier_category + - stage_qualifier_closure + - stage_qualifier_closure_label + AssociationCount: + is_a: FacetValue + slots: + - category + slot_usage: + category: + multivalued: false + AssociationCountList: + description: Container class for a list of association counts + slots: + - items + slot_usage: + items: + range: AssociationCount + AssociationResults: + is_a: Results + slots: + - items + slot_usage: + items: + range: Association + CompactAssociation: + slots: + - category + - subject + - subject_label + - predicate + - object + - object_label + - negated + CompactAssociationResults: + is_a: Results + slots: + - items + slot_usage: + items: + range: CompactAssociation + AssociationTableResults: + is_a: Results + slots: + - items + slot_usage: + items: + range: DirectionalAssociation + AssociationTypeMapping: + description: >- + A data class to hold the necessary information to produce association type counts for given + entities with appropriate directional labels + slots: + - subject_label + - object_label + - symmetric + - category + slot_usage: + subject_label: + description: A label to describe the subjects of the association type as a whole for use in the UI + object_label: + description: A label to describe the objects of the association type as a whole for use in the UI + symmetric: + description: >- + Whether the association type is symmetric, meaning that the subject and object labels should be + interchangeable + ifabsent: false + required: true + category: + description: The biolink category to use in queries for this association type + required: true + multivalued: false + CategoryGroupedAssociationResults: + is_a: Results + slots: + - counterpart_category + - items + slot_usage: + items: + range: Association + DirectionalAssociation: + is_a: Association + description: >- + An association that gives it's direction relative to a specified entity + slots: + - direction + ExpandedCurie: + description: A curie bundled along with its expanded url + slots: + - id + - url + Entity: + description: Represents an Entity in the Monarch KG data model + slots: + - id + - category + - name + - full_name + - deprecated + - description + - xref + - provided_by + - in_taxon + - in_taxon_label + - symbol + - synonym + - uri + - iri + - namespace + - has_phenotype + - has_phenotype_label + - has_phenotype_closure + - has_phenotype_closure_label + - has_phenotype_count + EntityResults: + is_a: Results + slots: + - items + slot_usage: + items: + range: Entity + FacetValue: + slots: + - label + - count + FacetField: + slots: + - label + - facet_values + HistoPheno: + slots: + - id + - items + slot_usage: + items: + range: HistoBin + HistoBin: + is_a: FacetValue + slots: + - id + Mapping: + description: >- + A minimal class to hold a SSSOM mapping + slots: + - subject_id + - subject_label + - predicate_id + - object_id + - object_label + - mapping_justification + - id + MappingResults: + description: SSSOM Mappings returned as a results collection + is_a: Results + slots: + - items + slot_usage: + items: + range: Mapping + MultiEntityAssociationResults: + is_a: Results + slots: + - id + - name + - associated_categories + Node: + description: UI 
container class extending Entity with additional information + is_a: Entity + slots: + - in_taxon + - in_taxon_label + - inheritance + - causal_gene + - causes_disease + - mappings + - external_links + - provided_by_link + - association_counts + - node_hierarchy + NodeHierarchy: + slots: + - super_classes + - sub_classes + Release: + description: >- + A class to hold information about a release of the Monarch KG + slots: + - version + - url + - kg + - sqlite + - solr + - neo4j + - metadata + - graph_stats + - qc_report + Results: + abstract: true + slots: + - limit + - offset + - total + SearchResult: + is_a: Entity + slots: + - highlight + - score + slot_usage: + category: + required: true + name: + required: true + SearchResults: + is_a: Results + slots: + - items + - facet_fields + - facet_queries + slot_usage: + items: + range: SearchResult + TextAnnotationResult: + slots: + - text + - tokens + - start + - end + +slots: + aggregator_knowledge_source: + multivalued: true + association_counts: + range: AssociationCount + multivalued: true + inlined_as_list: true + required: true + associated_categories: + range: CategoryGroupedAssociationResults + multivalued: true + inlined_as_list: true + required: true + category: + multivalued: false + causal_gene: + description: >- + A list of genes that are known to be causally associated with a disease + range: Entity + multivalued: true + inlined_as_list: true + causes_disease: + description: >- + A list of diseases that are known to be causally associated with a gene + range: Entity + multivalued: true + inlined_as_list: true + count: + description: count of documents + range: integer + counterpart_category: + description: >- + The category of the counterpart entity in a given association, + eg. the category of the entity that is not the subject + range: string + deprecated: + description: >- + A boolean flag indicating that an entity is no longer considered current or valid. + range: boolean + exact_mappings: + - oboInOwl:ObsoleteClass + description: + range: string + direction: + description: >- + The directionality of the association relative to a given entity for an association_count. + If the entity is the subject or in the subject closure, the direction is forwards, if it is + the object or in the object closure, the direction is backwards. + range: AssociationDirectionEnum + required: true + evidence_count: + description: count of supporting documents, evidence codes, and sources supplying evidence + range: integer + knowledge_level: + description: >- + Describes the level of knowledge expressed in a statement, based on the + reasoning or analysis methods used to generate the statement, or the + scope or specificity of what the statement expresses to be true. + slot_uri: biolink:knowledge_level + notes: >- + The range in this schema is represented as a string, but is constrained + to values from biolink:KnowledgeLevelEnum at ingest time + range: string + multivalued: false + required: true + agent_type: + description: >- + Describes the high-level category of agent who originally generated a + statement of knowledge or other type of information. 
+ slot_uri: biolink:agent_type + notes: >- + The range in this schema is represented as a string, but is constrained + to values from biolink:AgentTypeEnum at ingest time + range: string + multivalued: false + required: true + external_links: + description: ExpandedCurie with id and url for xrefs + range: ExpandedCurie + multivalued: true + inlined_as_list: true + facet_fields: + description: Collection of facet field responses with the field values and counts + inlined: true + inlined_as_list: true + multivalued: true + range: FacetField + facet_queries: + description: Collection of facet query responses with the query string values and counts + inlined: true + inlined_as_list: true + multivalued: true + range: FacetValue + facet_values: + description: Collection of FacetValue label/value instances belonging to a FacetField + inlined: true + inlined_as_list: true + multivalued: true + range: FacetValue + frequency_qualifier: + range: string + full_name: + description: The long form name of an entity + range: string + grouping_key: + description: A concatenation of fields used to group associations with the same essential/defining properties + range: string + has_count: + description: count of out of has_total representing a frequency + range: integer + has_total: + description: total, devided by has_count, representing a frequency + range: integer + has_percentage: + description: percentage, which may be calculated from has_count and has_total, as 100 * quotient or provided directly, rounded to the integer level + range: float + has_quotient: + description: quotient, which should be 1/100 of has_percentage + range: float + has_evidence: + range: string + multivalued: true + has_evidence_links: + description: List of ExpandedCuries with id and url for evidence + range: ExpandedCurie + multivalued: true + inlined: true + inlined_as_list: true + has_phenotype: + description: >- + A list of phenotype identifiers that are known to be associated with this entity + range: string + multivalued: true + inlined_as_list: true + has_phenotype_label: + description: >- + A list of phenotype labels that are known to be associated with this entity + range: string + multivalued: true + inlined_as_list: true + has_phenotype_closure: + description: >- + A list of phenotype identifiers that are known to be associated with this entity expanded to include all ancestors + range: string + multivalued: true + inlined_as_list: true + has_phenotype_closure_label: + description: >- + A list of phenotype labels that are known to be associated with this entity expanded to include all ancestors + range: string + multivalued: true + inlined_as_list: true + has_phenotype_count: + description: >- + A count of the number of phenotypes that are known to be associated with this entity + range: integer + highlight: + description: matching text snippet containing html tags + range: string + id: + identifier: true + range: string + required: true + in_taxon: + description: The biolink taxon that the entity is in the closure of. + range: string + in_taxon_label: + description: The label of the biolink taxon that the entity is in the closure of. 
+ range: string + inheritance: + range: Entity + inlined: true + items: + description: A collection of items, with the type to be overriden by slot_usage + range: string + inlined: true + inlined_as_list: true + multivalued: true + required: true + knowledge_source: + multivalued: true + label: + range: string + required: true + limit: + description: number of items to return in a response + range: integer + required: true + name: + range: string + namespace: + range: string + description: The namespace/prefix portion of this entity's identifier + negated: + range: boolean + node_hierarchy: + range: NodeHierarchy + inlined: true + object: + range: string + required: true + offset: + description: offset into the total number of items + range: integer + required: true + onset_qualifier: + range: string + original_object: + range: string + original_subject: + range: string + pathway: + range: string + predicate: + multivalued: false + range: string + required: true + primary_knowledge_source: + range: string + provided_by_link: + description: A link to the docs for the knowledge source that provided the node/edge. + range: ExpandedCurie + inlined: true + provided_by: + range: string + publications: + multivalued: true + publications_links: + description: List of ExpandedCuries with id and url for publications + range: ExpandedCurie + multivalued: true + inlined: true + inlined_as_list: true + score: + range: float + sex_qualifier: + range: string + stage_qualifier: + range: string + subject: + range: string + required: true + sub_classes: + range: Entity + multivalued: true + inlined: true + inlined_as_list: true + required: true + super_classes: + range: Entity + multivalued: true + inlined: true + inlined_as_list: true + required: true + symbol: + range: string + symmetric: + description: >- + Whether the association type is symmetric, i.e. the subject and object labels are interchangeable. 
+ range: boolean + synonym: + multivalued: true + total: + description: total number of items matching a query + range: integer + required: true + xref: + multivalued: true + range: string + uri: + description: The URI of the entity + url: + range: string + iri: + range: string + subject_label: + is_a: name + description: The name of the subject entity + subject_namespace: + range: string + description: The namespace/prefix of the subject entity + subject_category: + is_a: category + description: The category of the subject entity + subject_closure: + multivalued: true + description: Field containing subject id and the ids of all of it's ancestors + subject_closure_label: + multivalued: true + description: Field containing subject name and the names of all of it's ancestors + subject_taxon: + is_a: in_taxon + subject_taxon_label: + is_a: in_taxon_label + object_label: + is_a: name + description: The name of the object entity + object_namespace: + range: string + description: The namespace/prefix of the object entity + object_category: + is_a: category + description: The category of the object entity + object_closure: + multivalued: true + description: Field containing object id and the ids of all of it's ancestors + object_closure_label: + multivalued: true + description: Field containing object name and the names of all of it's ancestors + object_taxon: + is_a: in_taxon + object_taxon_label: + is_a: in_taxon_label + qualifiers: + multivalued: true + qualifiers_label: + is_a: name + description: The name of the frequency_qualifier entity + qualifiers_namespace: + range: string + description: The namespace/prefix of the frequency_qualifier entity + qualifiers_category: + is_a: category + description: The category of the frequency_qualifier entity + qualifiers_closure: + multivalued: true + description: Field containing frequency_qualifier id and the ids of all of it's ancestors + qualifiers_closure_label: + multivalued: true + description: Field containing frequency_qualifier name and the names of all of it's ancestors + qualifier: + multivalued: true + qualifier_label: + is_a: name + description: The name of the frequency_qualifier entity + qualifier_namespace: + range: string + description: The namespace/prefix of the frequency_qualifier entity + qualifier_category: + is_a: category + description: The category of the frequency_qualifier entity + qualifier_closure: + multivalued: true + description: Field containing frequency_qualifier id and the ids of all of it's ancestors + qualifier_closure_label: + multivalued: true + description: Field containing frequency_qualifier name and the names of all of it's ancestors + frequency_qualifier_label: + is_a: name + description: The name of the frequency_qualifier entity + frequency_qualifier_namespace: + range: string + description: The namespace/prefix of the frequency_qualifier entity + frequency_qualifier_category: + is_a: category + description: The category of the frequency_qualifier entity + frequency_qualifier_closure: + multivalued: true + description: Field containing frequency_qualifier id and the ids of all of it's ancestors + frequency_qualifier_closure_label: + multivalued: true + description: Field containing frequency_qualifier name and the names of all of it's ancestors + onset_qualifier_label: + is_a: name + description: The name of the onset_qualifier entity + onset_qualifier_namespace: + range: string + description: The namespace/prefix of the onset_qualifier entity + onset_qualifier_category: + is_a: category + description: 
The category of the onset_qualifier entity + onset_qualifier_closure: + multivalued: true + description: Field containing onset_qualifier id and the ids of all of it's ancestors + onset_qualifier_closure_label: + multivalued: true + description: Field containing onset_qualifier name and the names of all of it's ancestors + sex_qualifier_label: + is_a: name + description: The name of the sex_qualifier entity + sex_qualifier_namespace: + range: string + description: The namespace/prefix of the sex_qualifier entity + sex_qualifier_category: + is_a: category + description: The category of the sex_qualifier entity + sex_qualifier_closure: + multivalued: true + description: Field containing sex_qualifier id and the ids of all of it's ancestors + sex_qualifier_closure_label: + multivalued: true + description: Field containing sex_qualifier name and the names of all of it's ancestors + stage_qualifier_label: + is_a: name + description: The name of the stage_qualifier entity + stage_qualifier_namespace: + range: string + description: The namespace/prefix of the stage_qualifier entity + stage_qualifier_category: + is_a: category + description: The category of the stage_qualifier entity + stage_qualifier_closure: + multivalued: true + description: Field containing stage_qualifier id and the ids of all of it's ancestors + stage_qualifier_closure_label: + multivalued: true + description: Field containing stage_qualifier name and the names of all of it's ancestors + + # sssom slots + mappings: + description: List of ExpandedCuries with id and url for mapped entities + range: ExpandedCurie + multivalued: true + inlined_as_list: true + subject_id: + range: string + required: true + # subject label is already included in this schema + predicate_id: + range: string + required: true + object_id: + range: string + required: true + # object label is already included in this schema + mapping_justification: + range: string + + # Text annotation + text: + description: text without tokens + range: string + inlined: true + tokens: + description: A collection of entities or concepts + range: Entity + inlined: true + inlined_as_list: true + multivalued: true + start: + description: start position of the annotation + range: integer + end: + description: end position of the annotation + range: integer + + # release slots + version: string + kg: string + sqlite: string + solr: string + neo4j: string + metadata: string + graph_stats: string + qc_report: string \ No newline at end of file diff --git a/poetry.lock b/poetry.lock index 80ca592..4df6e58 100644 --- a/poetry.lock +++ b/poetry.lock @@ -269,6 +269,51 @@ files = [ {file = "distlib-0.3.8.tar.gz", hash = "sha256:1530ea13e350031b6312d8580ddb6b27a104275a31106523b8f123787f494f64"}, ] +[[package]] +name = "duckdb" +version = "1.3.2" +description = "DuckDB in-process database" +optional = false +python-versions = ">=3.7.0" +files = [ + {file = "duckdb-1.3.2-cp310-cp310-macosx_12_0_arm64.whl", hash = "sha256:14676651b86f827ea10bf965eec698b18e3519fdc6266d4ca849f5af7a8c315e"}, + {file = "duckdb-1.3.2-cp310-cp310-macosx_12_0_universal2.whl", hash = "sha256:e584f25892450757919639b148c2410402b17105bd404017a57fa9eec9c98919"}, + {file = "duckdb-1.3.2-cp310-cp310-macosx_12_0_x86_64.whl", hash = "sha256:84a19f185ee0c5bc66d95908c6be19103e184b743e594e005dee6f84118dc22c"}, + {file = "duckdb-1.3.2-cp310-cp310-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:186fc3f98943e97f88a1e501d5720b11214695571f2c74745d6e300b18bef80e"}, + {file = 
"duckdb-1.3.2-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:6b7e6bb613b73745f03bff4bb412f362d4a1e158bdcb3946f61fd18e9e1a8ddf"}, + {file = "duckdb-1.3.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:1c90646b52a0eccda1f76b10ac98b502deb9017569e84073da00a2ab97763578"}, + {file = "duckdb-1.3.2-cp310-cp310-win_amd64.whl", hash = "sha256:4cdffb1e60defbfa75407b7f2ccc322f535fd462976940731dfd1644146f90c6"}, + {file = "duckdb-1.3.2-cp311-cp311-macosx_12_0_arm64.whl", hash = "sha256:e1872cf63aae28c3f1dc2e19b5e23940339fc39fb3425a06196c5d00a8d01040"}, + {file = "duckdb-1.3.2-cp311-cp311-macosx_12_0_universal2.whl", hash = "sha256:db256c206056468ae6a9e931776bdf7debaffc58e19a0ff4fa9e7e1e82d38b3b"}, + {file = "duckdb-1.3.2-cp311-cp311-macosx_12_0_x86_64.whl", hash = "sha256:1d57df2149d6e4e0bd5198689316c5e2ceec7f6ac0a9ec11bc2b216502a57b34"}, + {file = "duckdb-1.3.2-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:54f76c8b1e2a19dfe194027894209ce9ddb073fd9db69af729a524d2860e4680"}, + {file = "duckdb-1.3.2-cp311-cp311-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:45bea70b3e93c6bf766ce2f80fc3876efa94c4ee4de72036417a7bd1e32142fe"}, + {file = "duckdb-1.3.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:003f7d36f0d8a430cb0e00521f18b7d5ee49ec98aaa541914c6d0e008c306f1a"}, + {file = "duckdb-1.3.2-cp311-cp311-win_amd64.whl", hash = "sha256:0eb210cedf08b067fa90c666339688f1c874844a54708562282bc54b0189aac6"}, + {file = "duckdb-1.3.2-cp312-cp312-macosx_12_0_arm64.whl", hash = "sha256:2455b1ffef4e3d3c7ef8b806977c0e3973c10ec85aa28f08c993ab7f2598e8dd"}, + {file = "duckdb-1.3.2-cp312-cp312-macosx_12_0_universal2.whl", hash = "sha256:9d0ae509713da3461c000af27496d5413f839d26111d2a609242d9d17b37d464"}, + {file = "duckdb-1.3.2-cp312-cp312-macosx_12_0_x86_64.whl", hash = "sha256:72ca6143d23c0bf6426396400f01fcbe4785ad9ceec771bd9a4acc5b5ef9a075"}, + {file = "duckdb-1.3.2-cp312-cp312-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b49a11afba36b98436db83770df10faa03ebded06514cb9b180b513d8be7f392"}, + {file = "duckdb-1.3.2-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:36abdfe0d1704fe09b08d233165f312dad7d7d0ecaaca5fb3bb869f4838a2d0b"}, + {file = "duckdb-1.3.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:3380aae1c4f2af3f37b0bf223fabd62077dd0493c84ef441e69b45167188e7b6"}, + {file = "duckdb-1.3.2-cp312-cp312-win_amd64.whl", hash = "sha256:11af73963ae174aafd90ea45fb0317f1b2e28a7f1d9902819d47c67cc957d49c"}, + {file = "duckdb-1.3.2-cp313-cp313-macosx_12_0_arm64.whl", hash = "sha256:a3418c973b06ac4e97f178f803e032c30c9a9f56a3e3b43a866f33223dfbf60b"}, + {file = "duckdb-1.3.2-cp313-cp313-macosx_12_0_universal2.whl", hash = "sha256:2a741eae2cf110fd2223eeebe4151e22c0c02803e1cfac6880dbe8a39fecab6a"}, + {file = "duckdb-1.3.2-cp313-cp313-macosx_12_0_x86_64.whl", hash = "sha256:51e62541341ea1a9e31f0f1ade2496a39b742caf513bebd52396f42ddd6525a0"}, + {file = "duckdb-1.3.2-cp313-cp313-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:b3e519de5640e5671f1731b3ae6b496e0ed7e4de4a1c25c7a2f34c991ab64d71"}, + {file = "duckdb-1.3.2-cp313-cp313-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:4732fb8cc60566b60e7e53b8c19972cb5ed12d285147a3063b16cc64a79f6d9f"}, + {file = "duckdb-1.3.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:97f7a22dcaa1cca889d12c3dc43a999468375cdb6f6fe56edf840e062d4a8293"}, + {file = "duckdb-1.3.2-cp313-cp313-win_amd64.whl", hash = 
"sha256:cd3d717bf9c49ef4b1016c2216517572258fa645c2923e91c5234053defa3fb5"}, + {file = "duckdb-1.3.2-cp39-cp39-macosx_12_0_arm64.whl", hash = "sha256:18862e3b8a805f2204543d42d5f103b629cb7f7f2e69f5188eceb0b8a023f0af"}, + {file = "duckdb-1.3.2-cp39-cp39-macosx_12_0_universal2.whl", hash = "sha256:75ed129761b6159f0b8eca4854e496a3c4c416e888537ec47ff8eb35fda2b667"}, + {file = "duckdb-1.3.2-cp39-cp39-macosx_12_0_x86_64.whl", hash = "sha256:875193ae9f718bc80ab5635435de5b313e3de3ec99420a9b25275ddc5c45ff58"}, + {file = "duckdb-1.3.2-cp39-cp39-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:09b5fd8a112301096668903781ad5944c3aec2af27622bd80eae54149de42b42"}, + {file = "duckdb-1.3.2-cp39-cp39-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl", hash = "sha256:10cb87ad964b989175e7757d7ada0b1a7264b401a79be2f828cf8f7c366f7f95"}, + {file = "duckdb-1.3.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:4389fc3812e26977034fe3ff08d1f7dbfe6d2d8337487b4686f2b50e254d7ee3"}, + {file = "duckdb-1.3.2-cp39-cp39-win_amd64.whl", hash = "sha256:07952ec6f45dd3c7db0f825d231232dc889f1f2490b97a4e9b7abb6830145a19"}, + {file = "duckdb-1.3.2.tar.gz", hash = "sha256:c658df8a1bc78704f702ad0d954d82a1edd4518d7a04f00027ec53e40f591ff5"}, +] + [[package]] name = "et-xmlfile" version = "1.1.0" @@ -1862,4 +1907,4 @@ testing = ["big-O", "flake8 (<5)", "jaraco.functools", "jaraco.itertools", "more [metadata] lock-version = "2.0" python-versions = "^3.8.1" -content-hash = "6fc01e57c7c15380b7dd514737f1b1bb95a5b7d573b7d6e90ea6182df76036f7" +content-hash = "0aefa155d312654cf35bf78516b649fccd9a86096e9b7188d28055612a528446" diff --git a/pyproject.toml b/pyproject.toml index aee85ed..aa24888 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -14,6 +14,8 @@ linkml-runtime = ">=1.7.4" pysolr = "^3.9.0" linkml-dataops = "^0.1.0" jsonasobj = "^1.3.1" +duckdb = "*" +requests = "^2.25.0" [tool.poetry.dev-dependencies] tox = "^3.24.5" diff --git a/similarity.yaml b/similarity.yaml new file mode 100644 index 0000000..391baee --- /dev/null +++ b/similarity.yaml @@ -0,0 +1,217 @@ +id: https://w3id.org/monarch/monarch-py-similarity +name: monarch-py-similarity +description: Data models for the Monarch Initiative data access library +prefixes: + linkml: https://w3id.org/linkml/ +imports: + - linkml:types +default_range: string + +# This is a copy and paste from https://w3id.org/oak/similarity.yaml with some slots commented out because they +# conflicted with slots in the monarch-py schema + +classes: + PairwiseSimilarity: + abstract: true + description: >- + Abstract grouping for representing individual pairwise similarities + + TermPairwiseSimilarity: + is_a: PairwiseSimilarity + description: >- + A simple pairwise similarity between two atomic concepts/terms + slots: + - subject_id + - subject_label + - subject_source + - object_id + - object_label + - object_source + - ancestor_id + - ancestor_label + - ancestor_source + - object_information_content + - subject_information_content + - ancestor_information_content + - jaccard_similarity + - cosine_similarity + - dice_similarity + - phenodigm_score + + TermSetPairwiseSimilarity: + is_a: PairwiseSimilarity + description: >- + A simple pairwise similarity between two sets of concepts/terms + slots: + - subject_termset + - object_termset + - subject_best_matches + - object_best_matches + - average_score + - best_score + - metric + + TermInfo: + attributes: + id: + identifier: true + label: + slot_uri: rdfs:label + + BestMatch: + attributes: + match_source: + identifier: true + comments: 
+ - note that the match_source is either the subject or the object + match_source_label: + match_target: + description: the entity matches + match_target_label: + score: + range: float + required: true + match_subsumer: + range: uriorcurie + match_subsumer_label: + similarity: + range: TermPairwiseSimilarity + required: true + + SemsimSearchResult: + slots: + - subject + - score + - similarity + slot_usage: + subject: + range: Entity + inlined: true + +types: + ZeroToOne: + typeof: float + minimum_value: 0 + maximum_value: 0 + NonNegativeFloat: + typeof: float + minimum_value: 0 + NegativeLogValue: + typeof: float + minimum_value: 0 + ItemCount: + typeof: integer + minimum_value: 0 + +slots: + similarity: + range: TermSetPairwiseSimilarity + # subject_id: + # slot_uri: sssom:subject_id + # required: true + # range: uriorcurie + # description: The first of the two entities being compared + # Excluded, since it conflicts with subject_label from this schema + # subject_label: + # slot_uri: sssom:subject_label + # description: the label or name for the first entity + subject_source: + slot_uri: sssom:subject_source + description: the source for the first entity + # object_id: + # slot_uri: sssom:object_id + # range: uriorcurie + # description: The second of the two entities being compared + # Excluded, since it conflicts with object_label from this schema + # object_label: + # slot_uri: sssom:object_label + # description: the label or name for the second entity + object_source: + slot_uri: sssom:object_source + description: the source for the second entity + ancestor_id: + range: uriorcurie + description: >- + the most recent common ancestor of the two compared entities. If there are multiple MRCAs then + the most informative one is selected + todos: + - decide on what to do when there are multiple possible ancestos + ancestor_label: + description: the name or label of the ancestor concept + ancestor_source: + # Excluded, conflicts with score from this schema + # score: + # abstract: true + # description: Abstract base slot for different kinds of scores + information_content: + abstract: true + aliases: + - IC + is_a: score + range: NegativeLogValue + description: The IC is the negative log of the probability of the concept + subject_information_content: + is_a: information_content + description: The IC of the subject + object_information_content: + is_a: information_content + description: The IC of the object + ancestor_information_content: + is_a: information_content + description: The IC of the object + jaccard_similarity: + is_a: score + range: ZeroToOne + description: The number of concepts in the intersection divided by the number in the union + cosine_similarity: + is_a: score + range: float + description: the dot product of two node embeddings divided by the product of their lengths + dice_similarity: + is_a: score + range: ZeroToOne + phenodigm_score: + is_a: score + range: NonNegativeFloat + description: the geometric mean of the jaccard similarity and the information content + equals_expression: sqrt({jaccard_similarity} * {information_content}) + overlap_coefficient: + is_a: score + range: ZeroToOne + subsumes_score: + is_a: score + range: ZeroToOne + subsumed_by_score: + is_a: score + range: ZeroToOne + intersection_count: + is_a: score + range: ItemCount + union_count: + is_a: score + range: ItemCount + # TermSets + subject_termset: + range: TermInfo + multivalued: true + inlined: true + object_termset: + range: TermInfo + multivalued: true + inlined: true + subject_best_matches: 
+ range: BestMatch + multivalued: true + inlined: true + object_best_matches: + range: BestMatch + multivalued: true + inlined: true + metric: + range: uriorcurie + average_score: + range: float + required: false + best_score: + range: float + required: false diff --git a/test-docker-env b/test-docker-env new file mode 100644 index 0000000..e69de29
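
Below is a minimal usage sketch (not part of the patch itself) of the chunked CSV/TSV loader that this diff adds in linkml_solr/utils/solr_bulkload.py. The schema path, core name, and input file are placeholders, and it assumes a Solr core is already running at DEFAULT_SOLR_URL with autocommit relaxed, as the updated CLI arranges.

    # Sketch: load a large TSV into Solr in parallel chunks, then commit once at the end.
    # "personinfo.yaml", "my_core" and "people.tsv" are placeholder names.
    import requests
    from linkml_runtime.linkml_model import SchemaDefinition
    from linkml_runtime.loaders import yaml_loader
    from linkml_solr import DEFAULT_SOLR_URL
    from linkml_solr.utils.solr_bulkload import bulkload_chunked

    with open("personinfo.yaml") as stream:
        schema = yaml_loader.load(stream, target_class=SchemaDefinition)

    loaded = bulkload_chunked(
        csv_file="people.tsv",
        base_url=DEFAULT_SOLR_URL,
        core="my_core",
        schema=schema,
        chunk_size=50_000,       # rows per chunk
        max_workers=None,        # auto-detect from CPU count
        format="json",           # convert each chunk to JSON before upload
    )

    # Chunks are uploaded with commit=false, so issue one explicit commit afterwards.
    requests.post(
        f"{DEFAULT_SOLR_URL}/my_core/update?commit=true",
        headers={"Content-Type": "application/json"},
        json={},
    )
    print(f"{loaded} documents loaded and committed")

This mirrors what the updated bulkload CLI command does when --chunked is passed: defer commits during the parallel uploads, then commit once.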
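
A corresponding sketch for the DuckDB path, mirroring what the new bulkload-db command does; the database path, table name, and filter clauses are again placeholders.

    # Sketch: export a filtered slice of a DuckDB table straight into Solr.
    # "monarch.duckdb", "entities" and the WHERE/ORDER BY clauses are placeholders.
    import requests
    from linkml_solr import DEFAULT_CORE, DEFAULT_SOLR_URL
    from linkml_solr.utils.solr_bulkload import bulkload_duckdb

    loaded = bulkload_duckdb(
        db_path="monarch.duckdb",
        table_name="entities",
        base_url=DEFAULT_SOLR_URL,
        core=DEFAULT_CORE,
        schema=None,                         # not used on the JSON document path
        chunk_size=100_000,
        where_clause="category = 'biolink:Gene'",
        columns="id,name,category",
        order_by="id",                       # keeps LIMIT/OFFSET chunks stable
    )

    # As above, uploads are left uncommitted until an explicit commit is sent.
    requests.post(
        f"{DEFAULT_SOLR_URL}/{DEFAULT_CORE}/update?commit=true",
        headers={"Content-Type": "application/json"},
        json={},
    )
    print(f"committed {loaded} documents")

The bulkload-db command wraps the same call and performs the explicit commit shown here as its final step.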