datopian · steveoni · Oct 3, 2025 · Oct 3, 2025 · coderabbitai · Oct 3, 2025
diff --git a/src/harvesters/dcat.ts b/src/harvesters/dcat.ts
@@ -0,0 +1,176 @@
+import { env } from "../../config";
+import { BaseHarvester, BaseHarvesterConfig } from "./base";
+import { PortalJsCloudDataset } from "@/schemas/portaljs-cloud";
+import { Harvester } from ".";
+import {
+  DCATDataset,
+  DCATDistribution,
+  extractString,
+  extractAgentName,
+  extractStringArray,
+  extractDistributions,
+} from "../lib/dcat";
+
+/** Language codes accepted by the PortalJS Cloud dataset schema. */
+type PortalLanguage = PortalJsCloudDataset["language"];
+
+/**
+ * Upper-cased ISO 639-1 and ISO 639-2/3 codes for the languages the target
+ * schema accepts. An explicit table is used instead of taking the first two
+ * letters of a code: prefix matching misses `SPA` and `GER`, and would map
+ * unrelated codes such as `EST` (Estonian) to `ES`.
+ */
+const PORTAL_LANGUAGE_BY_CODE = new Map<string, PortalLanguage>([
+  ["EN", "EN"],
+  ["ENG", "EN"],
+  ["FR", "FR"],
+  ["FRA", "FR"],
+  ["FRE", "FR"],
+  ["ES", "ES"],
+  ["SPA", "ES"],
+  ["DE", "DE"],
+  ["DEU", "DE"],
+  ["GER", "DE"],
+  ["IT", "IT"],
+  ["ITA", "IT"],
+]);
+
+/**
+ * Resolves a `dct:language` value to a language code of the target schema.
+ *
+ * Accepts bare codes (`fr`), locale tags (`en-GB`) and URIs whose last path
+ * segment or fragment is a code (`.../language/DEU`).
+ *
+ * @param raw - Value read from the dataset; may be empty.
+ * @returns The matching code, or `"EN"` when the value is missing or unknown.
+ */
+function toPortalLanguage(raw: string): PortalLanguage {
+  // String() keeps this safe even if a non-string literal slips through.
+  const lastSegment = String(raw).split(/[/#]/).filter(Boolean).pop() ?? "";
+  const primaryTag = (lastSegment.split(/[-_]/)[0] ?? "").trim().toUpperCase();
+  return PORTAL_LANGUAGE_BY_CODE.get(primaryTag) ?? "EN";
+}
+
+/**
+ * Harvester for DCAT catalogues published as JSON-LD.
+ *
+ * Fetches the document at `config.source.url`, selects the `dcat:Dataset`
+ * nodes and maps each of them onto a `PortalJsCloudDataset`.
+ */
+@Harvester
+class DCATHarvester extends BaseHarvester<DCATDataset> {
+  constructor(args: BaseHarvesterConfig) {
+    super(args);
+  }
+
+  /**
+   * Downloads the source document and returns its dataset nodes.
+   *
+   * Each returned dataset carries two derived fields: `distributions` (the
+   * distribution nodes it references) and `resolvedPublisherName`.
+   *
+   * @returns Every node whose `@type` includes `dcat:Dataset`.
+   * @throws Error when the HTTP response is not OK, or when the payload is not
+   *   a top-level array of nodes.
+   */
+  async getSourceDatasets(): Promise<DCATDataset[]> {
+    const url = this.config.source.url;
+    const res = await fetch(url);
+    if (!res.ok) {
+      throw new Error(
+        `Failed to fetch DCAT JSON-LD: ${res.status} ${res.statusText}`
+      );
+    }
+
+    const payload: unknown = await res.json();
+    if (!Array.isArray(payload)) {
+      // Fail with a readable message instead of a TypeError further down.
+      throw new Error(
+        "Failed to parse DCAT JSON-LD: expected a top-level array of nodes"
+      );
+    }
+    const jsonLd: any[] = payload;
+
+    return jsonLd
+      .filter((obj) =>
+        obj?.["@type"]?.includes("http://www.w3.org/ns/dcat#Dataset")
+      )
+      .map((dataset) => ({
+        ...dataset,
+        distributions: extractDistributions(dataset, jsonLd),
+        resolvedPublisherName: extractAgentName(
+          dataset,
+          "http://purl.org/dc/terms/publisher",
+          jsonLd
+        ),
+      }));
+  }
+
+  /**
+   * Maps one harvested DCAT dataset onto the PortalJS Cloud dataset shape.
+   *
+   * Distributions become resources, a fixed list of predicates is copied into
+   * `extras` under short keys, and every remaining source key that has no
+   * dedicated target field is copied into `extras` under its original key.
+   *
+   * @param pkg - Dataset node as returned by `getSourceDatasets`.
+   * @returns The dataset payload for the target portal.
+   */
+  mapSourceDatasetToTarget(pkg: DCATDataset): PortalJsCloudDataset {
+    const owner_org = env.PORTALJS_CLOUD_MAIN_ORG;
+
+    // Map distributions to resources
+    const resources = (pkg.distributions || []).map(
+      (dist: DCATDistribution) => ({
+        name:
+          extractString(dist, "http://purl.org/dc/terms/title") ||
+          "Unnamed Resource",
+        url:
+          extractString(dist, "http://www.w3.org/ns/dcat#downloadURL") ||
+          extractString(dist, "http://www.w3.org/ns/dcat#accessURL") ||
+          "",
+        format:
+          extractString(dist, "http://purl.org/dc/terms/format") ||
+          extractString(dist, "http://www.w3.org/ns/dcat#mediaType") ||
+          "",
+        description:
+          extractString(dist, "http://purl.org/dc/terms/description") || "",
+        license_url:
+          extractString(dist, "http://purl.org/dc/terms/license") || "",
+      })
+    );
+
+    const extras: Array<{ key: string; value: string }> = [];
+    const extraMappings: Array<{
+      predicate: string;
+      key: string;
+      isArray?: boolean;
+    }> = [
+      { predicate: "http://purl.org/dc/terms/issued", key: "issued" },
+      { predicate: "http://purl.org/dc/terms/modified", key: "modified" },
+      {
+        predicate: "http://www.w3.org/2002/07/owl#versionInfo",
+        key: "dcat_version",
+      },
+      {
+        predicate: "http://purl.org/dc/terms/accrualPeriodicity",
+        key: "frequency",
+      },
+      {
+        predicate: "http://purl.org/dc/terms/conformsTo",
+        key: "conforms_to",
+        isArray: true,
+      },
+      {
+        predicate: "http://purl.org/dc/terms/accessRights",
+        key: "access_rights",
+      },
+      { predicate: "http://purl.org/dc/terms/provenance", key: "provenance" },
+      { predicate: "http://purl.org/dc/terms/type", key: "dcat_type" },
+      { predicate: "http://purl.org/dc/terms/spatial", key: "spatial_uri" },
+      { predicate: "http://purl.org/dc/terms/publisher", key: "publisher_uri" },
+    ];
+
+    for (const { predicate, key, isArray = false } of extraMappings) {
+      const value = isArray
+        ? extractStringArray(pkg, predicate).join(", ")
+        : extractString(pkg, predicate);
+      if (value) extras.push({ key, value });
+    }
+
+    // Keys that already have a dedicated target field or a named extra above.
+    const skippedKeys = new Set<string>([
+      "@id",
+      "@type",
+      "distributions",
+      // Derived by getSourceDatasets and already mapped to `maintainer`.
+      "resolvedPublisherName",
+      "http://www.w3.org/ns/dcat#distribution",
+      "http://purl.org/dc/terms/title",
+      "http://purl.org/dc/terms/description",
+      "http://purl.org/dc/terms/identifier",
+      "http://purl.org/dc/terms/issued",
+      "http://purl.org/dc/terms/modified",
+      "http://www.w3.org/2002/07/owl#versionInfo",
+      "http://purl.org/dc/terms/language",
+      "http://www.w3.org/ns/dcat#landingPage",
+      "http://xmlns.com/foaf/0.1/page",
+      "http://purl.org/dc/terms/accrualPeriodicity",
+      "http://purl.org/dc/terms/conformsTo",
+      "http://purl.org/dc/terms/accessRights",
+      "http://purl.org/dc/terms/provenance",
+      "http://purl.org/dc/terms/type",
+      "http://purl.org/dc/terms/spatial",
+      "http://purl.org/dc/terms/publisher",
+      "http://www.w3.org/ns/dcat#contactPoint",
+      "http://purl.org/dc/terms/creator",
+      "http://purl.org/dc/terms/license",
+    ]);
+    for (const key of Object.keys(pkg)) {
+      if (skippedKeys.has(key)) continue;
+      // Fall back to the raw JSON when the value is not a simple literal/IRI.
+      const value = extractString(pkg, key) || JSON.stringify(pkg[key]);
+      if (value) extras.push({ key, value });
+    }
+
+    const language = toPortalLanguage(
+      extractString(pkg, "http://purl.org/dc/terms/language") || ""
+    );
+
+    // Dataset-level license, else the license of the first resource.
+    const datasetLicense =
+      extractString(pkg, "http://purl.org/dc/terms/license") ||
+      resources[0]?.license_url ||
+      "";
+
+    // Last non-empty path segment of the node IRI; tolerates a missing "@id"
+    // and a trailing slash.
+    const idSlug = String(pkg["@id"] ?? "")
+      .split("/")
+      .filter(Boolean)
+      .pop();
+
+    // Map to PortalJsCloudDataset (based on ckanext-dcat mappings)
+    return {
+      owner_org,
+      name: `${owner_org}--${
+        extractString(pkg, "http://purl.org/dc/terms/identifier") ||
+        idSlug ||
+        "unknown"
+      }`,
+      title: extractString(pkg, "http://purl.org/dc/terms/title") || "",
+      notes: extractString(pkg, "http://purl.org/dc/terms/description") || "",
+      url: extractString(pkg, "http://www.w3.org/ns/dcat#landingPage") || "",
+      language,
+      author: extractString(pkg, "http://purl.org/dc/terms/creator") || "",
+      maintainer: pkg.resolvedPublisherName || "",
+      license_id: extractString(pkg, "http://purl.org/dc/terms/license") || "",
+      license_url: datasetLicense,
+      contact_point:
+        extractString(pkg, "http://www.w3.org/ns/dcat#contactPoint") || "",
+      resources,
+      extras,
+    };
+  }
+}
+
+export { DCATHarvester };
diff --git a/src/lib/dcat.ts b/src/lib/dcat.ts
@@ -0,0 +1,66 @@
+// Interfaces for DCAT JSON-LD (expanded form)
+/**
+ * A JSON-LD node describing a DCAT distribution.
+ *
+ * Only the node identifier and its type list are typed. Every other key is
+ * admitted by the index signature and is read through the `extract*` helpers
+ * in this module, which expect an array of `{"@value": ...}` or
+ * `{"@id": ...}` entries under each predicate.
+ */
+export interface DCATDistribution {
+  // Node identifier; datasets reference distributions by this value.
+  "@id": string;
+  // Type IRIs of the node.
+  "@type": string[];
+  // Any further predicate; values are untyped.
+  [key: string]: any;
+}
+
+/**
+ * A JSON-LD node describing a DCAT dataset.
+ *
+ * Only the node identifier and its type list are typed. Every other key,
+ * including any fields a caller attaches to the node, is admitted by the
+ * index signature and is therefore untyped.
+ */
+export interface DCATDataset {
+  // Node identifier.
+  "@id": string;
+  // Type IRIs of the node.
+  "@type": string[];
+  // Any further predicate or attached field; values are untyped.
+  [key: string]: any;
+}
+
+/**
+ * Reads the first value of a JSON-LD predicate as a string
+ * (e.g. `[{"@value": "title"}]` yields `"title"`).
+ *
+ * The literal (`@value`) is preferred; when it is absent or empty the node
+ * reference (`@id`) is used. Numeric and boolean literals are converted to
+ * their string form so the declared return type always holds.
+ *
+ * @param obj - JSON-LD node to read from; `null`/`undefined` is tolerated.
+ * @param predicate - Key of the predicate on the node.
+ * @returns The first value as a string, or `""` when the predicate is missing,
+ *   not an array, empty, or its first entry carries no usable value.
+ */
+export function extractString(obj: any, predicate: string): string {
+  const values = obj?.[predicate];
+  if (!Array.isArray(values) || values.length === 0) return "";
+
+  // Only primitives are usable; anything else counts as "no value".
+  const toText = (candidate: unknown): string => {
+    if (typeof candidate === "string") return candidate;
+    if (typeof candidate === "number" || typeof candidate === "boolean") {
+      return String(candidate);
+    }
+    return "";
+  };
+
+  const first = values[0];
+  return toText(first?.["@value"]) || toText(first?.["@id"]);
+}
+
+/**
+ * Resolves the display name of the agent referenced by a predicate.
+ *
+ * The predicate's first value is treated as the agent's identifier and looked
+ * up by `@id` among `allObjects`. A resolved agent contributes its `foaf:name`,
+ * or failing that its `dct:title`.
+ *
+ * @param obj - Node holding the reference (for example a dataset).
+ * @param predicate - Key of the predicate that points at the agent.
+ * @param allObjects - Every node of the document, searched by `@id`.
+ * @returns The agent's name; the raw identifier when no node with that `@id`
+ *   exists; `""` when the predicate has no value or the resolved agent has
+ *   neither a name nor a title.
+ */
+export function extractAgentName(
+  obj: any,
+  predicate: string,
+  allObjects: any[]
+): string {
+  const agentId = extractString(obj, predicate);
+  if (!agentId) return "";
+
+  const agent = allObjects.find((o) => o?.["@id"] === agentId);
+  if (!agent) {
+    // Unresolvable reference: keep the identifier rather than losing it.
+    return agentId;
+  }
+
+  return (
+    extractString(agent, "http://xmlns.com/foaf/0.1/name") ||
+    extractString(agent, "http://purl.org/dc/terms/title") ||
+    ""
+  );
+}
+
+/**
+ * Reads every value of a JSON-LD predicate as a string.
+ *
+ * Each entry contributes its literal (`@value`) or, when that is absent or
+ * empty, its node reference (`@id`). Numeric and boolean literals are
+ * converted to their string form; entries without a usable value are dropped.
+ *
+ * @param obj - JSON-LD node to read from; `null`/`undefined` is tolerated.
+ * @param predicate - Key of the predicate on the node.
+ * @returns The values in document order, or `[]` when the predicate is
+ *   missing or not an array.
+ */
+export function extractStringArray(obj: any, predicate: string): string[] {
+  const values = obj?.[predicate];
+  if (!Array.isArray(values)) return [];
+
+  // Only primitives are usable; anything else counts as "no value".
+  const toText = (candidate: unknown): string => {
+    if (typeof candidate === "string") return candidate;
+    if (typeof candidate === "number" || typeof candidate === "boolean") {
+      return String(candidate);
+    }
+    return "";
+  };
+
+  return values
+    .map((entry) => toText(entry?.["@value"]) || toText(entry?.["@id"]))
+    .filter((text) => text !== "");
+}
+
+/**
+ * Collects the distribution nodes a dataset references by `@id`.
+ *
+ * @param dataset - Dataset node whose `dcat:distribution` values are read.
+ * @param allObjects - Every node of the document.
+ * @returns The nodes whose `@id` is referenced by the dataset and whose
+ *   `@type` includes `dcat:Distribution`, in the order they appear in
+ *   `allObjects`.
+ */
+export function extractDistributions(
+  dataset: DCATDataset,
+  allObjects: any[]
+): DCATDistribution[] {
+  // A Set gives constant-time membership checks while scanning every node.
+  const distIds = new Set(
+    extractStringArray(dataset, "http://www.w3.org/ns/dcat#distribution")
+  );
+  if (distIds.size === 0) return [];
+
+  return allObjects.filter(
+    (obj) =>
+      distIds.has(obj?.["@id"]) &&
+      obj["@type"]?.includes("http://www.w3.org/ns/dcat#Distribution")
+  ) as DCATDistribution[];
+}
diff --git a/src/schemas/portaljs-cloud.d.ts b/src/schemas/portaljs-cloud.d.ts
@@ -10,6 +10,7 @@ export interface PortalJsCloudDataset {
   author?: string;
   author_email?: string;
   maintainer?: string;
+  url?: string;
   maintainer_email?: string;
   language: "EN" | "FR" | "ES" | "DE" | "IT";
   coverage?: string;
@@ -19,6 +20,8 @@ export interface PortalJsCloudDataset {
   is_version_of?: string;
   contact_point?: string;
   resources?: CkanResource[];
+  license_url?: string;
+  extras?: Array<{ key: string; value: string }>;
 }
 
 export interface CkanResource {