Skip to content
Open
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
176 changes: 176 additions & 0 deletions src/harvesters/dcat.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,176 @@
import { env } from "../../config";
import { BaseHarvester, BaseHarvesterConfig } from "./base";
import { PortalJsCloudDataset } from "@/schemas/portaljs-cloud";
import { Harvester } from ".";
import {
DCATDataset,
DCATDistribution,
extractString,
extractAgentName,
extractStringArray,
extractDistributions,
} from "../lib/dcat";

@Harvester
class DCATHarvester extends BaseHarvester<DCATDataset> {
// Passes the harvester configuration straight through to BaseHarvester; no extra setup happens here.
constructor(args: BaseHarvesterConfig) {
super(args);
}

/**
 * Fetches the DCAT JSON-LD document from the configured source URL and
 * returns every node typed as dcat:Dataset.
 *
 * Each returned dataset is a shallow copy of the source node with two extra
 * fields: `distributions` (the distribution nodes it references) and
 * `resolvedPublisherName` (the name looked up via dct:publisher).
 *
 * @returns The dataset nodes found in the document.
 * @throws Error when the HTTP response is not OK, or when the payload is
 *   neither a top-level array nor an object carrying an `@graph` array.
 */
async getSourceDatasets(): Promise<DCATDataset[]> {
  const url = this.config.source.url;
  const res = await fetch(url);
  if (!res.ok) {
    throw new Error(
      `Failed to fetch DCAT JSON-LD: ${res.status} ${res.statusText}`
    );
  }
  const payload: any = await res.json();

  // Accept a top-level node array, and also a document that wraps its nodes
  // in "@graph"; anything else cannot be scanned for datasets.
  let jsonLd: any[];
  if (Array.isArray(payload)) {
    jsonLd = payload;
  } else if (Array.isArray(payload?.["@graph"])) {
    jsonLd = payload["@graph"];
  } else {
    throw new Error(
      "Unexpected DCAT JSON-LD payload: expected an array of nodes or an object with an @graph array"
    );
  }

  const datasetType = "http://www.w3.org/ns/dcat#Dataset";

  return jsonLd
    // `obj?.` guards against null entries in the node list.
    .filter((obj) => obj?.["@type"]?.includes(datasetType))
    .map((dataset) => ({
      ...dataset,
      distributions: extractDistributions(dataset, jsonLd),
      resolvedPublisherName: extractAgentName(
        dataset,
        "http://purl.org/dc/terms/publisher",
        jsonLd
      ),
    }));
}

mapSourceDatasetToTarget(pkg: DCATDataset): PortalJsCloudDataset {
const owner_org = env.PORTALJS_CLOUD_MAIN_ORG;

// Map distributions to resources
const resources = (pkg.distributions || []).map(
(dist: DCATDistribution) => ({
name:
extractString(dist, "http://purl.org/dc/terms/title") ||
"Unnamed Resource",
url:
extractString(dist, "http://www.w3.org/ns/dcat#downloadURL") ||
extractString(dist, "http://www.w3.org/ns/dcat#accessURL") ||
"",
format:
extractString(dist, "http://purl.org/dc/terms/format") ||
extractString(dist, "http://www.w3.org/ns/dcat#mediaType") ||
"",
description:
extractString(dist, "http://purl.org/dc/terms/description") || "",
license_url:
extractString(dist, "http://purl.org/dc/terms/license") || "",
})
);

const extras: Array<{ key: string; value: string }> = [];
const extraMappings = [
{ predicate: "http://purl.org/dc/terms/issued", key: "issued" },
{ predicate: "http://purl.org/dc/terms/modified", key: "modified" },
{
predicate: "http://www.w3.org/2002/07/owl#versionInfo",
key: "dcat_version",
},
{
predicate: "http://purl.org/dc/terms/accrualPeriodicity",
key: "frequency",
},
{
predicate: "http://purl.org/dc/terms/conformsTo",
key: "conforms_to",
isArray: true,
},
{
predicate: "http://purl.org/dc/terms/accessRights",
key: "access_rights",
},
{ predicate: "http://purl.org/dc/terms/provenance", key: "provenance" },
{ predicate: "http://purl.org/dc/terms/type", key: "dcat_type" },
{ predicate: "http://purl.org/dc/terms/spatial", key: "spatial_uri" },
{ predicate: "http://purl.org/dc/terms/publisher", key: "publisher_uri" },
];

extraMappings.forEach(({ predicate, key, isArray = false }) => {
const value = isArray
? extractStringArray(pkg, predicate).join(", ")
: extractString(pkg, predicate);
if (value) extras.push({ key, value });
});

const skippedKeys = [
"@id",
"@type",
"distributions",
"http://www.w3.org/ns/dcat#distribution",
"http://purl.org/dc/terms/title",
"http://purl.org/dc/terms/description",
"http://purl.org/dc/terms/identifier",
"http://purl.org/dc/terms/issued",
"http://purl.org/dc/terms/modified",
"http://www.w3.org/2002/07/owl#versionInfo",
"http://purl.org/dc/terms/language",
"http://www.w3.org/ns/dcat#landingPage",
"http://xmlns.com/foaf/0.1/page",
"http://purl.org/dc/terms/accrualPeriodicity",
"http://purl.org/dc/terms/conformsTo",
"http://purl.org/dc/terms/accessRights",
"http://purl.org/dc/terms/provenance",
"http://purl.org/dc/terms/type",
"http://purl.org/dc/terms/spatial",
"http://purl.org/dc/terms/publisher",
"http://www.w3.org/ns/dcat#contactPoint",
"http://purl.org/dc/terms/creator",
"http://purl.org/dc/terms/license",
];
Object.keys(pkg).forEach((key) => {
if (!skippedKeys.includes(key)) {
const value = extractString(pkg, key) || JSON.stringify(pkg[key]);
if (value) extras.push({ key, value });
}
});

const extractedLanguage = extractString(
pkg,
"http://purl.org/dc/terms/language"
);
const validLanguages = ["EN", "FR", "ES", "DE", "IT"];
const language = (
validLanguages.includes(extractedLanguage) ? extractedLanguage : "EN"
) as "EN" | "FR" | "ES" | "DE" | "IT";
Comment on lines +140 to +147
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Normalize incoming language codes.

Most DCAT feeds emit dct:language as lowercase codes (e.g. en) or URIs (e.g. /language/ENG). Comparing the raw value against ["EN","FR","ES","DE","IT"] forces nearly everything to the fallback "EN". Normalize the value (uppercase, trim URI suffix, etc.) before validation so genuine locales survive.

One minimal fix:

-    const extractedLanguage = extractString(
-      pkg,
-      "http://purl.org/dc/terms/language"
-    );
+    const rawLanguage =
+      extractString(pkg, "http://purl.org/dc/terms/language") || "";
+    const extractedLanguage = rawLanguage
+      .split("/")
+      .pop()
+      ?.slice(0, 2)
+      .toUpperCase() || "";

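This keeps the existing whitelist working for common inputs (lowercase two-letter codes, and URIs whose last segment starts with the two-letter code, such as ENG, FRA, DEU, ITA) instead of hard-defaulting to "EN". Note that truncating to two characters does not cover every three-letter code: for example, SPA becomes "SP", which is not in the whitelist and still falls back to "EN", so such codes need an explicit mapping.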

📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
const extractedLanguage = extractString(
pkg,
"http://purl.org/dc/terms/language"
);
const validLanguages = ["EN", "FR", "ES", "DE", "IT"];
const language = (
validLanguages.includes(extractedLanguage) ? extractedLanguage : "EN"
) as "EN" | "FR" | "ES" | "DE" | "IT";
const rawLanguage =
extractString(pkg, "http://purl.org/dc/terms/language") || "";
const extractedLanguage = rawLanguage
.split("/")
.pop()
?.slice(0, 2)
.toUpperCase() || "";
const validLanguages = ["EN", "FR", "ES", "DE", "IT"];
const language = (
validLanguages.includes(extractedLanguage) ? extractedLanguage : "EN"
) as "EN" | "FR" | "ES" | "DE" | "IT";
🤖 Prompt for AI Agents
In src/harvesters/dcat.ts around lines 140 to 147, the code currently compares
the raw extractedLanguage directly to the uppercase whitelist causing most
inputs (like "en" or "/language/ENG") to fall back to "EN"; normalize the
extractedLanguage before validation by trimming whitespace, converting to
uppercase, and if it looks like a URI or contains slashes or hashes, take the
last path/fragment segment (or strip non-letter characters) to yield a plain
code, then check that normalized value against the existing validLanguages array
and use it if valid, otherwise default to "EN".

const datasetLicense =
extractString(pkg, "http://purl.org/dc/terms/license") ||
(resources.length > 0 ? (resources[0] as any).license_url || "" : "");

// Map to PortalJsCloudDataset (based on ckanext-dcat mappings)
return {
owner_org,
name: `${owner_org}--${
extractString(pkg, "http://purl.org/dc/terms/identifier") ||
pkg["@id"].split("/").pop() ||
"unknown"
}`,
title: extractString(pkg, "http://purl.org/dc/terms/title") || "",
notes: extractString(pkg, "http://purl.org/dc/terms/description") || "",
url: extractString(pkg, "http://www.w3.org/ns/dcat#landingPage") || "",
language,
author: extractString(pkg, "http://purl.org/dc/terms/creator") || "",
maintainer: (pkg as any).resolvedPublisherName || "",
license_id: extractString(pkg, "http://purl.org/dc/terms/license") || "",
license_url: datasetLicense,
contact_point:
extractString(pkg, "http://www.w3.org/ns/dcat#contactPoint") || "",
resources,
extras,
};
}
}

export { DCATHarvester };
66 changes: 66 additions & 0 deletions src/lib/dcat.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
// Interfaces for DCAT JSON-LD (expanded form)
/**
 * A distribution node from a DCAT JSON-LD document.
 *
 * Only the node identifier and type list are typed; every other predicate is
 * reachable through the open index signature.
 */
export interface DCATDistribution {
  "@type": Array<string>;
  "@id": string;
  [predicate: string]: any;
}

/**
 * A dataset node from a DCAT JSON-LD document.
 *
 * Only the node identifier and type list are typed; every other predicate is
 * reachable through the open index signature.
 */
export interface DCATDataset {
  "@type": Array<string>;
  "@id": string;
  [predicate: string]: any;
}

/**
 * Extracts the first value of a JSON-LD predicate as a string.
 *
 * Reads `obj[predicate]`, which must be an array of node objects (e.g.
 * `[{"@value": "title"}]`), and returns the first entry's `@value`, falling
 * back to its `@id` when `@value` is missing or empty.
 *
 * @param obj - Object to read the predicate from; `null`/`undefined` yields "".
 * @param predicate - Property key to look up on `obj`.
 * @returns The first value as a string, or "" when the predicate is absent,
 *   is not a non-empty array, or its first entry carries no usable value.
 */
export function extractString(obj: any, predicate: string): string {
  const values = obj?.[predicate];
  if (!Array.isArray(values) || values.length === 0) {
    return "";
  }

  const first = values[0];
  // A null or primitive entry has no @value/@id to read.
  if (first === null || typeof first !== "object") {
    return "";
  }

  // Literals may be numbers or booleans. Stringify them so the declared
  // return type holds and valid falsy literals (0, false) are not discarded.
  const literal: unknown = first["@value"];
  const literalIsPrimitive =
    typeof literal === "string" ||
    typeof literal === "number" ||
    typeof literal === "boolean" ||
    typeof literal === "bigint";
  if (literalIsPrimitive && literal !== "") {
    return String(literal);
  }

  const id: unknown = first["@id"];
  return id === undefined || id === null || id === "" ? "" : String(id);
}

/**
 * Resolves the display name of the agent referenced by `predicate` on `obj`.
 *
 * The predicate's first value is looked up as an `@id` among `allObjects`.
 * When a matching node exists, its foaf:name is returned, falling back to its
 * dct:title. When no node matches but the value was given as a literal
 * (`@value`), that literal is returned so a plain-text publisher is not lost.
 *
 * @param obj - Node carrying the agent reference (e.g. a dataset).
 * @param predicate - Predicate IRI that points at the agent.
 * @param allObjects - Every node of the document, searched by `@id`.
 * @returns The agent's name, the literal value, or "" when neither is available.
 */
export function extractAgentName(
  obj: any,
  predicate: string,
  allObjects: any[]
): string {
  const reference = extractString(obj, predicate);
  if (!reference) {
    return "";
  }

  // `candidate?.` tolerates null entries in the node list.
  const agent = allObjects.find(
    (candidate) => candidate?.["@id"] === reference
  );
  if (agent) {
    return (
      extractString(agent, "http://xmlns.com/foaf/0.1/name") ||
      extractString(agent, "http://purl.org/dc/terms/title") ||
      ""
    );
  }

  // No node matched. Keep the value only when it came from a literal
  // (@value); an unresolved IRI or blank-node id is not a human-readable name.
  const literal: unknown = obj?.[predicate]?.[0]?.["@value"];
  const cameFromLiteral =
    literal !== undefined &&
    literal !== null &&
    String(literal) === String(reference);
  return cameFromLiteral ? String(reference) : "";
}
Comment on lines +24 to +41
Copy link

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Respect literal publisher values.

If a dataset encodes dct:publisher as a literal (common in the wild), extractString returns that literal, but the lookup for an object with the same @id fails and we drop the original text, so maintainer ends up empty. Add a fallback to return the literal when no matching agent object exists.

Suggested tweak:

   const agentId = extractString(obj, predicate);
   if (agentId) {
     const agent = allObjects.find((o) => o["@id"] === agentId);
     if (agent) {
       return (
         extractString(agent, "http://xmlns.com/foaf/0.1/name") ||
         extractString(agent, "http://purl.org/dc/terms/title") ||
         ""
       );
     }
+    return agentId;
   }
   return "";
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
export function extractAgentName(
obj: any,
predicate: string,
allObjects: any[]
): string {
const agentId = extractString(obj, predicate);
if (agentId) {
const agent = allObjects.find((o) => o["@id"] === agentId);
if (agent) {
return (
extractString(agent, "http://xmlns.com/foaf/0.1/name") ||
extractString(agent, "http://purl.org/dc/terms/title") ||
""
);
}
}
return "";
}
export function extractAgentName(
obj: any,
predicate: string,
allObjects: any[]
): string {
const agentId = extractString(obj, predicate);
if (agentId) {
const agent = allObjects.find((o) => o["@id"] === agentId);
if (agent) {
return (
extractString(agent, "http://xmlns.com/foaf/0.1/name") ||
extractString(agent, "http://purl.org/dc/terms/title") ||
""
);
}
return agentId;
}
return "";
}
🤖 Prompt for AI Agents
In src/lib/dcat.ts around lines 24 to 41, extractAgentName currently treats the
value returned by extractString as an ID and only returns a resolved agent's
name, which drops literal publisher values when no matching object exists;
change the logic so that if extractString(obj, predicate) yields a non-empty
value but no object in allObjects matches that value, return that literal value
as a fallback (i.e., return the original agentId string) while keeping the
existing behavior when a matching agent object is found.


/**
 * Extracts every value of a JSON-LD predicate as an array of strings.
 *
 * Reads `obj[predicate]`, which must be an array of node objects, and takes
 * each entry's `@value`, falling back to its `@id` when `@value` is missing
 * or empty. Entries with no usable value are omitted.
 *
 * @param obj - Object to read the predicate from; `null`/`undefined` yields [].
 * @param predicate - Property key to look up on `obj`.
 * @returns The collected values in source order; [] when the predicate is
 *   absent or not an array.
 */
export function extractStringArray(obj: any, predicate: string): string[] {
  const values = obj?.[predicate];
  if (!Array.isArray(values)) {
    return [];
  }

  const collected: string[] = [];
  for (const entry of values) {
    // Null or primitive entries carry neither @value nor @id.
    if (entry === null || typeof entry !== "object") {
      continue;
    }

    // Stringify primitive literals so numbers/booleans (including 0 and
    // false) are kept and the result really is a string[].
    const literal: unknown = entry["@value"];
    const literalIsPrimitive =
      typeof literal === "string" ||
      typeof literal === "number" ||
      typeof literal === "boolean" ||
      typeof literal === "bigint";
    if (literalIsPrimitive && literal !== "") {
      collected.push(String(literal));
      continue;
    }

    const id: unknown = entry["@id"];
    if (id !== undefined && id !== null && id !== "") {
      collected.push(String(id));
    }
  }
  return collected;
}

/**
 * Collects the distribution nodes that a dataset references by `@id`.
 *
 * A node from `allObjects` is included when its `@id` is listed under the
 * dataset's dcat:distribution predicate and its `@type` includes
 * dcat:Distribution. Results follow the order of `allObjects`.
 *
 * @param dataset - Dataset node whose dcat:distribution references are read.
 * @param allObjects - Every node of the document.
 * @returns The matching distribution nodes.
 */
export function extractDistributions(
  dataset: DCATDataset,
  allObjects: any[]
): DCATDistribution[] {
  const distributionType = "http://www.w3.org/ns/dcat#Distribution";
  const referencedIds = new Set<string>(
    extractStringArray(dataset, "http://www.w3.org/ns/dcat#distribution")
  );

  const matches: DCATDistribution[] = [];
  for (const candidate of allObjects) {
    if (!referencedIds.has(candidate["@id"])) {
      continue;
    }
    if (candidate["@type"]?.includes(distributionType)) {
      matches.push(candidate as DCATDistribution);
    }
  }
  return matches;
}
3 changes: 3 additions & 0 deletions src/schemas/portaljs-cloud.d.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@ export interface PortalJsCloudDataset {
author?: string;
author_email?: string;
maintainer?: string;
url?: string;
maintainer_email?: string;
language: "EN" | "FR" | "ES" | "DE" | "IT";
coverage?: string;
Expand All @@ -19,6 +20,8 @@ export interface PortalJsCloudDataset {
is_version_of?: string;
contact_point?: string;
resources?: CkanResource[];
license_url?: string;
extras?: Array<{ key: string; value: string }>;
}

export interface CkanResource {
Expand Down