From 292465112fe43a6c3157e0f7e1a25e7bc64e2e27 Mon Sep 17 00:00:00 2001 From: em Date: Fri, 6 Dec 2024 15:29:51 +0100 Subject: [PATCH] Implement DrupalWiki collector --- collector/extensions/index.js | 26 ++ collector/extensions/resync/index.js | 53 +++- .../extensions/DrupalWiki/DrupalWiki/index.js | 280 ++++++++++++++++++ .../utils/extensions/DrupalWiki/index.js | 115 +++++++ .../DataConnectorOption/media/drupalwiki.jpg | Bin 0 -> 7293 bytes .../DataConnectorOption/media/index.js | 2 + .../Connectors/DrupalWiki/index.jsx | 192 ++++++++++++ .../ManageWorkspace/DataConnectors/index.jsx | 7 + .../ChatHistory/Citation/index.jsx | 5 + frontend/src/models/dataConnector.js | 23 ++ server/endpoints/extensions/index.js | 21 ++ server/jobs/sync-watched-documents.js | 2 +- server/models/documentSyncQueue.js | 2 +- 13 files changed, 719 insertions(+), 9 deletions(-) create mode 100644 collector/utils/extensions/DrupalWiki/DrupalWiki/index.js create mode 100644 collector/utils/extensions/DrupalWiki/index.js create mode 100644 frontend/src/components/DataConnectorOption/media/drupalwiki.jpg create mode 100644 frontend/src/components/Modals/ManageWorkspace/DataConnectors/Connectors/DrupalWiki/index.jsx diff --git a/collector/extensions/index.js b/collector/extensions/index.js index 81a3a3dd79..087df6f21f 100644 --- a/collector/extensions/index.js +++ b/collector/extensions/index.js @@ -154,6 +154,32 @@ function extensions(app) { return; } ); + + app.post( + "/ext/drupalwiki", + [verifyPayloadIntegrity, setDataSigner], + async function (request, response) { + try { + const { loadAndStoreSpaces } = require("../utils/extensions/DrupalWiki"); + const { success, reason, data } = await loadAndStoreSpaces( + reqBody(request), + response + ); + response.status(200).json({ success, reason, data }); + } catch (e) { + console.error(e); + response.status(400).json({ + success: false, + reason: e.message, + data: { + title: null, + author: null, + }, + }); + } + return; + } + ); } 
module.exports = extensions; diff --git a/collector/extensions/resync/index.js b/collector/extensions/resync/index.js index 024935f5cf..dae5f91b6a 100644 --- a/collector/extensions/resync/index.js +++ b/collector/extensions/resync/index.js @@ -2,7 +2,7 @@ const { getLinkText } = require("../../processLink"); /** * Fetches the content of a raw link. Returns the content as a text string of the link in question. - * @param {object} data - metadata from document (eg: link) + * @param {object} data - metadata from document (eg: link) * @param {import("../../middleware/setDataSigner").ResponseWithSigner} response */ async function resyncLink({ link }, response) { @@ -24,7 +24,7 @@ async function resyncLink({ link }, response) { * Fetches the content of a YouTube link. Returns the content as a text string of the video in question. * We offer this as there may be some videos where a transcription could be manually edited after initial scraping * but in general - transcriptions often never change. - * @param {object} data - metadata from document (eg: link) + * @param {object} data - metadata from document (eg: link) * @param {import("../../middleware/setDataSigner").ResponseWithSigner} response */ async function resyncYouTube({ link }, response) { @@ -44,9 +44,9 @@ async function resyncYouTube({ link }, response) { } /** - * Fetches the content of a specific confluence page via its chunkSource. + * Fetches the content of a specific confluence page via its chunkSource. * Returns the content as a text string of the page in question and only that page. 
- * @param {object} data - metadata from document (eg: chunkSource) + * @param {object} data - metadata from document (eg: chunkSource) * @param {import("../../middleware/setDataSigner").ResponseWithSigner} response */ async function resyncConfluence({ chunkSource }, response) { @@ -76,9 +76,9 @@ async function resyncConfluence({ chunkSource }, response) { } /** - * Fetches the content of a specific confluence page via its chunkSource. + * Fetches the content of a specific confluence page via its chunkSource. * Returns the content as a text string of the page in question and only that page. - * @param {object} data - metadata from document (eg: chunkSource) + * @param {object} data - metadata from document (eg: chunkSource) * @param {import("../../middleware/setDataSigner").ResponseWithSigner} response */ async function resyncGithub({ chunkSource }, response) { @@ -106,9 +106,48 @@ async function resyncGithub({ chunkSource }, response) { } } + +/** + * Fetches the content of a specific DrupalWiki page via its chunkSource. + * Returns the content as a text string of the page in question and only that page. + * @param {object} data - metadata from document (eg: chunkSource) + * @param {import("../../middleware/setDataSigner").ResponseWithSigner} response + */ +async function resyncDrupalWiki({ chunkSource }, response) { + if (!chunkSource) throw new Error('Invalid source property provided'); + try { + // DrupalWiki data is `payload` encrypted. So we need to expand its + // encrypted payload back into query params so we can reFetch the page with same access token/params. 
+ const source = response.locals.encryptionWorker.expandPayload(chunkSource); + const { loadPage } = require("../../utils/extensions/DrupalWiki"); + const { success, reason, content } = await loadPage({ + baseUrl: source.searchParams.get('baseUrl'), + pageId: source.searchParams.get('pageId'), + accessToken: source.searchParams.get('accessToken'), + }); + + if (!success) { + console.error(`Failed to sync DrupalWiki page content. ${reason}`); + response.status(200).json({ + success: false, + content: null, + }); + } else { + response.status(200).json({ success, content }); + } + } catch (e) { + console.error(e); + response.status(200).json({ + success: false, + content: null, + }); + } +} + module.exports = { link: resyncLink, youtube: resyncYouTube, confluence: resyncConfluence, github: resyncGithub, -} \ No newline at end of file + drupalwiki: resyncDrupalWiki, +} diff --git a/collector/utils/extensions/DrupalWiki/DrupalWiki/index.js b/collector/utils/extensions/DrupalWiki/DrupalWiki/index.js new file mode 100644 index 0000000000..43f2858eef --- /dev/null +++ b/collector/utils/extensions/DrupalWiki/DrupalWiki/index.js @@ -0,0 +1,280 @@ +/** + * Copyright 2024 + * + * Authors: + * - Eugen Mayer (KontextWork) + */ + +const { htmlToText } = require("html-to-text"); +const { tokenizeString } = require("../../../tokenizer"); +const { sanitizeFileName, writeToServerDocuments } = require("../../../files"); +const { default: slugify } = require("slugify"); +const path = require("path"); +const fs = require("fs"); + +class Page { + /** + * + * @param {number }id + * @param {string }title + * @param {string} created + * @param {string} type + * @param {string} processedBody + * @param {string} url + * @param {number} spaceId + */ + constructor({ id, title, created, type, processedBody, url, spaceId }) { + this.id = id; + this.title = title; + this.url = url; + this.created = created; + this.type = type; + this.processedBody = processedBody; + this.spaceId = spaceId; + } +} 
+ +class DrupalWiki { + /** + * + * @param baseUrl + * @param spaceId + * @param accessToken + */ + constructor({ baseUrl, accessToken }) { + this.baseUrl = baseUrl; + this.accessToken = accessToken; + this.storagePath = this.#prepareStoragePath(baseUrl); + } + + /** + * Load all pages for the given space, fetching and storing each page one by one + * to minimize the memory usage + * + * @param {number} spaceId + * @param {import("../../EncryptionWorker").EncryptionWorker} encryptionWorker + * @returns {Promise} + */ + async loadAndStoreAllPagesForSpace(spaceId, encryptionWorker) { + const pageIndex = await this.#getPageIndexForSpace(spaceId); + for (const pageId of pageIndex) { + try { + const page = await this.loadPage(pageId); + + // Pages with an empty body will lead to embedding issues / exceptions + if (page.processedBody.trim() !== "") { + this.#storePage(page, encryptionWorker); + } else { + console.log(`Skipping page (${page.id}) since it has no content`); + } + } catch (e) { + console.error( + `Could not process DrupalWiki page ${pageId} (skipping and continuing): ` + ); + console.error(e); + } + } + } + + /** + * @param {number} pageId + * @returns {Promise} + */ + async loadPage(pageId) { + return this.#fetchPage(pageId); + } + + /** + * Fetches the page ids for the given space + * @param {number} spaceId + * @returns {Promise} array of pageIds + */ + async #getPageIndexForSpace(spaceId) { + // errors while fetching the page index are fatal, no error handling + let hasNext = true; + let pageIds = []; + let pageNr = 0; + do { + let { isLast, pageIdsForPage } = await this.#getPagesForSpacePaginated( + spaceId, + pageNr + ); + hasNext = !isLast; + pageNr++; + if (pageIdsForPage.length) { + pageIds = pageIds.concat(pageIdsForPage); + } + } while (hasNext); + + return pageIds; + } + + /** + * + * @param {number} pageNr + * @param {number} spaceId + * @returns {Promise<{isLast,pageIdsForPage}>} + */ + async #getPagesForSpacePaginated(spaceId, pageNr) { + /* + * { + *
content: Page[], + * last: boolean, + * pageable: { + * pageNumber: number + * } + * } + */ + const data = await this._doFetch( + `${this.baseUrl}/api/rest/scope/api/page?size=100&space=${spaceId}&page=${pageNr}` + ); + + const pageIds = data.content.map((page) => { + return Number(page.id); + }); + + return { + isLast: data.last, + pageIdsForPage: pageIds, + }; + } + + /** + * @param pageId + * @returns {Promise} + */ + async #fetchPage(pageId) { + const data = await this._doFetch( + `${this.baseUrl}/api/rest/scope/api/page/${pageId}` + ); + const url = `${this.baseUrl}/node/${data.id}`; + return new Page({ + id: data.id, + title: data.title, + created: data.lastModified, + type: data.type, + processedBody: this.#processPageBody({ + body: data.body, + title: data.title, + lastModified: data.lastModified, + url: url, + }), + url: url, + }); + } + + /** + * @param {Page} page + * @param {import("../../EncryptionWorker").EncryptionWorker} encryptionWorker + */ + #storePage(page, encryptionWorker) { + const { hostname } = new URL(this.baseUrl); + + // This UUID will ensure that re-importing the same page without any changes will not + // show up (deduplication). + const targetUUID = `${hostname}.${page.spaceId}.${page.id}.${page.created}`; + const wordCount = page.processedBody.split(" ").length; + const tokenCount = + page.processedBody.length > 0 + ? 
tokenizeString(page.processedBody).length + : 0; + const data = { + id: targetUUID, + url: page.url, + title: page.title, + docAuthor: this.baseUrl, + description: page.title, + docSource: `${this.baseUrl} DrupalWiki`, + chunkSource: this.#generateChunkSource(page.id, encryptionWorker), + published: new Date().toLocaleString(), + wordCount: wordCount, + pageContent: page.processedBody, + token_count_estimate: tokenCount, + }; + + const fileName = sanitizeFileName(`${slugify(page.title)}-${data.id}`); + console.log( + `[DrupalWiki Loader]: Saving page '${page.title}' (${page.id}) to '${this.storagePath}/${fileName}'` + ); + writeToServerDocuments(data, fileName, this.storagePath); + } + + /** + * Generate the full chunkSource for a specific DrupalWiki page so that we can resync it later. + * This data is encrypted into a single `payload` query param so we can replay credentials later + * since this was encrypted with the system's persistent password and salt. + * @param {number} pageId + * @param {import("../../EncryptionWorker").EncryptionWorker} encryptionWorker + * @returns {string} + */ + #generateChunkSource(pageId, encryptionWorker) { + const payload = { + baseUrl: this.baseUrl, + pageId: pageId, + accessToken: this.accessToken, + }; + return `drupalwiki://${this.baseUrl}?payload=${encryptionWorker.encrypt( + JSON.stringify(payload) + )}`; + } + + async _doFetch(url) { + const response = await fetch(url, { + headers: this.#getHeaders(), + }); + if (!response.ok) { + throw new Error(`Failed to fetch ${url}: ${response.status}`); + } + return response.json(); + } + + #getHeaders() { + return { + "Content-Type": "application/json", + Accept: "application/json", + Authorization: `Bearer ${this.accessToken}`, + }; + } + + #prepareStoragePath(baseUrl) { + const { hostname } = new URL(baseUrl); + const subFolder = slugify(`drupalwiki-${hostname}`).toLowerCase(); + + const outFolder = + process.env.NODE_ENV === "development" + ?
path.resolve( + __dirname, + `../../../../server/storage/documents/${subFolder}` + ) + : path.resolve(process.env.STORAGE_DIR, `documents/${subFolder}`); + + if (!fs.existsSync(outFolder)) { + fs.mkdirSync(outFolder, { recursive: true }); + } + return outFolder; + } + + /** + * @param {string} body + * @param {string} url + * @param {string} title + * @param {string} lastModified + * @returns {string} + * @private + */ + #processPageBody({ body, url, title, lastModified }) { + // use the title as content if there is none + const textContent = body.trim() !== "" ? body : title; + + const plainTextContent = htmlToText(textContent, { + wordwrap: false, + preserveNewlines: true, + }); + // preserve structure + const plainBody = plainTextContent.replace(/\n{3,}/g, "\n\n"); + // add the link to the document + return `Link/URL: ${url}\n\n${plainBody}`; + } +} + +module.exports = { DrupalWiki }; diff --git a/collector/utils/extensions/DrupalWiki/index.js b/collector/utils/extensions/DrupalWiki/index.js new file mode 100644 index 0000000000..eddad92850 --- /dev/null +++ b/collector/utils/extensions/DrupalWiki/index.js @@ -0,0 +1,115 @@ +/** + * Copyright 2024 + * + * Authors: + * - Eugen Mayer (KontextWork) + */ + +const { DrupalWiki } = require("./DrupalWiki"); + +async function loadAndStoreSpaces( + { baseUrl = null, spaceIds = null, accessToken = null }, + response +) { + if (!baseUrl) { + return { + success: false, + reason: + "Please provide your baseUrl like https://mywiki.drupal-wiki.net.", + }; + } else if (!validBaseUrl(baseUrl)) { + return { + success: false, + reason: "Provided base URL is not a valid URL.", + }; + } + + if (!spaceIds) { + return { + success: false, + reason: + "Please provide a list of spaceIds like 21,56,67 you want to extract", + }; + } + + if (!accessToken) { + return { + success: false, + reason: "Please provide a REST API-Token.", + }; + } + + console.log(`-- Working Drupal Wiki ${baseUrl} for spaceIds: ${spaceIds} --`); + const drupalWiki 
= new DrupalWiki({ baseUrl, accessToken }); + + const encryptionWorker = response.locals.encryptionWorker; + const spaceIdsArr = spaceIds.split(",").map((idStr) => { + return Number(idStr.trim()); + }); + + for (const spaceId of spaceIdsArr) { + try { + await drupalWiki.loadAndStoreAllPagesForSpace(spaceId, encryptionWorker); + console.log(`--- Finished space ${spaceId} ---`); + } catch (e) { + console.error(e); + return { + success: false, + reason: e.message, + data: {}, + }; + } + } + console.log(`-- Finished all spaces--`); + + return { + success: true, + reason: null, + data: { + spaceIds, + destination: drupalWiki.storagePath, + }, + }; +} + +/** + * Gets the page content from a specific DrupalWiki page, not all pages in a space. + * @returns {Promise<{success: boolean, reason: string|null, content: string|null}>} + */ +async function loadPage({ baseUrl, pageId, accessToken }) { + console.log(`-- Working Drupal Wiki Page ${pageId} of ${baseUrl} --`); + const drupalWiki = new DrupalWiki({ baseUrl, accessToken }); + try { + const page = await drupalWiki.loadPage(pageId); + return { + success: true, + reason: null, + content: page.processedBody, + }; + } catch (e) { + return { + success: false, + reason: `Failed (re)-fetching DrupalWiki page ${pageId} from ${baseUrl}`, + content: null, + }; + } +} + +/** + * Validates if the provided baseUrl is a valid URL at all.
+ * @param {string} baseUrl + * @returns {boolean} + */ +function validBaseUrl(baseUrl) { + try { + new URL(baseUrl); + return true; + } catch (e) { + return false; + } +} + +module.exports = { + loadAndStoreSpaces, + loadPage, +}; diff --git a/frontend/src/components/DataConnectorOption/media/drupalwiki.jpg b/frontend/src/components/DataConnectorOption/media/drupalwiki.jpg new file mode 100644 index 0000000000000000000000000000000000000000..3bf9eeb032c11b3aa64a5c0b3b4616bff781fe16 GIT binary patch literal 7293 zcmeHLX;f3^y55S2GKvsU5mH4I8Ol)^>O^c6&{9uC2oVrcMMOyXqPpbQ3;Nr&WGOh`a_6MJ`@lg* zC+9^H=-Bv#aBhBKQ6!c~ zWlJEHznTO7{?(xWWREVeXX&zKD$CS9+OuS77+5O0%TzaRU%vWtSGAKNdYg89w?hAm zga-w$R+{WQCcvFK->kl7v(2EX@S{n;TJ*0OboGB^(QgL*W)Ic_X{sy%0aMY12$0e( z>y{bx|BL#WqvdkUkQrboEi)*0{=3ji3rST3R~_gSY~dF z!=P711O*AY+=u&+Dd^y-V$j&5a54rhe}_Rrcev7PvC)znF>;5>Lrrp>mB#}>)z5u z0YB7O-Sab>zkq*aLmn0@Eb%z_UBln*#0#F$Nf>m%t^k9+Yk`HNDVT*Lj1CUL`S&nr zv|Dz~kwJl9zY>y_nmskB+F`JZD+Xz(y`e1Zq~aw;Tfw=VWd&|1LrvCAM58kQg-zUl z-DBW?cMJb>FivqU+t)di%8w_Vrbs6*wpc6Ycvexl^4*F+T3p747cPr;_JtAsU2<** zdEzRjZdugVoaQ{UQF85Vh-9m#XVM!)zq&e;u18t7q(kFh+%o%d>Bp5fffo*iphKiQ z)+4wkzD6FuA78J-{2qf|hw`oED^9>;)lal)A2FjmE;a5P_RL2I=iL zCcb3-B;ZytagsGs+TLA!AMXP%eLapi%pE#}K^0!b^aBDNKJj{fPn6%!4koQuZHkun z;}^`?3XPy|H~Zy7QSAaes@^T648CbE&y7EVFpF$^*rJ@{G*`Ggl_6HF^g=uc0hFH1 zpYcV!XqdCth_y2KYm$B1k=pLCv%KkCG2C+^AGS~GR+vlj`Lt`sw$Fs*>zrK|^Uu_Z zH{a|cQ~2?nk|d<$tcol&t4vMgpvUbd(Dt@(7c1IMhu0`}@T!_*21Hc#W1*6N2#=|h#oP!@Z?JIV%IzKm_Md2;>Ej>+ zwz3Blb+ilJfYwS_#y`eA-*>EW*E*z!l7uYa*i`dIs$YR|bZPqp_M)^;I zZ0fnX#qFf`JabM|oZK#)it1)w%fMfMTKhDcT1)6p;KE~jsLi2LT&(0rI%(xDxSY9o6>J<4)n2}ViDqkbshPThHW$qSpT!lJF*cgT}4HrIN zDLdp-7aFl42%VWt#~`CVN^VG2o_XFhNzR$~tfsnI-}7^x#*^+2>(86DJ@z^#CN_C9 zyP5ob`DKYNKU#iU{iMDjROc-VgO)zUAkl_17<7HI%B6G|ZJ;REau#%^4f(Ld(P35i z)j|Ae@hasVg7_yXykOKmKRN{}z-06$GPNF&ZKpQLhQGN)_R2Y%5!3dqlo1q@?R3nc zIqR0i0JHh(KdaI@3}PZ5B7}J5^S9or(+~zZ*?#>_Z6;T-dJp>5 ztP2J`r-#Hcb~i~+yA!T z<7IZ$v-cKoL_5QiFhE*}*I%OGFaT8;TNhBR!)ud+1=0k`TMIP{H(Q@ zC{B@3t1V#QDz 
z=Xl&X_{|P(zuSIesT{{>=|l>*)>z@I9KN|$*;${wyYD%-&Xi?u&6PJRhcPHjnTJMS zRDQuD6>!TqNn1HD;{BtKHx4Hp5BBgvJuSqg=t@|4l@^D8o^efUsyc{RdA>I5{m!V7 zq#j9vqSyhEdkon0o^?sdeEIA{>q zi}W+b+_}s=+j-~d^1-HJ3E@CT0)uK;9qoic56@d;kUopzcrNYw?u`4?NR8mDBerRF zA4?)M1dmtj!l0Fe*Z9?*Su;sNUVMT$TRDo>QFf)Y%PMyKfRsf!1-MKeUw>~%kvnQc zd3W)p?eqm=>(@T5z9*c65y2KvBpo15+VJZG!gS{|%!_swrhxTvQdl_RrS#S#Pb~Pm zNr}~hqKbl>xz5s`q81;X6P_yx&HusV8$00>b3!+%mmIju=+y1PsP-q!mEm_=tJiB~ z6rS$Q4^B#jXY&!c6=QqdHs|k^ytM5pg$@SFyuIMmE5y}7UXDJ`CNzxZfLBte@X*`# z52VGI>6*c^%E9tbgZnMVf8j~Mk5lb)Q4DSpNz}FbJYj9)*(i$9F@i{pE{kMNMGo?& zGvngubtU^g?QmgA^cd6d!WMSvse*VO(Q<8%ghXa8st;-?ZBYYIW~H|y%DYC%{hd}p|3z$ zXD7cP_C&tH3+~ZRcNE)AMiGK;!=k2Hc&NU;@Ao%WB)dDX?}GDRlpgPtRYx@_XO)}f zfMMB<#MamUb_sL;p;+nO7_{sT1|{xk0Zy}u!k`CMQ8~ZrEG_@{+bNA<)2{~BR;*{V zBdcxpoK>($?dxaO3dMe@9spyTla%W^2iOW61t3UtvoM%fF4|+nn~loY4M~?HZWuIh zlVP4Xjo066S5Hs!-uHzq22sTcT##F@?7Hv zo}7ZdK~2#%2{kllZ~9NIpLUsJQ0p!Sno=Y5b2(LVZyeEC=rb}LZDTut*FFq+I;+$O ziinva@hMZt&CAJmr*AG=?0btWwAl4w(2s@WxC3epiEWQXFJB@8{DgCPY|Z&m_i<&0 zFGZmtiv=|F`Cv1jc+IP$OWfdBJV%h9;n|~mN)QL?BEhTGq{Ajs=|Jk!YO~g`mRpum z=eIPy7O$4P#cNN~IJR?L(;)Koy}p^+WM#vb=j9H<4bQFWb&qeuWT<3?%U80xnaOH`vI6DZ7Zp=Sen)DN_4EdYq)rDpyjc%^|a_ zTac6dA#$JHJ6fF}5sS(tsd=AoO<#67rOS^tYgI zM4vb;bg9xrO5kT*EV!uRbo@tbV0d)S)F@V88@$NV0e;uo<6e66O$0x(mP1QN!xaUU z7<8wm6jj}iC;yQJQwm@fMND1P_yggU$0@x4Zc^Y%eB~oX(3{LEkoq*^9J#F|8`Xtx zduiT1Tx?iPT}PPCQXmphk8+i7l9cEdvE_agEkZ9St>5t@xt;S4WjqM?J`q<{Vc4>G z9BJ?Gt@#!anD(8ZyBtW)*$Jy_~j$i0KZpes*%fT&mRmgMbOF}r-Yuq;|ckc}G#dv%@ZHXfO# z>>?YOcU=yLhesSTqP1;8NCo(Yw{%B&KsZzG#`k}kVi?RkQb_Vgr|!~uGVLA!@7K^R zKo7X^!=e|Rf(mLPHk^k$?W6#UjRIr%xgb>rtt1J2EALCgFUk@DwmuE)1u1Dndg|ko zl$0`S#dvU|(kVmxvj}*bbc=<7`S;_eOcXT}vx3grWJH_J?%hFr55o0ynmVIEY+%Lt zidzy2-&9DeoYk3-b8Xvy2(D4)mx2wAwa98Wg50Ws9!183P8IJX#MJju(oVAKRLt8j zC~ZfqtYWLyhSxTMuw}Lhqrz6n-Rmmi>5W}hx33Kd?mT8KKW5jCK{eof#_Z>7r@a(C z5*`q&E~vRA@t9Txo@4VT4C?$0C1KDzYAYbmR@^k!Ko%5w6b1mBx*<=%2nd5R|9^#a8zv};>G38 z>xxKaVFeCtl%1XnPp=XcP6*jE3rFloFUk{;(qT3TM?`Vwz-TPt9P<`xeItL7cC&v= 
z1(#N?GLg8VNulbE8)+q)R>g0A+Oj9~Oi=ZeeUqXRC7bRl_YCh2wK{5KUD;up;?le+ z@#3Y4K2N0F1lA*8arMs75(IOEYE{X`c)?712#abkXK5$8RQvKnAT&cb({yz=ds;)t ztn6RWG8DvZW%-`S6Vd_^o4fW3wZ&da^XQE-`&`MM$`aq0oY%!Ym(qib^2;m~+q0%! zDLWaj2nGJksq6udl6FmU1);;$+r9L5uuG`Cj%Eml!m&s!=mZzzp$oiK02vPmw<@oJ z4#Jx}JVlx4RQ4#p_+tlaeF}p-keN=Nphhr(PY($Ynqg3H<>V}(dn+LBztA^6=Ol2C zrJ(vnoJr=Uv30GDiCMWLd?U9NJ9gYPj6R5hvd!IVl2A9>Awf z)1nR5pNKCx@~JSvS-CTDgrbNHjZ&fLGu{}RqK!h)vC&LevCuPU?!%gb4X(~&N_i~+uIK`9K?bEL_Q_4SRt zsTF4eELWgfg;`2Gl`=Tx?{)DMGlRHHV^ zb>Ls5VE(e4jo$H+R#g1u_bhig0+Gibc>7;fmS5J}8{SDaOB7{)Xbn#WvB&x+MP`z9 z5uvtKU?!&6W7)d%@aA~x{7(CRTfE{=;-6_UQ^p3M*u5b1f?abll1-4DoV~(Z`0`^f za-|5a)S)j2^9}MKsDmGMM(v8}dj|T#s_pCie;n(H=7!hYw2_ljF=#f)3ymv6x=jYE zuTPq0g4Rfb#U}1_xkFU0^eV?Y3!X9ue_3O+^~-{m0pA2q@68rc^LJ`|{LhHQ)V>;j zD`l0B8pt>pbg2T!V?xYPMQ-%VJIgqZcJ4gPRra-~I^tY!`^{9Pet9g#=IS3)2?(aI(Q1M= zZNX=cyO5-?4z^v}(cfU|^Q4&-e2&m}rWeTO&ba&T@q4xYOe>1)*4pM?zim#LI4(2G zLbef3shee8`De?Va=h|!*ZZCmKi=)<+ub?@3iL^-C(z%6X%6cFy|C4k%0vvh#Xg$F z&GqHr2kY*=e!4!M#&_DW7eUwM4ziURt6<(ZL6S!pOk;d}q(DuH{_PQtOk{yFzw+&%Qy6+B5^>un*!IW zwS5cEj}Ro9M9_pspOa8A=!+`6SVlz6nUXV^(a3@&a6bGK81S*$%(5rD*ixyrVueHY zH4Mt!4l>|W1gzLetfa`+e1bs;7)68Oxs3v(--A;+a^=TZ`95&O`@VsjYT<#;7dPM) zQYNaR2QtVs#j0+FZCZ3K>WYJ5GAU`v6-TU}0<94WK y``GsCXL)&MBww6IPS&&gUP(yor+$~+w>6_1B0xL(B>p$@{d-gTfACue>;4ZY(9y>L literal 0 HcmV?d00001 diff --git a/frontend/src/components/DataConnectorOption/media/index.js b/frontend/src/components/DataConnectorOption/media/index.js index cbc80b642d..b0c5f883e1 100644 --- a/frontend/src/components/DataConnectorOption/media/index.js +++ b/frontend/src/components/DataConnectorOption/media/index.js @@ -3,6 +3,7 @@ import Gitlab from "./gitlab.svg"; import YouTube from "./youtube.svg"; import Link from "./link.svg"; import Confluence from "./confluence.jpeg"; +import DrupalWiki from "./drupalwiki.jpg"; const ConnectorImages = { github: Github, @@ -10,6 +11,7 @@ const ConnectorImages = { youtube: YouTube, websiteDepth: Link, confluence: Confluence, + drupalwiki: 
DrupalWiki, }; export default ConnectorImages; diff --git a/frontend/src/components/Modals/ManageWorkspace/DataConnectors/Connectors/DrupalWiki/index.jsx b/frontend/src/components/Modals/ManageWorkspace/DataConnectors/Connectors/DrupalWiki/index.jsx new file mode 100644 index 0000000000..f21172da77 --- /dev/null +++ b/frontend/src/components/Modals/ManageWorkspace/DataConnectors/Connectors/DrupalWiki/index.jsx @@ -0,0 +1,192 @@ +/** + * Copyright 2024 + * + * Authors: + * - Eugen Mayer (KontextWork) + */ + +import { useState } from "react"; +import System from "@/models/system"; +import showToast from "@/utils/toast"; +import { Warning } from "@phosphor-icons/react"; +import { Tooltip } from "react-tooltip"; + +export default function DrupalWikiOptions() { + const [loading, setLoading] = useState(false); + + const handleSubmit = async (e) => { + e.preventDefault(); + const form = new FormData(e.target); + + try { + setLoading(true); + showToast( + "Fetching all pages for the given Drupal Wiki spaces - this may take a while.", + "info", + { + clear: true, + autoClose: false, + } + ); + const { data, error } = await System.dataConnectors.drupalwiki.collect({ + baseUrl: form.get("baseUrl"), + spaceIds: form.get("spaceIds"), + accessToken: form.get("accessToken"), + }); + + if (!!error) { + showToast(error, "error", { clear: true }); + setLoading(false); + return; + } + + showToast( + `Pages collected from Drupal Wiki spaces ${data.spaceIds}. Output folder is ${data.destination}.`, + "success", + { clear: true } + ); + e.target.reset(); + setLoading(false); + } catch (e) { + console.error(e); + showToast(e.message, "error", { clear: true }); + setLoading(false); + } + }; + + return ( +
+
+
+
+
+
+
+ +

+ This is the base URL of your  + e.stopPropagation()} + > + Drupal Wiki + + . +

+
+ +
+
+
+ +

+ Comma separated Space IDs you want to extract. See the  + e.stopPropagation()} + > + manual + +   on how to retrieve the Space IDs. Be sure that your + 'API-Token User' has access to those spaces. +

+
+ +
+
+
+ +

+ Access token for authentication. +

+
+ +
+
+
+ +
+ + {loading && ( +

+ Once complete, all pages will be available for embedding into + workspaces. +

+ )} +
+
+
+
+ ); +} diff --git a/frontend/src/components/Modals/ManageWorkspace/DataConnectors/index.jsx b/frontend/src/components/Modals/ManageWorkspace/DataConnectors/index.jsx index 647a026f6f..3baafa7512 100644 --- a/frontend/src/components/Modals/ManageWorkspace/DataConnectors/index.jsx +++ b/frontend/src/components/Modals/ManageWorkspace/DataConnectors/index.jsx @@ -4,6 +4,7 @@ import GithubOptions from "./Connectors/Github"; import GitlabOptions from "./Connectors/Gitlab"; import YoutubeOptions from "./Connectors/Youtube"; import ConfluenceOptions from "./Connectors/Confluence"; +import DrupalWikiOptions from "./Connectors/DrupalWiki"; import { useState } from "react"; import ConnectorOption from "./ConnectorOption"; import WebsiteDepthOptions from "./Connectors/WebsiteDepth"; @@ -42,6 +43,12 @@ export const DATA_CONNECTORS = { description: "Import an entire Confluence page in a single click.", options: , }, + drupalwiki: { + name: "Drupal Wiki", + image: ConnectorImages.drupalwiki, + description: "Import Drupal Wiki spaces in a single click.", + options: , + }, }; export default function DataConnectors() { diff --git a/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/Citation/index.jsx b/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/Citation/index.jsx index b2a6f73f23..ec30f95916 100644 --- a/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/Citation/index.jsx +++ b/frontend/src/components/WorkspaceChat/ChatContainer/ChatHistory/Citation/index.jsx @@ -224,6 +224,11 @@ function parseChunkSource({ title = "", chunks = [] }) { icon = "confluence"; } + if (url.host.includes("drupal-wiki.net")) { + text = title; + icon = "drupalwiki"; + } + return { isUrl: true, href: url.toString(), diff --git a/frontend/src/models/dataConnector.js b/frontend/src/models/dataConnector.js index 30c17fb12c..60924179e9 100644 --- a/frontend/src/models/dataConnector.js +++ b/frontend/src/models/dataConnector.js @@ -160,6 +160,29 @@ const 
DataConnector = { }); }, }, + + drupalwiki: { + collect: async function ({ baseUrl, spaceIds, accessToken }) { + return await fetch(`${API_BASE}/ext/drupalwiki`, { + method: "POST", + headers: baseHeaders(), + body: JSON.stringify({ + baseUrl, + spaceIds, + accessToken, + }), + }) + .then((res) => res.json()) + .then((res) => { + if (!res.success) throw new Error(res.reason); + return { data: res.data, error: null }; + }) + .catch((e) => { + console.error(e); + return { data: null, error: e.message }; + }); + }, + }, }; export default DataConnector; diff --git a/server/endpoints/extensions/index.js b/server/endpoints/extensions/index.js index 8f836ce071..7bfff06724 100644 --- a/server/endpoints/extensions/index.js +++ b/server/endpoints/extensions/index.js @@ -127,6 +127,27 @@ function extensionEndpoints(app) { } } ); + app.post( + "/ext/drupalwiki", + [validatedRequest, flexUserRoleValid([ROLES.admin, ROLES.manager])], + async (request, response) => { + try { + const responseFromProcessor = + await new CollectorApi().forwardExtensionRequest({ + endpoint: "/ext/drupalwiki", + method: "POST", + body: request.body, + }); + await Telemetry.sendTelemetry("extension_invoked", { + type: "drupalwiki", + }); + response.status(200).json(responseFromProcessor); + } catch (e) { + console.error(e); + response.sendStatus(500).end(); + } + } + ); } module.exports = { extensionEndpoints }; diff --git a/server/jobs/sync-watched-documents.js b/server/jobs/sync-watched-documents.js index c4f235a0d9..2c8d59236c 100644 --- a/server/jobs/sync-watched-documents.js +++ b/server/jobs/sync-watched-documents.js @@ -46,7 +46,7 @@ const { DocumentSyncRun } = require('../models/documentSyncRun.js'); newContent = response?.content; } - if (type === 'confluence' || type === 'github') { + if (type === 'confluence' || type === 'github' || type === 'drupalwiki' ) { const response = await collector.forwardExtensionRequest({ endpoint: "/ext/resync-source-document", method: "POST", diff --git 
a/server/models/documentSyncQueue.js b/server/models/documentSyncQueue.js index 0ebaa05294..cceceaecd0 100644 --- a/server/models/documentSyncQueue.js +++ b/server/models/documentSyncQueue.js @@ -10,7 +10,7 @@ const { Telemetry } = require("./telemetry"); const DocumentSyncQueue = { featureKey: "experimental_live_file_sync", // update the validFileTypes and .canWatch properties when adding elements here. - validFileTypes: ["link", "youtube", "confluence", "github"], + validFileTypes: ["link", "youtube", "confluence", "github", "drupalwiki"], defaultStaleAfter: 604800000, maxRepeatFailures: 5, // How many times a run can fail in a row before pruning. writable: [],