From f0d2b4f4e5681821c8c2c36d49412c8c0dd04c98 Mon Sep 17 00:00:00 2001 From: reach0908 Date: Fri, 4 Jul 2025 19:18:58 +0900 Subject: [PATCH 01/28] =?UTF-8?q?feat(scraper):=20Puppeteer=20=EA=B8=B0?= =?UTF-8?q?=EB=B0=98=EC=9D=98=20=EB=B8=8C=EB=9D=BC=EC=9A=B0=EC=A0=80=20?= =?UTF-8?q?=EC=84=9C=EB=B9=84=EC=8A=A4=20=EB=B0=8F=20=EC=8A=A4=ED=81=AC?= =?UTF-8?q?=EB=9E=98=ED=8D=BC=20=EB=AA=A8=EB=93=88=20=EC=B6=94=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- package-lock.json | 1171 ++++++++++++++++- package.json | 4 + src/config/app.config.ts | 1 + src/modules/scraper/scraper.module.ts | 8 + .../scraper/services/browser.service.ts | 112 ++ .../scraper/services/puppeteer-parse.ts | 0 6 files changed, 1239 insertions(+), 57 deletions(-) create mode 100644 src/modules/scraper/scraper.module.ts create mode 100644 src/modules/scraper/services/browser.service.ts create mode 100644 src/modules/scraper/services/puppeteer-parse.ts diff --git a/package-lock.json b/package-lock.json index 346afa7..2fb4a1d 100644 --- a/package-lock.json +++ b/package-lock.json @@ -24,6 +24,10 @@ "helmet": "^8.1.0", "passport": "^0.7.0", "passport-google-oauth20": "^2.0.0", + "puppeteer-core": "^24.11.2", + "puppeteer-extra": "^3.3.6", + "puppeteer-extra-plugin-adblocker": "^2.13.6", + "puppeteer-extra-plugin-stealth": "^2.11.2", "reflect-metadata": "^0.2.2", "rxjs": "^7.8.1", "swagger-ui-express": "^5.0.1" @@ -223,7 +227,6 @@ "version": "7.27.1", "resolved": "https://registry.npmjs.org/@babel/code-frame/-/code-frame-7.27.1.tgz", "integrity": "sha512-cjQ7ZlQ0Mv3b47hABuTevyTuYN4i+loJKGeV9flcCgIK37cCXRh+L1bd3iBHlynerhQ7BhCkn2BPbQUL+rGqFg==", - "dev": true, "license": "MIT", "dependencies": { "@babel/helper-validator-identifier": "^7.27.1", @@ -385,7 +388,6 @@ "version": "7.27.1", "resolved": "https://registry.npmjs.org/@babel/helper-validator-identifier/-/helper-validator-identifier-7.27.1.tgz", "integrity": 
"sha512-D2hP9eA+Sqx1kBZgzxZh0y1trbuU+JoDkiEwqhQ36nodYqJwyEIhPSdMNd7lOm/4io72luTPWH20Yda0xOuUow==", - "dev": true, "license": "MIT", "engines": { "node": ">=6.9.0" @@ -735,6 +737,64 @@ "dev": true, "license": "MIT" }, + "node_modules/@cliqz/adblocker": { + "version": "1.34.0", + "resolved": "https://registry.npmjs.org/@cliqz/adblocker/-/adblocker-1.34.0.tgz", + "integrity": "sha512-d7TeUl5t+TOMJe7/CRYtf+x6hbd8N25DtH7guQTIjjr3AFVortxiAIgNejGvVqy0by4eNByw+oVil15oqxz2Eg==", + "deprecated": "This project has been renamed to @ghostery/adblocker. Install using @ghostery/adblocker instead", + "dependencies": { + "@cliqz/adblocker-content": "^1.34.0", + "@cliqz/adblocker-extended-selectors": "^1.34.0", + "@remusao/guess-url-type": "^1.3.0", + "@remusao/small": "^1.2.1", + "@remusao/smaz": "^1.9.1", + "@types/chrome": "^0.0.278", + "@types/firefox-webext-browser": "^120.0.0", + "tldts-experimental": "^6.0.14" + } + }, + "node_modules/@cliqz/adblocker-content": { + "version": "1.34.0", + "resolved": "https://registry.npmjs.org/@cliqz/adblocker-content/-/adblocker-content-1.34.0.tgz", + "integrity": "sha512-5LcV8UZv49RWwtpom9ve4TxJIFKd+bjT59tS/2Z2c22Qxx5CW1ncO/T+ybzk31z422XplQfd0ZE6gMGGKs3EMg==", + "deprecated": "This project has been renamed to @ghostery/adblocker-content. Install using @ghostery/adblocker-content instead", + "dependencies": { + "@cliqz/adblocker-extended-selectors": "^1.34.0" + } + }, + "node_modules/@cliqz/adblocker-extended-selectors": { + "version": "1.34.0", + "resolved": "https://registry.npmjs.org/@cliqz/adblocker-extended-selectors/-/adblocker-extended-selectors-1.34.0.tgz", + "integrity": "sha512-lNrgdUPpsBWHjrwXy2+Z5nX/Gy5YAvNwFMLqkeMdjzrybwPIalJJN2e+YtkS1I6mVmOMNppF5cv692OAVoI74g==", + "deprecated": "This project has been renamed to @ghostery/adblocker-extended-selectors. 
Install using @ghostery/adblocker-extended-selectors instead" + }, + "node_modules/@cliqz/adblocker-puppeteer": { + "version": "1.23.8", + "resolved": "https://registry.npmjs.org/@cliqz/adblocker-puppeteer/-/adblocker-puppeteer-1.23.8.tgz", + "integrity": "sha512-Ca1/DBqQXsOpKTFVAHX6OpLTSEupXmUkUWHj6iXhLLleC7RPISN5B0b801VDmaGRqoC5zKRxn0vYbIfpgCWVug==", + "deprecated": "This project has been renamed to @ghostery/adblocker-puppeteer. Install using @ghostery/adblocker-puppeteer instead", + "dependencies": { + "@cliqz/adblocker": "^1.23.8", + "@cliqz/adblocker-content": "^1.23.8", + "tldts-experimental": "^5.6.21" + }, + "peerDependencies": { + "puppeteer": ">5" + } + }, + "node_modules/@cliqz/adblocker/node_modules/tldts-core": { + "version": "6.1.86", + "resolved": "https://registry.npmjs.org/tldts-core/-/tldts-core-6.1.86.tgz", + "integrity": "sha512-Je6p7pkk+KMzMv2XXKmAE3McmolOQFdxkKw0R8EYNr7sELW46JqnNeTX8ybPiQgvg1ymCoF8LXs5fzFaZvJPTA==" + }, + "node_modules/@cliqz/adblocker/node_modules/tldts-experimental": { + "version": "6.1.86", + "resolved": "https://registry.npmjs.org/tldts-experimental/-/tldts-experimental-6.1.86.tgz", + "integrity": "sha512-X3N3+SrwSajvANDyIBFa6tf/nO0VoqaXvvINSnQkZMGbzNlD+9G7Xb24Mtk3ZBVZJRGY7UynAJJL8kRVt6Z46Q==", + "dependencies": { + "tldts-core": "^6.1.86" + } + }, "node_modules/@colors/colors": { "version": "1.5.0", "resolved": "https://registry.npmjs.org/@colors/colors/-/colors-1.5.0.tgz", @@ -2998,6 +3058,63 @@ "@prisma/debug": "6.10.1" } }, + "node_modules/@puppeteer/browsers": { + "version": "2.10.5", + "resolved": "https://registry.npmjs.org/@puppeteer/browsers/-/browsers-2.10.5.tgz", + "integrity": "sha512-eifa0o+i8dERnngJwKrfp3dEq7ia5XFyoqB17S4gK8GhsQE4/P8nxOfQSE0zQHxzzLo/cmF+7+ywEQ7wK7Fb+w==", + "dependencies": { + "debug": "^4.4.1", + "extract-zip": "^2.0.1", + "progress": "^2.0.3", + "proxy-agent": "^6.5.0", + "semver": "^7.7.2", + "tar-fs": "^3.0.8", + "yargs": "^17.7.2" + }, + "bin": { + "browsers": "lib/cjs/main-cli.js" + }, 
+ "engines": { + "node": ">=18" + } + }, + "node_modules/@remusao/guess-url-type": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/@remusao/guess-url-type/-/guess-url-type-1.3.0.tgz", + "integrity": "sha512-SNSJGxH5ckvxb3EUHj4DqlAm/bxNxNv2kx/AESZva/9VfcBokwKNS+C4D1lQdWIDM1R3d3UG+xmVzlkNG8CPTQ==" + }, + "node_modules/@remusao/small": { + "version": "1.3.0", + "resolved": "https://registry.npmjs.org/@remusao/small/-/small-1.3.0.tgz", + "integrity": "sha512-bydAhJI+ywmg5xMUcbqoR8KahetcfkFywEZpsyFZ8EBofilvWxbXnMSe4vnjDI1Y+SWxnNhR4AL/2BAXkf4b8A==" + }, + "node_modules/@remusao/smaz": { + "version": "1.10.0", + "resolved": "https://registry.npmjs.org/@remusao/smaz/-/smaz-1.10.0.tgz", + "integrity": "sha512-GQzCxmmMpLkyZwcwNgz8TpuBEWl0RUQa8IcvKiYlPxuyYKqyqPkCr0hlHI15ckn3kDUPS68VmTVgyPnLNrdVmg==", + "dependencies": { + "@remusao/smaz-compress": "^1.10.0", + "@remusao/smaz-decompress": "^1.10.0" + } + }, + "node_modules/@remusao/smaz-compress": { + "version": "1.10.0", + "resolved": "https://registry.npmjs.org/@remusao/smaz-compress/-/smaz-compress-1.10.0.tgz", + "integrity": "sha512-E/lC8OSU+3bQrUl64vlLyPzIxo7dxF2RvNBe9KzcM4ax43J/d+YMinmMztHyCIHqRbz7rBCtkp3c0KfeIbHmEg==", + "dependencies": { + "@remusao/trie": "^1.5.0" + } + }, + "node_modules/@remusao/smaz-decompress": { + "version": "1.10.0", + "resolved": "https://registry.npmjs.org/@remusao/smaz-decompress/-/smaz-decompress-1.10.0.tgz", + "integrity": "sha512-aA5ImUH480Pcs5/cOgToKmFnzi7osSNG6ft+7DdmQTaQEEst3nLq3JLlBEk+gwidURymjbx6DYs60LHaZ415VQ==" + }, + "node_modules/@remusao/trie": { + "version": "1.5.0", + "resolved": "https://registry.npmjs.org/@remusao/trie/-/trie-1.5.0.tgz", + "integrity": "sha512-UX+3utJKgwCsg6sUozjxd38gNMVRXrY4TNX9VvCdSrlZBS1nZjRPi98ON3QjRAdf6KCguJFyQARRsulTeqQiPg==" + }, "node_modules/@scarf/scarf": { "version": "1.4.0", "resolved": "https://registry.npmjs.org/@scarf/scarf/-/scarf-1.4.0.tgz", @@ -3385,6 +3502,11 @@ "integrity": 
"sha512-OvjF+z51L3ov0OyAU0duzsYuvO01PH7x4t6DJx+guahgTnBHkhJdG7soQeTSFLWN3efnHyibZ4Z8l2EuWwJN3A==", "license": "MIT" }, + "node_modules/@tootallnate/quickjs-emscripten": { + "version": "0.23.0", + "resolved": "https://registry.npmjs.org/@tootallnate/quickjs-emscripten/-/quickjs-emscripten-0.23.0.tgz", + "integrity": "sha512-C5Mc6rdnsaJDjO3UpGW/CQTHtCKaYlScZTly4JIu97Jxo/odCiH0ITnDXSJPTOrEKk/ycSZ0AOgTmkDtkOsvIA==" + }, "node_modules/@tsconfig/node10": { "version": "1.0.11", "resolved": "https://registry.npmjs.org/@tsconfig/node10/-/node10-1.0.11.tgz", @@ -3469,6 +3591,15 @@ "@types/node": "*" } }, + "node_modules/@types/chrome": { + "version": "0.0.278", + "resolved": "https://registry.npmjs.org/@types/chrome/-/chrome-0.0.278.tgz", + "integrity": "sha512-PDIJodOu7o54PpSOYLybPW/MDZBCjM1TKgf31I3Q/qaEbNpIH09rOM3tSEH3N7Q+FAqb1933LhF8ksUPYeQLNg==", + "dependencies": { + "@types/filesystem": "*", + "@types/har-format": "*" + } + }, "node_modules/@types/connect": { "version": "3.4.38", "resolved": "https://registry.npmjs.org/@types/connect/-/connect-3.4.38.tgz", @@ -3495,6 +3626,14 @@ "dev": true, "license": "MIT" }, + "node_modules/@types/debug": { + "version": "4.1.12", + "resolved": "https://registry.npmjs.org/@types/debug/-/debug-4.1.12.tgz", + "integrity": "sha512-vIChWdVG3LG1SMxEvI/AK+FWJthlrqlTu7fbrlywTkkaONwk/UAGaULXRlf8vkzFBLVm0zkMdCquhL5aOjhXPQ==", + "dependencies": { + "@types/ms": "*" + } + }, "node_modules/@types/eslint": { "version": "9.6.1", "resolved": "https://registry.npmjs.org/@types/eslint/-/eslint-9.6.1.tgz", @@ -3549,6 +3688,24 @@ "@types/send": "*" } }, + "node_modules/@types/filesystem": { + "version": "0.0.36", + "resolved": "https://registry.npmjs.org/@types/filesystem/-/filesystem-0.0.36.tgz", + "integrity": "sha512-vPDXOZuannb9FZdxgHnqSwAG/jvdGM8Wq+6N4D/d80z+D4HWH+bItqsZaVRQykAn6WEVeEkLm2oQigyHtgb0RA==", + "dependencies": { + "@types/filewriter": "*" + } + }, + "node_modules/@types/filewriter": { + "version": "0.0.33", + "resolved": 
"https://registry.npmjs.org/@types/filewriter/-/filewriter-0.0.33.tgz", + "integrity": "sha512-xFU8ZXTw4gd358lb2jw25nxY9QAgqn2+bKKjKOYfNCzN4DKCFetK7sPtrlpg66Ywe3vWY9FNxprZawAh9wfJ3g==" + }, + "node_modules/@types/firefox-webext-browser": { + "version": "120.0.4", + "resolved": "https://registry.npmjs.org/@types/firefox-webext-browser/-/firefox-webext-browser-120.0.4.tgz", + "integrity": "sha512-lBrpf08xhiZBigrtdQfUaqX1UauwZ+skbFiL8u2Tdra/rklkKadYmIzTwkNZSWtuZ7OKpFqbE2HHfDoFqvZf6w==" + }, "node_modules/@types/graceful-fs": { "version": "4.1.9", "resolved": "https://registry.npmjs.org/@types/graceful-fs/-/graceful-fs-4.1.9.tgz", @@ -3559,6 +3716,11 @@ "@types/node": "*" } }, + "node_modules/@types/har-format": { + "version": "1.2.16", + "resolved": "https://registry.npmjs.org/@types/har-format/-/har-format-1.2.16.tgz", + "integrity": "sha512-fluxdy7ryD3MV6h8pTfTYpy/xQzCFC7m89nOH9y94cNqJ1mDIDPut7MnRHI3F6qRmh/cT2fUjG1MLdCNb4hE9A==" + }, "node_modules/@types/http-cache-semantics": { "version": "4.0.4", "resolved": "https://registry.npmjs.org/@types/http-cache-semantics/-/http-cache-semantics-4.0.4.tgz", @@ -3640,6 +3802,11 @@ "dev": true, "license": "MIT" }, + "node_modules/@types/ms": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/@types/ms/-/ms-2.1.0.tgz", + "integrity": "sha512-GsCCIZDE/p3i96vtEqx+7dBUGXrc7zeSK3wwPHIaRThS+9OhWIXRqzs4d6k1SVU8g91DrNRWxWUGhp5KXQb2VA==" + }, "node_modules/@types/node": { "version": "22.15.33", "resolved": "https://registry.npmjs.org/@types/node/-/node-22.15.33.tgz", @@ -3783,6 +3950,15 @@ "dev": true, "license": "MIT" }, + "node_modules/@types/yauzl": { + "version": "2.10.3", + "resolved": "https://registry.npmjs.org/@types/yauzl/-/yauzl-2.10.3.tgz", + "integrity": "sha512-oJoftv0LSuaDZE3Le4DbKX+KS9G36NzOeSap90UIK0yMA/NhKJhqlSGtNDORNRaIbQfzjXDrQa0ytJ6mNRGz/Q==", + "optional": true, + "dependencies": { + "@types/node": "*" + } + }, "node_modules/@typescript-eslint/eslint-plugin": { "version": "8.35.0", "resolved": 
"https://registry.npmjs.org/@typescript-eslint/eslint-plugin/-/eslint-plugin-8.35.0.tgz", @@ -4824,6 +5000,14 @@ "node": ">=0.4.0" } }, + "node_modules/agent-base": { + "version": "7.1.3", + "resolved": "https://registry.npmjs.org/agent-base/-/agent-base-7.1.3.tgz", + "integrity": "sha512-jRR5wdylq8CkOe6hei19GGZnxM6rBGwFl3Bg0YItGDimvjGtAvdZk4Pu6Cl4u4Igsws4a1fd1Vq3ezrhn4KmFw==", + "engines": { + "node": ">= 14" + } + }, "node_modules/ajv": { "version": "6.12.6", "resolved": "https://registry.npmjs.org/ajv/-/ajv-6.12.6.tgz", @@ -4936,7 +5120,6 @@ "version": "4.3.0", "resolved": "https://registry.npmjs.org/ansi-styles/-/ansi-styles-4.3.0.tgz", "integrity": "sha512-zbB9rCJAT1rbjiVDb2hqKFHNYLxgtk8NURxZ3IZwD3F6NtxbXZQCnnSi1Lkx+IDohdPlFp222wVALIheZJQSEg==", - "dev": true, "license": "MIT", "dependencies": { "color-convert": "^2.0.1" @@ -5025,6 +5208,14 @@ "integrity": "sha512-8+9WqebbFzpX9OR+Wa6O29asIogeRMzcGtAINdpMHHyAg10f05aSFVBbcEqGf/PXw1EjAZ+q2/bEBg3DvurK3Q==", "license": "Python-2.0" }, + "node_modules/arr-union": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/arr-union/-/arr-union-3.1.0.tgz", + "integrity": "sha512-sKpyeERZ02v1FeCZT8lrfJq5u6goHCtpTAzPwJYe7c8SPFOboNjNg1vz2L4VTn9T4PQxEx13TbXLmYUcS6Ug7Q==", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/array-timsort": { "version": "1.0.3", "resolved": "https://registry.npmjs.org/array-timsort/-/array-timsort-1.0.3.tgz", @@ -5039,6 +5230,17 @@ "dev": true, "license": "MIT" }, + "node_modules/ast-types": { + "version": "0.13.4", + "resolved": "https://registry.npmjs.org/ast-types/-/ast-types-0.13.4.tgz", + "integrity": "sha512-x1FCFnFifvYDDzTaLII71vG5uvDwgtmDTEVWAxrgeiR8VjMONcCXJx7E+USjDtHlwFmt9MysbqgF9b9Vjr6w+w==", + "dependencies": { + "tslib": "^2.0.1" + }, + "engines": { + "node": ">=4" + } + }, "node_modules/async": { "version": "3.2.6", "resolved": "https://registry.npmjs.org/async/-/async-3.2.6.tgz", @@ -5057,7 +5259,6 @@ "version": "1.6.7", "resolved": 
"https://registry.npmjs.org/b4a/-/b4a-1.6.7.tgz", "integrity": "sha512-OnAYlL5b7LEkALw87fUVafQw5rVR9RjwGd4KUwNQ6DrrNmaVaUCgLipfVlzrPQ4tWOR9P0IXGNOx50jYCCdSJg==", - "dev": true, "license": "Apache-2.0" }, "node_modules/babel-jest": { @@ -5190,17 +5391,76 @@ "version": "1.0.2", "resolved": "https://registry.npmjs.org/balanced-match/-/balanced-match-1.0.2.tgz", "integrity": "sha512-3oSeUO0TMV67hN1AmbXsK4yaqU7tjiHlbxRDZOpH0KW9+CeX4bRAaX0Anxt0tx2MrpRpWwQaPwIlISEJhYU5Pw==", - "dev": true, "license": "MIT" }, "node_modules/bare-events": { "version": "2.5.4", "resolved": "https://registry.npmjs.org/bare-events/-/bare-events-2.5.4.tgz", "integrity": "sha512-+gFfDkR8pj4/TrWCGUGWmJIkBwuxPS5F+a5yWjOHQt2hHvNZd5YLzadjmDUtFmMM4y429bnKLa8bYBMHcYdnQA==", - "dev": true, "license": "Apache-2.0", "optional": true }, + "node_modules/bare-fs": { + "version": "4.1.5", + "resolved": "https://registry.npmjs.org/bare-fs/-/bare-fs-4.1.5.tgz", + "integrity": "sha512-1zccWBMypln0jEE05LzZt+V/8y8AQsQQqxtklqaIyg5nu6OAYFhZxPXinJTSG+kU5qyNmeLgcn9AW7eHiCHVLA==", + "optional": true, + "dependencies": { + "bare-events": "^2.5.4", + "bare-path": "^3.0.0", + "bare-stream": "^2.6.4" + }, + "engines": { + "bare": ">=1.16.0" + }, + "peerDependencies": { + "bare-buffer": "*" + }, + "peerDependenciesMeta": { + "bare-buffer": { + "optional": true + } + } + }, + "node_modules/bare-os": { + "version": "3.6.1", + "resolved": "https://registry.npmjs.org/bare-os/-/bare-os-3.6.1.tgz", + "integrity": "sha512-uaIjxokhFidJP+bmmvKSgiMzj2sV5GPHaZVAIktcxcpCyBFFWO+YlikVAdhmUo2vYFvFhOXIAlldqV29L8126g==", + "optional": true, + "engines": { + "bare": ">=1.14.0" + } + }, + "node_modules/bare-path": { + "version": "3.0.0", + "resolved": "https://registry.npmjs.org/bare-path/-/bare-path-3.0.0.tgz", + "integrity": "sha512-tyfW2cQcB5NN8Saijrhqn0Zh7AnFNsnczRcuWODH0eYAXBsJ5gVxAUuNr7tsHSC6IZ77cA0SitzT+s47kot8Mw==", + "optional": true, + "dependencies": { + "bare-os": "^3.0.1" + } + }, + "node_modules/bare-stream": { + "version": 
"2.6.5", + "resolved": "https://registry.npmjs.org/bare-stream/-/bare-stream-2.6.5.tgz", + "integrity": "sha512-jSmxKJNJmHySi6hC42zlZnq00rga4jjxcgNZjY9N5WlOe/iOoGRtdwGsHzQv2RlH2KOYMwGUXhf2zXd32BA9RA==", + "optional": true, + "dependencies": { + "streamx": "^2.21.0" + }, + "peerDependencies": { + "bare-buffer": "*", + "bare-events": "*" + }, + "peerDependenciesMeta": { + "bare-buffer": { + "optional": true + }, + "bare-events": { + "optional": true + } + } + }, "node_modules/base64-js": { "version": "1.5.1", "resolved": "https://registry.npmjs.org/base64-js/-/base64-js-1.5.1.tgz", @@ -5231,6 +5491,14 @@ "node": ">=6.0.0" } }, + "node_modules/basic-ftp": { + "version": "5.0.5", + "resolved": "https://registry.npmjs.org/basic-ftp/-/basic-ftp-5.0.5.tgz", + "integrity": "sha512-4Bcg1P8xhUuqcii/S0Z9wiHIrQVPMermM1any+MX5GeGD7faD3/msQUDGLol9wOcz4/jbg/WJnGqoJF6LiBdtg==", + "engines": { + "node": ">=10.0.0" + } + }, "node_modules/bin-version": { "version": "6.0.0", "resolved": "https://registry.npmjs.org/bin-version/-/bin-version-6.0.0.tgz", @@ -5302,7 +5570,6 @@ "version": "1.1.12", "resolved": "https://registry.npmjs.org/brace-expansion/-/brace-expansion-1.1.12.tgz", "integrity": "sha512-9T9UjW3r0UW5c1Q7GTwllptXwhvYmEzFhzMfZ9H7FQWt+uZePjZPjBP/W1ZEyZ1twGWom5/56TF4lPcqjnDHcg==", - "dev": true, "license": "MIT", "dependencies": { "balanced-match": "^1.0.0", @@ -5407,7 +5674,6 @@ "version": "0.2.13", "resolved": "https://registry.npmjs.org/buffer-crc32/-/buffer-crc32-0.2.13.tgz", "integrity": "sha512-VO9Ht/+p3SN7SKWqcrgEzjGbRSJYTx+Q1pTQC0wrWqHx0vpJraQ6GtHx8tvcg1rlK1byhU5gccxgOgj7B0TDkQ==", - "dev": true, "license": "MIT", "engines": { "node": "*" @@ -5506,7 +5772,6 @@ "version": "3.1.0", "resolved": "https://registry.npmjs.org/callsites/-/callsites-3.1.0.tgz", "integrity": "sha512-P8BjAsXvZS+VIDUI11hHCQEv74YT67YUi5JJFNWIqL235sBmjX4+qx9Muvls5ivyNENctx46xQLQ3aTuE7ssaQ==", - "dev": true, "license": "MIT", "engines": { "node": ">=6" @@ -5603,6 +5868,18 @@ "node": ">=6.0" } }, + 
"node_modules/chromium-bidi": { + "version": "5.1.0", + "resolved": "https://registry.npmjs.org/chromium-bidi/-/chromium-bidi-5.1.0.tgz", + "integrity": "sha512-9MSRhWRVoRPDG0TgzkHrshFSJJNZzfY5UFqUMuksg7zL1yoZIZ3jLB0YAgHclbiAxPI86pBnwDX1tbzoiV8aFw==", + "dependencies": { + "mitt": "^3.0.1", + "zod": "^3.24.1" + }, + "peerDependencies": { + "devtools-protocol": "*" + } + }, "node_modules/ci-info": { "version": "3.9.0", "resolved": "https://registry.npmjs.org/ci-info/-/ci-info-3.9.0.tgz", @@ -5697,7 +5974,6 @@ "version": "8.0.1", "resolved": "https://registry.npmjs.org/cliui/-/cliui-8.0.1.tgz", "integrity": "sha512-BSeNnyus75C4//NQ9gQt1/csTXyo/8Sb+afLAkzAptFuMsod9HFokGNudZpi/oQV73hnVK+sR+5PVRMd+Dr7YQ==", - "dev": true, "license": "ISC", "dependencies": { "string-width": "^4.2.0", @@ -5712,7 +5988,6 @@ "version": "5.0.1", "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz", "integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==", - "dev": true, "license": "MIT", "engines": { "node": ">=8" @@ -5722,7 +5997,6 @@ "version": "6.0.1", "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz", "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==", - "dev": true, "license": "MIT", "dependencies": { "ansi-regex": "^5.0.1" @@ -5735,7 +6009,6 @@ "version": "7.0.0", "resolved": "https://registry.npmjs.org/wrap-ansi/-/wrap-ansi-7.0.0.tgz", "integrity": "sha512-YVGIj2kamLSTxw6NsZjoBxfSwsn0ycdesmc4p+Q21c5zPuZ1pl+NfxVdxPtdHvmNVOQ6XSYG4AUtyt/Fi7D16Q==", - "dev": true, "license": "MIT", "dependencies": { "ansi-styles": "^4.0.0", @@ -5759,6 +6032,32 @@ "node": ">=0.8" } }, + "node_modules/clone-deep": { + "version": "0.2.4", + "resolved": "https://registry.npmjs.org/clone-deep/-/clone-deep-0.2.4.tgz", + "integrity": "sha512-we+NuQo2DHhSl+DP6jlUiAhyAjBQrYnpOk15rN6c6JSPScjiCLh8IbSU+VTcph6YS3o7mASE8a0+gbZ7ChLpgg==", + "dependencies": { + 
"for-own": "^0.1.3", + "is-plain-object": "^2.0.1", + "kind-of": "^3.0.2", + "lazy-cache": "^1.0.3", + "shallow-clone": "^0.1.2" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/clone-deep/node_modules/kind-of": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", + "integrity": "sha512-NOW9QQXMoZGg/oqnVNoNTTIFEIid1627WCffUBJEdMxYApq7mNE7CpzucIPc+ZQg25Phej7IJSmX3hO+oblOtQ==", + "dependencies": { + "is-buffer": "^1.1.5" + }, + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/co": { "version": "4.6.0", "resolved": "https://registry.npmjs.org/co/-/co-4.6.0.tgz", @@ -5781,7 +6080,6 @@ "version": "2.0.1", "resolved": "https://registry.npmjs.org/color-convert/-/color-convert-2.0.1.tgz", "integrity": "sha512-RRECPsj7iu/xb5oKYcsFHSppFNnsj/52OVTRKb4zP5onXwVF3zVmmToNcOfGC+CRDpfK/U584fMg38ZHCaElKQ==", - "dev": true, "license": "MIT", "dependencies": { "color-name": "~1.1.4" @@ -5794,7 +6092,6 @@ "version": "1.1.4", "resolved": "https://registry.npmjs.org/color-name/-/color-name-1.1.4.tgz", "integrity": "sha512-dOy+3AuW3a2wNbZHIuMZpTcgjGuLU/uBL/ubcZF9OXbDo8ff4O8yVp5Bf0efS8uEoYo5q4Fx7dY9OgQGXgAsQA==", - "dev": true, "license": "MIT" }, "node_modules/combined-stream": { @@ -5851,7 +6148,6 @@ "version": "0.0.1", "resolved": "https://registry.npmjs.org/concat-map/-/concat-map-0.0.1.tgz", "integrity": "sha512-/Srv4dswyQNBfohGpz9o6Yb3Gz3SrUDqBH5rTuhGR7ahtlbYKnVxw2bCFMRljaA7EXHaXZ8wsHdodFvbkhKmqg==", - "dev": true, "license": "MIT" }, "node_modules/concat-stream": { @@ -6040,6 +6336,14 @@ "node": ">= 8" } }, + "node_modules/data-uri-to-buffer": { + "version": "6.0.2", + "resolved": "https://registry.npmjs.org/data-uri-to-buffer/-/data-uri-to-buffer-6.0.2.tgz", + "integrity": "sha512-7hvf7/GW8e86rW0ptuwS3OcBGDjIi6SZva7hCyWC0yYry2cOPmLIjXAUHI6DK2HsnwJd9ifmt57i8eV2n4YNpw==", + "engines": { + "node": ">= 14" + } + }, "node_modules/debug": { "version": "4.4.1", "resolved": 
"https://registry.npmjs.org/debug/-/debug-4.4.1.tgz", @@ -6112,7 +6416,6 @@ "version": "4.3.1", "resolved": "https://registry.npmjs.org/deepmerge/-/deepmerge-4.3.1.tgz", "integrity": "sha512-3sUqbMEc77XqpdNO7FRyRog+eW3ph+GYCbj+rK+uYyRMuwsVy0rMiVtPn+QJlKFvWP/1PYpapqYn0Me2knFn+A==", - "dev": true, "license": "MIT", "engines": { "node": ">=0.10.0" @@ -6141,6 +6444,19 @@ "node": ">=10" } }, + "node_modules/degenerator": { + "version": "5.0.1", + "resolved": "https://registry.npmjs.org/degenerator/-/degenerator-5.0.1.tgz", + "integrity": "sha512-TllpMR/t0M5sqCXfj85i4XaAzxmS5tVA16dqvdkMwGmzI+dXLXnw3J+3Vdv7VKw+ThlTMboK6i9rnZ6Nntj5CQ==", + "dependencies": { + "ast-types": "^0.13.4", + "escodegen": "^2.1.0", + "esprima": "^4.0.1" + }, + "engines": { + "node": ">= 14" + } + }, "node_modules/delayed-stream": { "version": "1.0.0", "resolved": "https://registry.npmjs.org/delayed-stream/-/delayed-stream-1.0.0.tgz", @@ -6170,6 +6486,11 @@ "node": ">=8" } }, + "node_modules/devtools-protocol": { + "version": "0.0.1464554", + "resolved": "https://registry.npmjs.org/devtools-protocol/-/devtools-protocol-0.0.1464554.tgz", + "integrity": "sha512-CAoP3lYfwAGQTaAXYvA6JZR0fjGUb7qec1qf4mToyoH2TZgUFeIqYcjh6f9jNuhHfuZiEdH+PONHYrLhRQX6aw==" + }, "node_modules/dezalgo": { "version": "1.0.4", "resolved": "https://registry.npmjs.org/dezalgo/-/dezalgo-1.0.4.tgz", @@ -6327,7 +6648,6 @@ "version": "8.0.0", "resolved": "https://registry.npmjs.org/emoji-regex/-/emoji-regex-8.0.0.tgz", "integrity": "sha512-MSjYzcWNOA0ewAHpz0MxpYFvwg6yjy1NG3xteoqz644VCo/RPgnr1/GGt+ic3iJTzQ8Eu3TdM14SawnVUmGE6A==", - "dev": true, "license": "MIT" }, "node_modules/encodeurl": { @@ -6339,6 +6659,14 @@ "node": ">= 0.8" } }, + "node_modules/end-of-stream": { + "version": "1.4.5", + "resolved": "https://registry.npmjs.org/end-of-stream/-/end-of-stream-1.4.5.tgz", + "integrity": "sha512-ooEGc6HP26xXq/N+GCGOT0JKCLDGrq2bQUZrQ7gyrJiZANJ/8YDTxTpQBXGMn+WbIQXNVpyWymm7KYVICQnyOg==", + "dependencies": { + "once": "^1.4.0" + } + }, 
"node_modules/enhanced-resolve": { "version": "5.18.2", "resolved": "https://registry.npmjs.org/enhanced-resolve/-/enhanced-resolve-5.18.2.tgz", @@ -6353,11 +6681,19 @@ "node": ">=10.13.0" } }, + "node_modules/env-paths": { + "version": "2.2.1", + "resolved": "https://registry.npmjs.org/env-paths/-/env-paths-2.2.1.tgz", + "integrity": "sha512-+h1lkLKhZMTYjog1VEpJNG7NZJWcuc2DDk/qsqSTRRCOXiLjeQ1d1/udrUGhqMxUgAlwKNZ0cf2uqan5GLuS2A==", + "peer": true, + "engines": { + "node": ">=6" + } + }, "node_modules/error-ex": { "version": "1.3.2", "resolved": "https://registry.npmjs.org/error-ex/-/error-ex-1.3.2.tgz", "integrity": "sha512-7dFHNmqeFSEt2ZBsCriorKnn3Z2pj+fd9kmI6QoWw4//DL+icEBfc0U7qJCisqrTsKTjw4fNFy2pW9OqStD84g==", - "dev": true, "license": "MIT", "dependencies": { "is-arrayish": "^0.2.1" @@ -6420,7 +6756,6 @@ "version": "3.2.0", "resolved": "https://registry.npmjs.org/escalade/-/escalade-3.2.0.tgz", "integrity": "sha512-WUj2qlxaQtO4g6Pq5c29GTcWGDyd8itL8zTlipgECz3JesAiiOKotd8JU6otB3PACgG6xkJUyVhboMS+bje/jA==", - "dev": true, "license": "MIT", "engines": { "node": ">=6" @@ -6445,6 +6780,35 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/escodegen": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/escodegen/-/escodegen-2.1.0.tgz", + "integrity": "sha512-2NlIDTwUWJN0mRPQOdtQBzbUHvdGY2P1VXSyU83Q3xKxM7WHX2Ql8dKq782Q9TgQUNOLEzEYu9bzLNj1q88I5w==", + "dependencies": { + "esprima": "^4.0.1", + "estraverse": "^5.2.0", + "esutils": "^2.0.2" + }, + "bin": { + "escodegen": "bin/escodegen.js", + "esgenerate": "bin/esgenerate.js" + }, + "engines": { + "node": ">=6.0" + }, + "optionalDependencies": { + "source-map": "~0.6.1" + } + }, + "node_modules/escodegen/node_modules/source-map": { + "version": "0.6.1", + "resolved": "https://registry.npmjs.org/source-map/-/source-map-0.6.1.tgz", + "integrity": "sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g==", + "optional": true, + "engines": { + "node": 
">=0.10.0" + } + }, "node_modules/eslint": { "version": "9.29.0", "resolved": "https://registry.npmjs.org/eslint/-/eslint-9.29.0.tgz", @@ -6605,7 +6969,6 @@ "version": "4.0.1", "resolved": "https://registry.npmjs.org/esprima/-/esprima-4.0.1.tgz", "integrity": "sha512-eGuFFw7Upda+g4p+QHvnW0RyTX/SVeJBDM/gCtMARO0cLuT2HcEKnTPvhjV6aGeqrCB/sbNop0Kszm0jsaWU4A==", - "dev": true, "license": "BSD-2-Clause", "bin": { "esparse": "bin/esparse.js", @@ -6645,7 +7008,6 @@ "version": "5.3.0", "resolved": "https://registry.npmjs.org/estraverse/-/estraverse-5.3.0.tgz", "integrity": "sha512-MMdARuVEQziNTeJD8DgMqmhwR11BRQ/cBP+pLtYdSTnf3MIO8fFeiINEbX36ZdNlfU/7A9f3gUw49B3oQsvwBA==", - "dev": true, "license": "BSD-2-Clause", "engines": { "node": ">=4.0" @@ -6655,7 +7017,6 @@ "version": "2.0.3", "resolved": "https://registry.npmjs.org/esutils/-/esutils-2.0.3.tgz", "integrity": "sha512-kVscqXk4OCp68SZ0dkgEKVi6/8ij300KBWTJq32P/dYeWTSwK41WyTxalN1eRmA5Z9UU/LX9D7FWSmV9SAYx6g==", - "dev": true, "license": "BSD-2-Clause", "engines": { "node": ">=0.10.0" @@ -6846,6 +7207,48 @@ "node": ">=0.10.0" } }, + "node_modules/extract-zip": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/extract-zip/-/extract-zip-2.0.1.tgz", + "integrity": "sha512-GDhU9ntwuKyGXdZBUgTIe+vXnWj0fppUEtMDL0+idd5Sta8TGpHssn/eusA9mrPr9qNDym6SxAYZjNvCn/9RBg==", + "dependencies": { + "debug": "^4.1.1", + "get-stream": "^5.1.0", + "yauzl": "^2.10.0" + }, + "bin": { + "extract-zip": "cli.js" + }, + "engines": { + "node": ">= 10.17.0" + }, + "optionalDependencies": { + "@types/yauzl": "^2.9.1" + } + }, + "node_modules/extract-zip/node_modules/get-stream": { + "version": "5.2.0", + "resolved": "https://registry.npmjs.org/get-stream/-/get-stream-5.2.0.tgz", + "integrity": "sha512-nBF+F1rAZVCu/p7rjzgA+Yb4lfYXrpl7a6VmJrU8wF9I1CKvP/QwPNZHnOlwbTkY6dvtFIzFMSyQXbLoTQPRpA==", + "dependencies": { + "pump": "^3.0.0" + }, + "engines": { + "node": ">=8" + }, + "funding": { + "url": "https://github.com/sponsors/sindresorhus" + } + 
}, + "node_modules/extract-zip/node_modules/yauzl": { + "version": "2.10.0", + "resolved": "https://registry.npmjs.org/yauzl/-/yauzl-2.10.0.tgz", + "integrity": "sha512-p4a9I6X6nu6IhoGmBqAcbJy1mlC4j27vEPZX9F4L4/vZT3Lyq1VkFHw/V/PUcB9Buo+DG3iHkT0x3Qya58zc3g==", + "dependencies": { + "buffer-crc32": "~0.2.3", + "fd-slicer": "~1.1.0" + } + }, "node_modules/fast-deep-equal": { "version": "3.1.3", "resolved": "https://registry.npmjs.org/fast-deep-equal/-/fast-deep-equal-3.1.3.tgz", @@ -6864,7 +7267,6 @@ "version": "1.3.2", "resolved": "https://registry.npmjs.org/fast-fifo/-/fast-fifo-1.3.2.tgz", "integrity": "sha512-/d9sfos4yxzpwkDkuN7k2SqFKtYNmCTzgfEpz82x34IM9/zc8KGxQoXg1liNC/izpRM/MBdt44Nmx41ZWqk+FQ==", - "dev": true, "license": "MIT" }, "node_modules/fast-glob": { @@ -6954,6 +7356,14 @@ "bser": "2.1.1" } }, + "node_modules/fd-slicer": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/fd-slicer/-/fd-slicer-1.1.0.tgz", + "integrity": "sha512-cE1qsB/VwyQozZ+q1dGxR8LBYNZeofhEdUNGSMbQD3Gw2lAzX9Zb3uIU6Ebc/Fmyjo9AWWfnn0AUCHqtevs/8g==", + "dependencies": { + "pend": "~1.2.0" + } + }, "node_modules/fflate": { "version": "0.8.2", "resolved": "https://registry.npmjs.org/fflate/-/fflate-0.8.2.tgz", @@ -7137,6 +7547,25 @@ "dev": true, "license": "ISC" }, + "node_modules/for-in": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/for-in/-/for-in-1.0.2.tgz", + "integrity": "sha512-7EwmXrOjyL+ChxMhmG5lnW9MPt1aIeZEwKhQzoBUdTV0N3zuwWDZYVJatDvZ2OyzPUvdIAZDsCetk3coyMfcnQ==", + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/for-own": { + "version": "0.1.5", + "resolved": "https://registry.npmjs.org/for-own/-/for-own-0.1.5.tgz", + "integrity": "sha512-SKmowqGTJoPzLO1T0BBJpkfp3EMacCMOuH40hOUbrbzElVktk4DioXVM99QkLCyKoiuOmyjgcWMpVz2xjE7LZw==", + "dependencies": { + "for-in": "^1.0.1" + }, + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/foreground-child": { "version": "3.3.1", "resolved": 
"https://registry.npmjs.org/foreground-child/-/foreground-child-3.3.1.tgz", @@ -7272,7 +7701,6 @@ "version": "10.1.0", "resolved": "https://registry.npmjs.org/fs-extra/-/fs-extra-10.1.0.tgz", "integrity": "sha512-oRXApq54ETRj4eMiFzGnHWGy+zo5raudjuxN0b8H7s/RU2oW0Wvsx9O0ACRN/kRq9E8Vu/ReskGB5o3ji+FzHQ==", - "dev": true, "license": "MIT", "dependencies": { "graceful-fs": "^4.2.0", @@ -7294,7 +7722,6 @@ "version": "1.0.0", "resolved": "https://registry.npmjs.org/fs.realpath/-/fs.realpath-1.0.0.tgz", "integrity": "sha512-OO0pH2lK6a0hZnAdau5ItzHPI6pUlvI7jMVnxUQRtw4owF2wk8lOSabtGDCTP4Ggrg2MbGnWO9X8K1t4+fGMDw==", - "dev": true, "license": "ISC" }, "node_modules/fsevents": { @@ -7335,7 +7762,6 @@ "version": "2.0.5", "resolved": "https://registry.npmjs.org/get-caller-file/-/get-caller-file-2.0.5.tgz", "integrity": "sha512-DyFP3BM/3YHTQOCUL/w0OZHR0lpKeGrxotcHWcqNEdnltqFwXVfhEBQ94eIo34AfQpo0rGki4cyIiftY06h2Fg==", - "dev": true, "license": "ISC", "engines": { "node": "6.* || 8.* || >= 10.*" @@ -7401,6 +7827,19 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/get-uri": { + "version": "6.0.4", + "resolved": "https://registry.npmjs.org/get-uri/-/get-uri-6.0.4.tgz", + "integrity": "sha512-E1b1lFFLvLgak2whF2xDBcOy6NLVGZBqqjJjsIhvopKfWWEi64pLVTWWehV8KlLerZkfNTA95sTe2OdJKm1OzQ==", + "dependencies": { + "basic-ftp": "^5.0.2", + "data-uri-to-buffer": "^6.0.2", + "debug": "^4.3.4" + }, + "engines": { + "node": ">= 14" + } + }, "node_modules/glob": { "version": "11.0.1", "resolved": "https://registry.npmjs.org/glob/-/glob-11.0.1.tgz", @@ -7516,7 +7955,6 @@ "version": "4.2.11", "resolved": "https://registry.npmjs.org/graceful-fs/-/graceful-fs-4.2.11.tgz", "integrity": "sha512-RbJ5/jmFcNNCcDV5o9eTnBLJ/HszWV0P73bc+Ff4nS/rJj+YaS6IGyiOL0VoBYX+l1Wrl3k63h/KrH+nhJ0XvQ==", - "dev": true, "license": "ISC" }, "node_modules/graphemer": { @@ -7633,6 +8071,18 @@ "node": ">= 0.8" } }, + "node_modules/http-proxy-agent": { + "version": "7.0.2", + "resolved": 
"https://registry.npmjs.org/http-proxy-agent/-/http-proxy-agent-7.0.2.tgz", + "integrity": "sha512-T1gkAiYYDWYx3V5Bmyu7HcfcvL7mUrTWiM6yOfa3PIphViJ/gFPbvidQ+veqSOHci/PxBcDabeUNCzpOODJZig==", + "dependencies": { + "agent-base": "^7.1.0", + "debug": "^4.3.4" + }, + "engines": { + "node": ">= 14" + } + }, "node_modules/http2-wrapper": { "version": "2.2.1", "resolved": "https://registry.npmjs.org/http2-wrapper/-/http2-wrapper-2.2.1.tgz", @@ -7647,9 +8097,21 @@ "node": ">=10.19.0" } }, - "node_modules/human-signals": { - "version": "2.1.0", - "resolved": "https://registry.npmjs.org/human-signals/-/human-signals-2.1.0.tgz", + "node_modules/https-proxy-agent": { + "version": "7.0.6", + "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-7.0.6.tgz", + "integrity": "sha512-vK9P5/iUfdl95AI+JVyUuIcVtd4ofvtrOr3HNtM2yxC9bnMbEdp3x01OhQNnjb8IJYi38VlTE3mBXwcfvywuSw==", + "dependencies": { + "agent-base": "^7.1.2", + "debug": "4" + }, + "engines": { + "node": ">= 14" + } + }, + "node_modules/human-signals": { + "version": "2.1.0", + "resolved": "https://registry.npmjs.org/human-signals/-/human-signals-2.1.0.tgz", "integrity": "sha512-B4FFZ6q/T2jhhksgkbEW3HBvWIfDW85snkQgawt07S7J5QXTk6BkNV+0yAeZrM5QpMAdYlocGoljn0sJ/WQkFw==", "dev": true, "license": "Apache-2.0", @@ -7703,7 +8165,6 @@ "version": "3.3.1", "resolved": "https://registry.npmjs.org/import-fresh/-/import-fresh-3.3.1.tgz", "integrity": "sha512-TR3KfrTZTYLPB6jUjfx6MF9WcWrHL9su5TObK4ZkYgBdWKPOFoSoQIdEuTuR82pmtxH2spWG9h6etwfr1pLBqQ==", - "dev": true, "license": "MIT", "dependencies": { "parent-module": "^1.0.0", @@ -7751,7 +8212,6 @@ "resolved": "https://registry.npmjs.org/inflight/-/inflight-1.0.6.tgz", "integrity": "sha512-k92I/b08q4wvFscXCLvqfsHCrjrF7yiXsQuIVvVE7N82W3+aqpzuUdBbfhWcy/FZR3/4IgflMgKLOsvPDrGCJA==", "deprecated": "This module is not supported, and leaks memory. Do not use it. 
Check out lru-cache if you want a good and tested way to coalesce async requests by a key value, which is much more comprehensive and powerful.", - "dev": true, "license": "ISC", "dependencies": { "once": "^1.3.0", @@ -7774,6 +8234,23 @@ "kind-of": "^6.0.2" } }, + "node_modules/ip-address": { + "version": "9.0.5", + "resolved": "https://registry.npmjs.org/ip-address/-/ip-address-9.0.5.tgz", + "integrity": "sha512-zHtQzGojZXTwZTHQqra+ETKd4Sn3vgi7uBmlPoXVWZqYvuKmtI0l/VZTjqGmJY9x88GGOaZ9+G9ES8hC4T4X8g==", + "dependencies": { + "jsbn": "1.1.0", + "sprintf-js": "^1.1.3" + }, + "engines": { + "node": ">= 12" + } + }, + "node_modules/ip-address/node_modules/sprintf-js": { + "version": "1.1.3", + "resolved": "https://registry.npmjs.org/sprintf-js/-/sprintf-js-1.1.3.tgz", + "integrity": "sha512-Oo+0REFV59/rz3gfJNKQiBlwfHaSESl1pcGyABQsnnIfWOFt6JNj5gCog2U6MLZ//IGYD+nA8nI+mTShREReaA==" + }, "node_modules/ipaddr.js": { "version": "1.9.1", "resolved": "https://registry.npmjs.org/ipaddr.js/-/ipaddr.js-1.9.1.tgz", @@ -7787,9 +8264,13 @@ "version": "0.2.1", "resolved": "https://registry.npmjs.org/is-arrayish/-/is-arrayish-0.2.1.tgz", "integrity": "sha512-zz06S8t0ozoDXMG+ube26zeCTNXcKIPJZJi8hBrF4idCLms4CG9QtK7qBl1boi5ODzFpjswb5JPmHCbMpjaYzg==", - "dev": true, "license": "MIT" }, + "node_modules/is-buffer": { + "version": "1.1.6", + "resolved": "https://registry.npmjs.org/is-buffer/-/is-buffer-1.1.6.tgz", + "integrity": "sha512-NcdALwpXkTm5Zvvbk7owOUSvVvBKDgKP5/ewfXEznmQFfs4ZRmanOeKBTjRVjka3QFoN6XJ+9F3USqfHqTaU5w==" + }, "node_modules/is-core-module": { "version": "2.16.1", "resolved": "https://registry.npmjs.org/is-core-module/-/is-core-module-2.16.1.tgz", @@ -7806,6 +8287,14 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/is-extendable": { + "version": "0.1.1", + "resolved": "https://registry.npmjs.org/is-extendable/-/is-extendable-0.1.1.tgz", + "integrity": "sha512-5BMULNob1vgFX6EjQw5izWDxrecWK9AM72rugNr0TFldMOi0fj6Jk+zeKIt0xGj4cEfQIJth4w3OKWOJ4f+AFw==", + 
"engines": { + "node": ">=0.10.0" + } + }, "node_modules/is-extglob": { "version": "2.1.1", "resolved": "https://registry.npmjs.org/is-extglob/-/is-extglob-2.1.1.tgz", @@ -7820,7 +8309,6 @@ "version": "3.0.0", "resolved": "https://registry.npmjs.org/is-fullwidth-code-point/-/is-fullwidth-code-point-3.0.0.tgz", "integrity": "sha512-zymm5+u+sCsSWyD9qNaejV3DFvhCKclKdizYaJUuHA83RLjb7nSuGnddCHGv0hk+KY7BMAlsWeK4Ueg6EV6XQg==", - "dev": true, "license": "MIT", "engines": { "node": ">=8" @@ -7879,6 +8367,17 @@ "node": ">=0.10.0" } }, + "node_modules/is-plain-object": { + "version": "2.0.4", + "resolved": "https://registry.npmjs.org/is-plain-object/-/is-plain-object-2.0.4.tgz", + "integrity": "sha512-h5PpgXkWitc38BBMYawTYMWJHFZJVnBquFE57xFpjB8pJFiF6gZ+bU+WyI/yqXiFR5mdLsgYNaPe8uao6Uv9Og==", + "dependencies": { + "isobject": "^3.0.1" + }, + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/is-promise": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/is-promise/-/is-promise-4.0.0.tgz", @@ -7918,6 +8417,14 @@ "dev": true, "license": "ISC" }, + "node_modules/isobject": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/isobject/-/isobject-3.0.1.tgz", + "integrity": "sha512-WhB9zCku7EGTj/HQQRz5aUQEUeoQZH2bWcltRErOpymJ4boYE6wL9Tbr23krRPSZ+C5zqNSrSw+Cc7sZZ4b7vg==", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/istanbul-lib-coverage": { "version": "3.2.2", "resolved": "https://registry.npmjs.org/istanbul-lib-coverage/-/istanbul-lib-coverage-3.2.2.tgz", @@ -8718,7 +9225,6 @@ "version": "4.0.0", "resolved": "https://registry.npmjs.org/js-tokens/-/js-tokens-4.0.0.tgz", "integrity": "sha512-RdJUflcE3cUzKiMqQgsCu06FPu9UdIJO0beYbPhHN4k6apgJtifcoCtT9bcxOpYBtpD2kCM6Sbzg4CausW/PKQ==", - "dev": true, "license": "MIT" }, "node_modules/js-yaml": { @@ -8733,6 +9239,11 @@ "js-yaml": "bin/js-yaml.js" } }, + "node_modules/jsbn": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/jsbn/-/jsbn-1.1.0.tgz", + "integrity": 
"sha512-4bYVV3aAMtDTTu4+xsDYa6sy9GyJ69/amsu9sYF2zqjiEoZA5xJi3BrfX3uY+/IekIu7MwdObdbDWpoZdBv3/A==" + }, "node_modules/jsesc": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/jsesc/-/jsesc-3.1.0.tgz", @@ -8757,7 +9268,6 @@ "version": "2.3.1", "resolved": "https://registry.npmjs.org/json-parse-even-better-errors/-/json-parse-even-better-errors-2.3.1.tgz", "integrity": "sha512-xyFwyhro/JEof6Ghe2iz2NcXoj2sloNsWr/XsERDK/oiPCfaNhl5ONfp+jQdAZRQQ0IJWNzH9zIZF7li91kh2w==", - "dev": true, "license": "MIT" }, "node_modules/json-schema-traverse": { @@ -8798,7 +9308,6 @@ "version": "6.1.0", "resolved": "https://registry.npmjs.org/jsonfile/-/jsonfile-6.1.0.tgz", "integrity": "sha512-5dgndWOriYSm5cnYaJNhalLNDKOqFwyDB/rr1E9ZsGciGvKPs8R2xYGCacuf3z6K1YKDz182fd+fY3cn3pMqXQ==", - "dev": true, "license": "MIT", "dependencies": { "universalify": "^2.0.0" @@ -8877,6 +9386,14 @@ "node": ">=6" } }, + "node_modules/lazy-cache": { + "version": "1.0.4", + "resolved": "https://registry.npmjs.org/lazy-cache/-/lazy-cache-1.0.4.tgz", + "integrity": "sha512-RE2g0b5VGZsOCFOCgP7omTRYFqydmZkBwl5oNnQ1lDYC57uyO9KqNnNVxT7COSHTxrRCWVcAVOcbjk+tvh/rgQ==", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/leven": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/leven/-/leven-3.1.0.tgz", @@ -8910,7 +9427,6 @@ "version": "1.2.4", "resolved": "https://registry.npmjs.org/lines-and-columns/-/lines-and-columns-1.2.4.tgz", "integrity": "sha512-7ylylesZQ/PV29jhEDl3Ufjo6ZX7gCqJr5F7PKrqc93v7fzSymt1BpwEU8nAUXs8qzzvqhbjhK5QZg6Mt/HkBg==", - "dev": true, "license": "MIT" }, "node_modules/load-esm": { @@ -9127,6 +9643,30 @@ "node": ">= 4.0.0" } }, + "node_modules/merge-deep": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/merge-deep/-/merge-deep-3.0.3.tgz", + "integrity": "sha512-qtmzAS6t6grwEkNrunqTBdn0qKwFgNWvlxUbAV8es9M7Ot1EbyApytCnvE0jALPa46ZpKDUo527kKiaWplmlFA==", + "dependencies": { + "arr-union": "^3.1.0", + "clone-deep": "^0.2.4", + "kind-of": "^3.0.2" + }, + 
"engines": { + "node": ">=0.10.0" + } + }, + "node_modules/merge-deep/node_modules/kind-of": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-3.2.2.tgz", + "integrity": "sha512-NOW9QQXMoZGg/oqnVNoNTTIFEIid1627WCffUBJEdMxYApq7mNE7CpzucIPc+ZQg25Phej7IJSmX3hO+oblOtQ==", + "dependencies": { + "is-buffer": "^1.1.5" + }, + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/merge-descriptors": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/merge-descriptors/-/merge-descriptors-2.0.0.tgz", @@ -9254,7 +9794,6 @@ "version": "3.1.2", "resolved": "https://registry.npmjs.org/minimatch/-/minimatch-3.1.2.tgz", "integrity": "sha512-J7p63hRiAjw1NDEww1W7i37+ByIrOWO5XQQAzZ3VOcL0PNybwpfmV/N05zFAzwQ9USyEcX6t3UO+K5aqBQOIHw==", - "dev": true, "license": "ISC", "dependencies": { "brace-expansion": "^1.1.7" @@ -9282,6 +9821,31 @@ "node": ">=16 || 14 >=14.17" } }, + "node_modules/mitt": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/mitt/-/mitt-3.0.1.tgz", + "integrity": "sha512-vKivATfr97l2/QBCYAkXYDbrIWPM2IIKEl7YPhjCvKlG3kE2gm+uBo6nEXK3M5/Ffh/FLpKExzOQ3JJoJGFKBw==" + }, + "node_modules/mixin-object": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/mixin-object/-/mixin-object-2.0.1.tgz", + "integrity": "sha512-ALGF1Jt9ouehcaXaHhn6t1yGWRqGaHkPFndtFVHfZXOvkIZ/yoGaSi0AHVTafb3ZBGg4dr/bDwnaEKqCXzchMA==", + "dependencies": { + "for-in": "^0.1.3", + "is-extendable": "^0.1.1" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/mixin-object/node_modules/for-in": { + "version": "0.1.8", + "resolved": "https://registry.npmjs.org/for-in/-/for-in-0.1.8.tgz", + "integrity": "sha512-F0to7vbBSHP8E3l6dCjxNOLuSFAACIxFy3UehTUlG7svlXi37HHsDkyVcHo0Pq8QwrE+pXvWSVX3ZT1T9wAZ9g==", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/mkdirp": { "version": "0.5.6", "resolved": "https://registry.npmjs.org/mkdirp/-/mkdirp-0.5.6.tgz", @@ -9394,6 +9958,14 @@ "dev": true, "license": "MIT" }, + 
"node_modules/netmask": { + "version": "2.0.2", + "resolved": "https://registry.npmjs.org/netmask/-/netmask-2.0.2.tgz", + "integrity": "sha512-dBpDMdxv9Irdq66304OLfEmQ9tbNRFnFTuZiLo+bD+r332bBmMJ8GBLXklIXXgxd3+v9+KUnZaUR5PJMa75Gsg==", + "engines": { + "node": ">= 0.4.0" + } + }, "node_modules/node-abort-controller": { "version": "3.1.1", "resolved": "https://registry.npmjs.org/node-abort-controller/-/node-abort-controller-3.1.1.tgz", @@ -9411,6 +9983,25 @@ "lodash": "^4.17.21" } }, + "node_modules/node-fetch": { + "version": "2.7.0", + "resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-2.7.0.tgz", + "integrity": "sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==", + "dependencies": { + "whatwg-url": "^5.0.0" + }, + "engines": { + "node": "4.x || >=6.0.0" + }, + "peerDependencies": { + "encoding": "^0.1.0" + }, + "peerDependenciesMeta": { + "encoding": { + "optional": true + } + } + }, "node_modules/node-int64": { "version": "0.4.0", "resolved": "https://registry.npmjs.org/node-int64/-/node-int64-0.4.0.tgz", @@ -9652,6 +10243,36 @@ "node": ">=6" } }, + "node_modules/pac-proxy-agent": { + "version": "7.2.0", + "resolved": "https://registry.npmjs.org/pac-proxy-agent/-/pac-proxy-agent-7.2.0.tgz", + "integrity": "sha512-TEB8ESquiLMc0lV8vcd5Ql/JAKAoyzHFXaStwjkzpOpC5Yv+pIzLfHvjTSdf3vpa2bMiUQrg9i6276yn8666aA==", + "dependencies": { + "@tootallnate/quickjs-emscripten": "^0.23.0", + "agent-base": "^7.1.2", + "debug": "^4.3.4", + "get-uri": "^6.0.1", + "http-proxy-agent": "^7.0.0", + "https-proxy-agent": "^7.0.6", + "pac-resolver": "^7.0.1", + "socks-proxy-agent": "^8.0.5" + }, + "engines": { + "node": ">= 14" + } + }, + "node_modules/pac-resolver": { + "version": "7.0.1", + "resolved": "https://registry.npmjs.org/pac-resolver/-/pac-resolver-7.0.1.tgz", + "integrity": "sha512-5NPgf87AT2STgwa2ntRMr45jTKrYBGkVU36yT0ig/n/GMAa3oPqhZfIQ2kMEimReg0+t9kZViDVZ83qfVUlckg==", + "dependencies": { + "degenerator": "^5.0.0", + 
"netmask": "^2.0.2" + }, + "engines": { + "node": ">= 14" + } + }, "node_modules/package-json-from-dist": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/package-json-from-dist/-/package-json-from-dist-1.0.1.tgz", @@ -9663,7 +10284,6 @@ "version": "1.0.1", "resolved": "https://registry.npmjs.org/parent-module/-/parent-module-1.0.1.tgz", "integrity": "sha512-GQ2EWRpQV8/o+Aw8YqtfZZPfNRWZYkbidE9k5rpl/hC3vtHHBfGm2Ifi6qWV+coDGkrUKZAxE3Lot5kcsRlh+g==", - "dev": true, "license": "MIT", "dependencies": { "callsites": "^3.0.0" @@ -9676,7 +10296,6 @@ "version": "5.2.0", "resolved": "https://registry.npmjs.org/parse-json/-/parse-json-5.2.0.tgz", "integrity": "sha512-ayCKvm/phCGxOkYRSCM82iDwct8/EonSEgCSxWxD7ve6jHggsFl4fZVQBPRNgQoKiuV/odhFrGzQXZwbifC8Rg==", - "dev": true, "license": "MIT", "dependencies": { "@babel/code-frame": "^7.0.0", @@ -9772,7 +10391,6 @@ "version": "1.0.1", "resolved": "https://registry.npmjs.org/path-is-absolute/-/path-is-absolute-1.0.1.tgz", "integrity": "sha512-AVbw3UJ2e9bq64vSaS9Am0fje1Pa8pbGqTTsmXfaIiMpnr5DlDhfJOuLj9Sf95ZPVDAUerDfEk88MPmPe7UCQg==", - "dev": true, "license": "MIT", "engines": { "node": ">=0.10.0" @@ -9864,14 +10482,12 @@ "version": "1.2.0", "resolved": "https://registry.npmjs.org/pend/-/pend-1.2.0.tgz", "integrity": "sha512-F3asv42UuXchdzt+xXqfW1OGlVBe+mxa2mqI0pg5yAHZPvFmY3Y6drSf/GQ1A86WgWEN9Kzh/WrgKa6iGcHXLg==", - "dev": true, "license": "MIT" }, "node_modules/picocolors": { "version": "1.1.1", "resolved": "https://registry.npmjs.org/picocolors/-/picocolors-1.1.1.tgz", "integrity": "sha512-xceH2snhtb5M9liqDsmEw56le376mTZkEX/jEb/RxNFyegNul7eNslCXP9FDj/Lcu0X8KEyMceP2ntpaHrDEVA==", - "dev": true, "license": "ISC" }, "node_modules/picomatch": { @@ -10079,6 +10695,14 @@ } } }, + "node_modules/progress": { + "version": "2.0.3", + "resolved": "https://registry.npmjs.org/progress/-/progress-2.0.3.tgz", + "integrity": "sha512-7PiHtLll5LdnKIMw100I+8xJXR5gW2QwWYkT6iJva0bXitZKa/XMrSbdmg3r2Xnaidz9Qumd0VPaMrZlF9V9sA==", + "engines": 
{ + "node": ">=0.4.0" + } + }, "node_modules/prompts": { "version": "2.4.2", "resolved": "https://registry.npmjs.org/prompts/-/prompts-2.4.2.tgz", @@ -10106,6 +10730,46 @@ "node": ">= 0.10" } }, + "node_modules/proxy-agent": { + "version": "6.5.0", + "resolved": "https://registry.npmjs.org/proxy-agent/-/proxy-agent-6.5.0.tgz", + "integrity": "sha512-TmatMXdr2KlRiA2CyDu8GqR8EjahTG3aY3nXjdzFyoZbmB8hrBsTyMezhULIXKnC0jpfjlmiZ3+EaCzoInSu/A==", + "dependencies": { + "agent-base": "^7.1.2", + "debug": "^4.3.4", + "http-proxy-agent": "^7.0.1", + "https-proxy-agent": "^7.0.6", + "lru-cache": "^7.14.1", + "pac-proxy-agent": "^7.1.0", + "proxy-from-env": "^1.1.0", + "socks-proxy-agent": "^8.0.5" + }, + "engines": { + "node": ">= 14" + } + }, + "node_modules/proxy-agent/node_modules/lru-cache": { + "version": "7.18.3", + "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-7.18.3.tgz", + "integrity": "sha512-jumlc0BIUrS3qJGgIkWZsyfAM7NCWiBcCDhnd+3NNM5KbBmLTgHVfWBcg6W+rLUsIpzpERPsvwUP7CckAQSOoA==", + "engines": { + "node": ">=12" + } + }, + "node_modules/proxy-from-env": { + "version": "1.1.0", + "resolved": "https://registry.npmjs.org/proxy-from-env/-/proxy-from-env-1.1.0.tgz", + "integrity": "sha512-D+zkORCbA9f1tdWRK0RaCR3GPv50cMxcrz4X8k5LTSUD1Dkw47mKJEZQNunItRTkWwgtaUSo1RVFRIG9ZXiFYg==" + }, + "node_modules/pump": { + "version": "3.0.3", + "resolved": "https://registry.npmjs.org/pump/-/pump-3.0.3.tgz", + "integrity": "sha512-todwxLMY7/heScKmntwQG8CXVkWUOdYxIvY2s0VWAAMh/nd8SoYiRaKjlr7+iCs984f2P8zvrfWcDDYVb73NfA==", + "dependencies": { + "end-of-stream": "^1.1.0", + "once": "^1.3.1" + } + }, "node_modules/punycode": { "version": "2.3.1", "resolved": "https://registry.npmjs.org/punycode/-/punycode-2.3.1.tgz", @@ -10116,6 +10780,230 @@ "node": ">=6" } }, + "node_modules/puppeteer": { + "version": "24.11.2", + "resolved": "https://registry.npmjs.org/puppeteer/-/puppeteer-24.11.2.tgz", + "integrity": 
"sha512-HopdRZWHa5zk0HSwd8hU+GlahQ3fmesTAqMIDHVY9HasCvppcYuHYXyjml0nlm+nbwVCqAQWV+dSmiNCrZGTGQ==", + "hasInstallScript": true, + "peer": true, + "dependencies": { + "@puppeteer/browsers": "2.10.5", + "chromium-bidi": "5.1.0", + "cosmiconfig": "^9.0.0", + "devtools-protocol": "0.0.1464554", + "puppeteer-core": "24.11.2", + "typed-query-selector": "^2.12.0" + }, + "bin": { + "puppeteer": "lib/cjs/puppeteer/node/cli.js" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/puppeteer-core": { + "version": "24.11.2", + "resolved": "https://registry.npmjs.org/puppeteer-core/-/puppeteer-core-24.11.2.tgz", + "integrity": "sha512-c49WifNb8hix+gQH17TldmD6TC/Md2HBaTJLHexIUq4sZvo2pyHY/Pp25qFQjibksBu/SJRYUY7JsoaepNbiRA==", + "dependencies": { + "@puppeteer/browsers": "2.10.5", + "chromium-bidi": "5.1.0", + "debug": "^4.4.1", + "devtools-protocol": "0.0.1464554", + "typed-query-selector": "^2.12.0", + "ws": "^8.18.3" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/puppeteer-extra": { + "version": "3.3.6", + "resolved": "https://registry.npmjs.org/puppeteer-extra/-/puppeteer-extra-3.3.6.tgz", + "integrity": "sha512-rsLBE/6mMxAjlLd06LuGacrukP2bqbzKCLzV1vrhHFavqQE/taQ2UXv3H5P0Ls7nsrASa+6x3bDbXHpqMwq+7A==", + "dependencies": { + "@types/debug": "^4.1.0", + "debug": "^4.1.1", + "deepmerge": "^4.2.2" + }, + "engines": { + "node": ">=8" + }, + "peerDependencies": { + "@types/puppeteer": "*", + "puppeteer": "*", + "puppeteer-core": "*" + }, + "peerDependenciesMeta": { + "@types/puppeteer": { + "optional": true + }, + "puppeteer": { + "optional": true + }, + "puppeteer-core": { + "optional": true + } + } + }, + "node_modules/puppeteer-extra-plugin": { + "version": "3.2.3", + "resolved": "https://registry.npmjs.org/puppeteer-extra-plugin/-/puppeteer-extra-plugin-3.2.3.tgz", + "integrity": "sha512-6RNy0e6pH8vaS3akPIKGg28xcryKscczt4wIl0ePciZENGE2yoaQJNd17UiEbdmh5/6WW6dPcfRWT9lxBwCi2Q==", + "dependencies": { + "@types/debug": "^4.1.0", + "debug": "^4.1.1", + 
"merge-deep": "^3.0.1" + }, + "engines": { + "node": ">=9.11.2" + }, + "peerDependencies": { + "playwright-extra": "*", + "puppeteer-extra": "*" + }, + "peerDependenciesMeta": { + "playwright-extra": { + "optional": true + }, + "puppeteer-extra": { + "optional": true + } + } + }, + "node_modules/puppeteer-extra-plugin-adblocker": { + "version": "2.13.6", + "resolved": "https://registry.npmjs.org/puppeteer-extra-plugin-adblocker/-/puppeteer-extra-plugin-adblocker-2.13.6.tgz", + "integrity": "sha512-AftgnUZ1rg2RPe9RpX6rkYAxEohwp3iFeGIyjsAuTaIiw4VLZqOb1LSY8/S60vAxpeat60fbCajxoUetmLy4Dw==", + "dependencies": { + "@cliqz/adblocker-puppeteer": "1.23.8", + "debug": "^4.1.1", + "node-fetch": "^2.6.0", + "puppeteer-extra-plugin": "^3.2.3" + }, + "engines": { + "node": ">=8" + }, + "peerDependencies": { + "puppeteer": "*", + "puppeteer-core": "*", + "puppeteer-extra": "*" + }, + "peerDependenciesMeta": { + "puppeteer": { + "optional": true + }, + "puppeteer-core": { + "optional": true + }, + "puppeteer-extra": { + "optional": true + } + } + }, + "node_modules/puppeteer-extra-plugin-stealth": { + "version": "2.11.2", + "resolved": "https://registry.npmjs.org/puppeteer-extra-plugin-stealth/-/puppeteer-extra-plugin-stealth-2.11.2.tgz", + "integrity": "sha512-bUemM5XmTj9i2ZerBzsk2AN5is0wHMNE6K0hXBzBXOzP5m5G3Wl0RHhiqKeHToe/uIH8AoZiGhc1tCkLZQPKTQ==", + "dependencies": { + "debug": "^4.1.1", + "puppeteer-extra-plugin": "^3.2.3", + "puppeteer-extra-plugin-user-preferences": "^2.4.1" + }, + "engines": { + "node": ">=8" + }, + "peerDependencies": { + "playwright-extra": "*", + "puppeteer-extra": "*" + }, + "peerDependenciesMeta": { + "playwright-extra": { + "optional": true + }, + "puppeteer-extra": { + "optional": true + } + } + }, + "node_modules/puppeteer-extra-plugin-user-data-dir": { + "version": "2.4.1", + "resolved": "https://registry.npmjs.org/puppeteer-extra-plugin-user-data-dir/-/puppeteer-extra-plugin-user-data-dir-2.4.1.tgz", + "integrity": 
"sha512-kH1GnCcqEDoBXO7epAse4TBPJh9tEpVEK/vkedKfjOVOhZAvLkHGc9swMs5ChrJbRnf8Hdpug6TJlEuimXNQ+g==", + "dependencies": { + "debug": "^4.1.1", + "fs-extra": "^10.0.0", + "puppeteer-extra-plugin": "^3.2.3", + "rimraf": "^3.0.2" + }, + "engines": { + "node": ">=8" + }, + "peerDependencies": { + "playwright-extra": "*", + "puppeteer-extra": "*" + }, + "peerDependenciesMeta": { + "playwright-extra": { + "optional": true + }, + "puppeteer-extra": { + "optional": true + } + } + }, + "node_modules/puppeteer-extra-plugin-user-preferences": { + "version": "2.4.1", + "resolved": "https://registry.npmjs.org/puppeteer-extra-plugin-user-preferences/-/puppeteer-extra-plugin-user-preferences-2.4.1.tgz", + "integrity": "sha512-i1oAZxRbc1bk8MZufKCruCEC3CCafO9RKMkkodZltI4OqibLFXF3tj6HZ4LZ9C5vCXZjYcDWazgtY69mnmrQ9A==", + "dependencies": { + "debug": "^4.1.1", + "deepmerge": "^4.2.2", + "puppeteer-extra-plugin": "^3.2.3", + "puppeteer-extra-plugin-user-data-dir": "^2.4.1" + }, + "engines": { + "node": ">=8" + }, + "peerDependencies": { + "playwright-extra": "*", + "puppeteer-extra": "*" + }, + "peerDependenciesMeta": { + "playwright-extra": { + "optional": true + }, + "puppeteer-extra": { + "optional": true + } + } + }, + "node_modules/puppeteer/node_modules/cosmiconfig": { + "version": "9.0.0", + "resolved": "https://registry.npmjs.org/cosmiconfig/-/cosmiconfig-9.0.0.tgz", + "integrity": "sha512-itvL5h8RETACmOTFc4UfIyB2RfEHi71Ax6E/PivVxq9NseKbOWpeyHEOIbmAw1rs8Ak0VursQNww7lf7YtUwzg==", + "peer": true, + "dependencies": { + "env-paths": "^2.2.1", + "import-fresh": "^3.3.0", + "js-yaml": "^4.1.0", + "parse-json": "^5.2.0" + }, + "engines": { + "node": ">=14" + }, + "funding": { + "url": "https://github.com/sponsors/d-fischer" + }, + "peerDependencies": { + "typescript": ">=4.9.5" + }, + "peerDependenciesMeta": { + "typescript": { + "optional": true + } + } + }, "node_modules/pure-rand": { "version": "6.1.0", "resolved": "https://registry.npmjs.org/pure-rand/-/pure-rand-6.1.0.tgz", @@ 
-10271,7 +11159,6 @@ "version": "2.1.1", "resolved": "https://registry.npmjs.org/require-directory/-/require-directory-2.1.1.tgz", "integrity": "sha512-fGxEI7+wsG9xrvdjsrlmL22OMTTiHRwAMroiEeMgq8gzoLC/PQr7RsRDSTLUg/bZAZtF+TVIkHc6/4RIKrui+Q==", - "dev": true, "license": "MIT", "engines": { "node": ">=0.10.0" @@ -10342,7 +11229,6 @@ "version": "4.0.0", "resolved": "https://registry.npmjs.org/resolve-from/-/resolve-from-4.0.0.tgz", "integrity": "sha512-pb/MYmXstAkysRFx8piNI1tGFNQIFA3vkE3Gq4EuA1dF6gHp/+vgZqsCGJapvy8N3Q+4o7FwvquPJcnZ7RYy4g==", - "dev": true, "license": "MIT", "engines": { "node": ">=4" @@ -10406,6 +11292,41 @@ "node": ">=0.10.0" } }, + "node_modules/rimraf": { + "version": "3.0.2", + "resolved": "https://registry.npmjs.org/rimraf/-/rimraf-3.0.2.tgz", + "integrity": "sha512-JZkJMZkAGFFPP2YqXZXPbMlMBgsxzE8ILs4lMIX/2o0L9UBw9O/Y3o6wFw/i9YLapcUJWwqbi3kdxIPdC62TIA==", + "deprecated": "Rimraf versions prior to v4 are no longer supported", + "dependencies": { + "glob": "^7.1.3" + }, + "bin": { + "rimraf": "bin.js" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, + "node_modules/rimraf/node_modules/glob": { + "version": "7.2.3", + "resolved": "https://registry.npmjs.org/glob/-/glob-7.2.3.tgz", + "integrity": "sha512-nFR0zLpU2YCaRxwoCJvL6UvCH2JFyFVIvwTLsIf21AuHlMskA1hhTdk+LlYJtOlYt9v6dvszD2BGRqBL+iQK9Q==", + "deprecated": "Glob versions prior to v9 are no longer supported", + "dependencies": { + "fs.realpath": "^1.0.0", + "inflight": "^1.0.4", + "inherits": "2", + "minimatch": "^3.1.1", + "once": "^1.3.0", + "path-is-absolute": "^1.0.0" + }, + "engines": { + "node": "*" + }, + "funding": { + "url": "https://github.com/sponsors/isaacs" + } + }, "node_modules/router": { "version": "2.2.0", "resolved": "https://registry.npmjs.org/router/-/router-2.2.0.tgz", @@ -10618,6 +11539,39 @@ "integrity": "sha512-E5LDX7Wrp85Kil5bhZv46j8jOeboKq5JMmYM3gVGdGH8xFpPWXUMsNrlODCrkoxMEeNi/XZIwuRvY4XNwYMJpw==", "license": "ISC" }, + 
"node_modules/shallow-clone": { + "version": "0.1.2", + "resolved": "https://registry.npmjs.org/shallow-clone/-/shallow-clone-0.1.2.tgz", + "integrity": "sha512-J1zdXCky5GmNnuauESROVu31MQSnLoYvlyEn6j2Ztk6Q5EHFIhxkMhYcv6vuDzl2XEzoRr856QwzMgWM/TmZgw==", + "dependencies": { + "is-extendable": "^0.1.1", + "kind-of": "^2.0.1", + "lazy-cache": "^0.2.3", + "mixin-object": "^2.0.1" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/shallow-clone/node_modules/kind-of": { + "version": "2.0.1", + "resolved": "https://registry.npmjs.org/kind-of/-/kind-of-2.0.1.tgz", + "integrity": "sha512-0u8i1NZ/mg0b+W3MGGw5I7+6Eib2nx72S/QvXa0hYjEkjTknYmEYQJwGu3mLC0BrhtJjtQafTkyRUQ75Kx0LVg==", + "dependencies": { + "is-buffer": "^1.0.2" + }, + "engines": { + "node": ">=0.10.0" + } + }, + "node_modules/shallow-clone/node_modules/lazy-cache": { + "version": "0.2.7", + "resolved": "https://registry.npmjs.org/lazy-cache/-/lazy-cache-0.2.7.tgz", + "integrity": "sha512-gkX52wvU/R8DVMMt78ATVPFMJqfW8FPz1GZ1sVHBVQHmu/WvhIWE4cE1GBzhJNFicDeYhnwp6Rl35BcAIM3YOQ==", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/shebang-command": { "version": "2.0.0", "resolved": "https://registry.npmjs.org/shebang-command/-/shebang-command-2.0.0.tgz", @@ -10743,6 +11697,41 @@ "node": ">=8" } }, + "node_modules/smart-buffer": { + "version": "4.2.0", + "resolved": "https://registry.npmjs.org/smart-buffer/-/smart-buffer-4.2.0.tgz", + "integrity": "sha512-94hK0Hh8rPqQl2xXc3HsaBoOXKV20MToPkcXvwbISWLEs+64sBq5kFgn2kJDHb1Pry9yrP0dxrCI9RRci7RXKg==", + "engines": { + "node": ">= 6.0.0", + "npm": ">= 3.0.0" + } + }, + "node_modules/socks": { + "version": "2.8.5", + "resolved": "https://registry.npmjs.org/socks/-/socks-2.8.5.tgz", + "integrity": "sha512-iF+tNDQla22geJdTyJB1wM/qrX9DMRwWrciEPwWLPRWAUEM8sQiyxgckLxWT1f7+9VabJS0jTGGr4QgBuvi6Ww==", + "dependencies": { + "ip-address": "^9.0.5", + "smart-buffer": "^4.2.0" + }, + "engines": { + "node": ">= 10.0.0", + "npm": ">= 3.0.0" + } + }, + 
"node_modules/socks-proxy-agent": { + "version": "8.0.5", + "resolved": "https://registry.npmjs.org/socks-proxy-agent/-/socks-proxy-agent-8.0.5.tgz", + "integrity": "sha512-HehCEsotFqbPW9sJ8WVYB6UbmIMv7kUUORIF2Nncq4VQvBfNBLibW9YZR5dlYCSUhwcD628pRllm7n+E+YTzJw==", + "dependencies": { + "agent-base": "^7.1.2", + "debug": "^4.3.4", + "socks": "^2.8.3" + }, + "engines": { + "node": ">= 14" + } + }, "node_modules/sort-keys": { "version": "1.1.2", "resolved": "https://registry.npmjs.org/sort-keys/-/sort-keys-1.1.2.tgz", @@ -10851,7 +11840,6 @@ "version": "2.22.1", "resolved": "https://registry.npmjs.org/streamx/-/streamx-2.22.1.tgz", "integrity": "sha512-znKXEBxfatz2GBNK02kRnCXjV+AA4kjZIUxeWSr3UGirZMJfTE9uiwKHobnbgxWyL/JWro8tTq+vOqAK1/qbSA==", - "dev": true, "license": "MIT", "dependencies": { "fast-fifo": "^1.3.2", @@ -10911,7 +11899,6 @@ "version": "4.2.3", "resolved": "https://registry.npmjs.org/string-width/-/string-width-4.2.3.tgz", "integrity": "sha512-wKyQRQpjJ0sIp62ErSZdGsjMJWsap5oRNihHhu6G7JVO/9jIB6UyevL+tXuOqrng8j/cxKTWyWUwvSTriiZz/g==", - "dev": true, "license": "MIT", "dependencies": { "emoji-regex": "^8.0.0", @@ -10965,7 +11952,6 @@ "version": "5.0.1", "resolved": "https://registry.npmjs.org/ansi-regex/-/ansi-regex-5.0.1.tgz", "integrity": "sha512-quJQXlTSUGL2LH9SUXo8VwsY4soanhgo6LNSm84E1LBcE8s3O0wpdiRzyR9z/ZZJMlMWv37qOOb9pdJlMUEKFQ==", - "dev": true, "license": "MIT", "engines": { "node": ">=8" @@ -10975,7 +11961,6 @@ "version": "6.0.1", "resolved": "https://registry.npmjs.org/strip-ansi/-/strip-ansi-6.0.1.tgz", "integrity": "sha512-Y38VPSHcqkFrCpFnQ9vuSXmquuv5oXOKpGeT6aGrr3o3Gc9AlVa6JBfUSOCnbxGGZF+/0ooI7KrPuUSztUdU5A==", - "dev": true, "license": "MIT", "dependencies": { "ansi-regex": "^5.0.1" @@ -11203,11 +12188,23 @@ "node": ">=6" } }, + "node_modules/tar-fs": { + "version": "3.1.0", + "resolved": "https://registry.npmjs.org/tar-fs/-/tar-fs-3.1.0.tgz", + "integrity": 
"sha512-5Mty5y/sOF1YWj1J6GiBodjlDc05CUR8PKXrsnFAiSG0xA+GHeWLovaZPYUDXkH/1iKRf2+M5+OrRgzC7O9b7w==", + "dependencies": { + "pump": "^3.0.0", + "tar-stream": "^3.1.5" + }, + "optionalDependencies": { + "bare-fs": "^4.0.1", + "bare-path": "^3.0.0" + } + }, "node_modules/tar-stream": { "version": "3.1.7", "resolved": "https://registry.npmjs.org/tar-stream/-/tar-stream-3.1.7.tgz", "integrity": "sha512-qJj60CXt7IU1Ffyc3NJMjh6EkuCFej46zUqJ4J7pqYlThyd9bO0XBTmcOIhSzZJVWfsLks0+nle/j538YAW9RQ==", - "dev": true, "license": "MIT", "dependencies": { "b4a": "^1.6.4", @@ -11423,7 +12420,6 @@ "version": "1.2.3", "resolved": "https://registry.npmjs.org/text-decoder/-/text-decoder-1.2.3.tgz", "integrity": "sha512-3/o9z3X0X0fTupwsYvR03pJ/DjWuqqrfwBgTQzdWDiQSm9KitAyz/9WqsT2JQW7KV2m+bC2ol/zqpW37NHxLaA==", - "dev": true, "license": "Apache-2.0", "dependencies": { "b4a": "^1.6.4" @@ -11436,6 +12432,19 @@ "dev": true, "license": "MIT" }, + "node_modules/tldts-core": { + "version": "5.7.112", + "resolved": "https://registry.npmjs.org/tldts-core/-/tldts-core-5.7.112.tgz", + "integrity": "sha512-mutrEUgG2sp0e/MIAnv9TbSLR0IPbvmAImpzqul5O/HJ2XM1/I1sajchQ/fbj0fPdA31IiuWde8EUhfwyldY1Q==" + }, + "node_modules/tldts-experimental": { + "version": "5.7.112", + "resolved": "https://registry.npmjs.org/tldts-experimental/-/tldts-experimental-5.7.112.tgz", + "integrity": "sha512-Nq5qWN4OiLziAOOOEoSME7cZI4Hz8Srt+9q6cl8mZ5EAhCfmeE6l7K5XjuIKN+pySuGUvthE5aPiD185YU1/lg==", + "dependencies": { + "tldts-core": "^5.7.112" + } + }, "node_modules/tmp": { "version": "0.0.33", "resolved": "https://registry.npmjs.org/tmp/-/tmp-0.0.33.tgz", @@ -11495,6 +12504,11 @@ "url": "https://github.com/sponsors/Borewit" } }, + "node_modules/tr46": { + "version": "0.0.3", + "resolved": "https://registry.npmjs.org/tr46/-/tr46-0.0.3.tgz", + "integrity": "sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==" + }, "node_modules/tree-kill": { "version": "1.2.2", "resolved": 
"https://registry.npmjs.org/tree-kill/-/tree-kill-1.2.2.tgz", @@ -11746,6 +12760,11 @@ "node": ">= 0.6" } }, + "node_modules/typed-query-selector": { + "version": "2.12.0", + "resolved": "https://registry.npmjs.org/typed-query-selector/-/typed-query-selector-2.12.0.tgz", + "integrity": "sha512-SbklCd1F0EiZOyPiW192rrHZzZ5sBijB6xM+cpmrwDqObvdtunOHHIk9fCGsoK5JVIYXoyEp4iEdE3upFH3PAg==" + }, "node_modules/typedarray": { "version": "0.0.6", "resolved": "https://registry.npmjs.org/typedarray/-/typedarray-0.0.6.tgz", @@ -11840,7 +12859,6 @@ "version": "2.0.1", "resolved": "https://registry.npmjs.org/universalify/-/universalify-2.0.1.tgz", "integrity": "sha512-gptHNQghINnc/vTGIk0SOFGFNXw7JVrlRUtConJRlvaw6DuX0wO5Jeko9sWrMBhh+PsYAZ7oXAiOnf/UKogyiw==", - "dev": true, "license": "MIT", "engines": { "node": ">= 10.0.0" @@ -11997,6 +13015,11 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/webidl-conversions": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-3.0.1.tgz", + "integrity": "sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ==" + }, "node_modules/webpack": { "version": "5.99.9", "resolved": "https://registry.npmjs.org/webpack/-/webpack-5.99.9.tgz", @@ -12197,6 +13220,15 @@ "url": "https://opencollective.com/webpack" } }, + "node_modules/whatwg-url": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-5.0.0.tgz", + "integrity": "sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw==", + "dependencies": { + "tr46": "~0.0.3", + "webidl-conversions": "^3.0.0" + } + }, "node_modules/which": { "version": "2.0.2", "resolved": "https://registry.npmjs.org/which/-/which-2.0.2.tgz", @@ -12330,6 +13362,26 @@ "dev": true, "license": "ISC" }, + "node_modules/ws": { + "version": "8.18.3", + "resolved": "https://registry.npmjs.org/ws/-/ws-8.18.3.tgz", + "integrity": 
"sha512-PEIGCY5tSlUt50cqyMXfCzX+oOPqN0vuGqWzbcJ2xvnkzkq46oOpz7dQaTDBdfICb4N14+GARUDw2XV2N4tvzg==", + "engines": { + "node": ">=10.0.0" + }, + "peerDependencies": { + "bufferutil": "^4.0.1", + "utf-8-validate": ">=5.0.2" + }, + "peerDependenciesMeta": { + "bufferutil": { + "optional": true + }, + "utf-8-validate": { + "optional": true + } + } + }, "node_modules/xtend": { "version": "4.0.2", "resolved": "https://registry.npmjs.org/xtend/-/xtend-4.0.2.tgz", @@ -12343,7 +13395,6 @@ "version": "5.0.8", "resolved": "https://registry.npmjs.org/y18n/-/y18n-5.0.8.tgz", "integrity": "sha512-0pfFzegeDWJHJIAmTLRP2DwHjdF5s7jo9tuztdQxAhINCdvS+3nGINqPd00AphqJR/0LhANUS6/+7SCb98YOfA==", - "dev": true, "license": "ISC", "engines": { "node": ">=10" @@ -12360,7 +13411,6 @@ "version": "17.7.2", "resolved": "https://registry.npmjs.org/yargs/-/yargs-17.7.2.tgz", "integrity": "sha512-7dSzzRQ++CKnNI/krKnYRV7JKKPUXMEh61soaHKg9mrWEhzFWhFnxPxGl+69cD1Ou63C13NUPCnmIcrvqCuM6w==", - "dev": true, "license": "MIT", "dependencies": { "cliui": "^8.0.1", @@ -12379,7 +13429,6 @@ "version": "21.1.1", "resolved": "https://registry.npmjs.org/yargs-parser/-/yargs-parser-21.1.1.tgz", "integrity": "sha512-tVpsJW7DdjecAiFpbIB1e3qxIQsE6NoPc5/eTdrbbIC4h0LVsWhnoa3g+m2HclBIujHzsxZ4VJVA+GUuc2/LBw==", - "dev": true, "license": "ISC", "engines": { "node": ">=12" @@ -12434,6 +13483,14 @@ "funding": { "url": "https://github.com/sponsors/sindresorhus" } + }, + "node_modules/zod": { + "version": "3.25.72", + "resolved": "https://registry.npmjs.org/zod/-/zod-3.25.72.tgz", + "integrity": "sha512-Cl+fe4dNL4XumOBNBsr0lHfA80PQiZXHI4xEMTEr8gt6aGz92t3lBA32e71j9+JeF/VAYvdfBnuwJs+BMx/BrA==", + "funding": { + "url": "https://github.com/sponsors/colinhacks" + } } } } diff --git a/package.json b/package.json index 22492b8..e2bb31b 100644 --- a/package.json +++ b/package.json @@ -42,6 +42,10 @@ "helmet": "^8.1.0", "passport": "^0.7.0", "passport-google-oauth20": "^2.0.0", + "puppeteer-core": "^24.11.2", + "puppeteer-extra": 
"^3.3.6", + "puppeteer-extra-plugin-adblocker": "^2.13.6", + "puppeteer-extra-plugin-stealth": "^2.11.2", "reflect-metadata": "^0.2.2", "rxjs": "^7.8.1", "swagger-ui-express": "^5.0.1" diff --git a/src/config/app.config.ts b/src/config/app.config.ts index bf5118d..65f22da 100644 --- a/src/config/app.config.ts +++ b/src/config/app.config.ts @@ -3,4 +3,5 @@ import { registerAs } from '@nestjs/config'; export default registerAs('app', () => ({ BASE_URL: process.env.BASE_URL || `http://localhost:${process.env.PORT || 4000}`, CLIENT_URL: process.env.CLIENT_URL || 'http://localhost:3000', + BROWSER_PATH: process.env.BROWSER_PATH || '/usr/bin/chromium-browser', })); diff --git a/src/modules/scraper/scraper.module.ts b/src/modules/scraper/scraper.module.ts new file mode 100644 index 0000000..54cb42d --- /dev/null +++ b/src/modules/scraper/scraper.module.ts @@ -0,0 +1,8 @@ +import { Module } from '@nestjs/common'; + +@Module({ + imports: [], + controllers: [], + providers: [], +}) +export class ScraperModule {} diff --git a/src/modules/scraper/services/browser.service.ts b/src/modules/scraper/services/browser.service.ts new file mode 100644 index 0000000..82f0b75 --- /dev/null +++ b/src/modules/scraper/services/browser.service.ts @@ -0,0 +1,112 @@ +import { Injectable, Logger, OnApplicationShutdown } from '@nestjs/common'; +import { ConfigService } from '@nestjs/config'; +import puppeteer from 'puppeteer-extra'; +import { Browser } from 'puppeteer-core'; +import StealthPlugin from 'puppeteer-extra-plugin-stealth'; +import AdblockerPlugin from 'puppeteer-extra-plugin-adblocker'; + +@Injectable() +export class BrowserService implements OnApplicationShutdown { + private readonly logger = new Logger(BrowserService.name); + private browser: Browser | null = null; + + constructor(private readonly configService: ConfigService) { + // 플러그인은 Chrome 전용이므로 생성자에서 한 번만 등록 + puppeteer.use(StealthPlugin()); + puppeteer.use(AdblockerPlugin({ blockTrackers: true })); + } + + /** + * 
Puppeteer(Chrome) 브라우저 싱글턴을 반환합니다. + */ + async getBrowser(): Promise { + if (this.browser) { + return this.browser; + } + + const executablePath = this.configService.get('app.BROWSER_PATH'); + + this.logger.log('Starting Puppeteer (Chrome) browser...'); + + this.browser = (await puppeteer.launch({ + args: [ + '--autoplay-policy=user-gesture-required', + '--disable-component-update', + '--disable-domain-reliability', + '--disable-print-preview', + '--disable-setuid-sandbox', + '--disable-speech-api', + '--enable-features=SharedArrayBuffer', + '--hide-scrollbars', + '--mute-audio', + '--no-default-browser-check', + '--no-pings', + '--no-sandbox', + '--no-zygote', + '--disable-extensions', + '--disable-dev-shm-usage', + '--no-first-run', + '--disable-background-networking', + '--disable-gpu', + '--disable-software-rasterizer', + ], + defaultViewport: { + deviceScaleFactor: 1, + hasTouch: false, + height: 1080, + isLandscape: true, + isMobile: false, + width: 1920, + }, + executablePath, + headless: true, + timeout: 30_000, + })) as unknown as Browser; + + const version = await this.browser.version(); + this.logger.log(`Browser started: ${version}`); + + // 비정상 종료 시 자동 재연결 + this.browser.on('disconnected', () => { + void this.handleDisconnection(); + }); + + return this.browser; + } + + /** + * 브라우저가 끊어졌을 때 재연결을 시도합니다. + */ + private async handleDisconnection(): Promise { + this.logger.warn('Browser disconnected, reconnecting...'); + this.browser = null; + await this.getBrowser(); + } + + /** + * Nest 애플리케이션 종료 훅 + */ + async onApplicationShutdown(): Promise { + await this.closeBrowser(); + } + + /** + * 브라우저를 안전하게 종료합니다. 
+ */ + async closeBrowser(): Promise { + if (!this.browser) { + this.logger.log('No browser instance to close'); + return; + } + + this.logger.log('Closing browser...'); + try { + await this.browser.close(); + this.logger.log('Browser closed successfully'); + } catch (error) { + this.logger.error('Error closing browser', error as Error); + } finally { + this.browser = null; + } + } +} diff --git a/src/modules/scraper/services/puppeteer-parse.ts b/src/modules/scraper/services/puppeteer-parse.ts new file mode 100644 index 0000000..e69de29 From 6b4bef17f5209446eb718b3fe76fcb81c6377daa Mon Sep 17 00:00:00 2001 From: reach0908 Date: Fri, 4 Jul 2025 20:38:30 +0900 Subject: [PATCH 02/28] =?UTF-8?q?feat(rules):=20NestJS=20=EB=AA=A8?= =?UTF-8?q?=EB=B2=94=20=EC=82=AC=EB=A1=80=20=EB=AC=B8=EC=84=9C=20=EC=B6=94?= =?UTF-8?q?=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .cursor/rules/nestjs-best-practice.mdc | 240 +++++++++++++++++++++++++ 1 file changed, 240 insertions(+) create mode 100644 .cursor/rules/nestjs-best-practice.mdc diff --git a/.cursor/rules/nestjs-best-practice.mdc b/.cursor/rules/nestjs-best-practice.mdc new file mode 100644 index 0000000..9e69a69 --- /dev/null +++ b/.cursor/rules/nestjs-best-practice.mdc @@ -0,0 +1,240 @@ +--- +alwaysApply: true +--- + +You are a senior TypeScript programmer with experience in the NestJS framework and a preference for clean programming and design patterns. Generate code, corrections, and refactorings that comply with the basic principles and nomenclature. + +## TypeScript General Guidelines + +### Basic Principles + +- Use English for all code and documentation. +- Always declare the type of each variable and function (parameters and return value). +- Avoid using any. +- Create necessary types. +- Use JSDoc to document public classes and methods. +- Don't leave blank lines within a function. +- One export per file. + +### Nomenclature + +- Use PascalCase for classes. 
+- Use camelCase for variables, functions, and methods. +- Use kebab-case for file and directory names. +- Use UPPERCASE for environment variables. +- Avoid magic numbers and define constants. +- Start each function with a verb. +- Use verbs for boolean variables. Example: isLoading, hasError, canDelete, etc. +- Use complete words instead of abbreviations and correct spelling. +- Except for standard abbreviations like API, URL, etc. +- Except for well-known abbreviations: + - i, j for loops + - err for errors + - ctx for contexts + - req, res, next for middleware function parameters + +### Functions + +- In this context, what is understood as a function will also apply to a method. +- Write short functions with a single purpose. Less than 20 instructions. +- Name functions with a verb and something else. +- If it returns a boolean, use isX or hasX, canX, etc. +- If it doesn't return anything, use executeX or saveX, etc. +- Avoid nesting blocks by: + - Early checks and returns. + - Extraction to utility functions. +- Use higher-order functions (map, filter, reduce, etc.) to avoid function nesting. +- Use arrow functions for simple functions (less than 3 instructions). +- Use named functions for non-simple functions. +- Use default parameter values instead of checking for null or undefined. +- Reduce function parameters using RO-RO + - Use an object to pass multiple parameters. + - Use an object to return results. + - Declare necessary types for input arguments and output. +- Use a single level of abstraction. + +### Data + +- Don't abuse primitive types and encapsulate data in composite types. +- Avoid data validations in functions and use classes with internal validation. +- Prefer immutability for data. +- Use readonly for data that doesn't change. +- Use as const for literals that don't change. + +### Classes + +- Follow SOLID principles. +- Prefer composition over inheritance. +- Declare interfaces to define contracts. 
+- Write small classes with a single purpose. + - Less than 200 instructions. + - Less than 10 public methods. + - Less than 10 properties. + +### Exceptions + +- Use exceptions to handle errors you don't expect. +- If you catch an exception, it should be to: + - Fix an expected problem. + - Add context. + - Otherwise, use a global handler. + +### Testing + +- Follow the Arrange-Act-Assert convention for tests. +- Name test variables clearly. +- Follow the convention: inputX, mockX, actualX, expectedX, etc. +- Write unit tests for each public function. +- Use test doubles to simulate dependencies. + - Except for third-party dependencies that are not expensive to execute. +- Write acceptance tests for each module. +- Follow the Given-When-Then convention. + +## Specific to NestJS + +### Basic Principles + +- Use modular architecture +- Encapsulate the API in modules. + - One module per main domain/route. + - One controller for its route. + - And other controllers for secondary routes. + - A models folder with data types. + - DTOs validated with class-validator for inputs. + - Declare simple types for outputs. + - A services module with business logic and persistence. + - Entities with MikroORM for data persistence. + - One service per entity. +- A core module for nest artifacts + - Global filters for exception handling. + - Global middlewares for request management. + - Guards for permission management. + - Interceptors for request management. +- A shared module for services shared between modules. + - Utilities + - Shared business logic + +### Testing + +- Use the standard Jest framework for testing. +- Write tests for each controller and service. +- Write end to end tests for each api module. +- Add a admin/test method to each controller as a smoke test. + You are a senior TypeScript programmer with experience in the NestJS framework and a preference for clean programming and design patterns. 
Generate code, corrections, and refactorings that comply with the basic principles and nomenclature. + +## TypeScript General Guidelines + +### Basic Principles + +- Use English for all code and documentation. +- Always declare the type of each variable and function (parameters and return value). +- Avoid using any. +- Create necessary types. +- Use JSDoc to document public classes and methods. +- Don't leave blank lines within a function. +- One export per file. + +### Nomenclature + +- Use PascalCase for classes. +- Use camelCase for variables, functions, and methods. +- Use kebab-case for file and directory names. +- Use UPPERCASE for environment variables. +- Avoid magic numbers and define constants. +- Start each function with a verb. +- Use verbs for boolean variables. Example: isLoading, hasError, canDelete, etc. +- Use complete words instead of abbreviations and correct spelling. +- Except for standard abbreviations like API, URL, etc. +- Except for well-known abbreviations: + - i, j for loops + - err for errors + - ctx for contexts + - req, res, next for middleware function parameters + +### Functions + +- In this context, what is understood as a function will also apply to a method. +- Write short functions with a single purpose. Less than 20 instructions. +- Name functions with a verb and something else. +- If it returns a boolean, use isX or hasX, canX, etc. +- If it doesn't return anything, use executeX or saveX, etc. +- Avoid nesting blocks by: + - Early checks and returns. + - Extraction to utility functions. +- Use higher-order functions (map, filter, reduce, etc.) to avoid function nesting. +- Use arrow functions for simple functions (less than 3 instructions). +- Use named functions for non-simple functions. +- Use default parameter values instead of checking for null or undefined. +- Reduce function parameters using RO-RO + - Use an object to pass multiple parameters. + - Use an object to return results. 
+ - Declare necessary types for input arguments and output. +- Use a single level of abstraction. + +### Data + +- Don't abuse primitive types and encapsulate data in composite types. +- Avoid data validations in functions and use classes with internal validation. +- Prefer immutability for data. +- Use readonly for data that doesn't change. +- Use as const for literals that don't change. + +### Classes + +- Follow SOLID principles. +- Prefer composition over inheritance. +- Declare interfaces to define contracts. +- Write small classes with a single purpose. + - Less than 200 instructions. + - Less than 10 public methods. + - Less than 10 properties. + +### Exceptions + +- Use exceptions to handle errors you don't expect. +- If you catch an exception, it should be to: + - Fix an expected problem. + - Add context. + - Otherwise, use a global handler. + +### Testing + +- Follow the Arrange-Act-Assert convention for tests. +- Name test variables clearly. +- Follow the convention: inputX, mockX, actualX, expectedX, etc. +- Write unit tests for each public function. +- Use test doubles to simulate dependencies. + - Except for third-party dependencies that are not expensive to execute. +- Write acceptance tests for each module. +- Follow the Given-When-Then convention. + +## Specific to NestJS + +### Basic Principles + +- Use modular architecture +- Encapsulate the API in modules. + - One module per main domain/route. + - One controller for its route. + - And other controllers for secondary routes. + - A models folder with data types. + - DTOs validated with class-validator for inputs. + - Declare simple types for outputs. + - A services module with business logic and persistence. + - Entities with MikroORM for data persistence. + - One service per entity. +- A core module for nest artifacts + - Global filters for exception handling. + - Global middlewares for request management. + - Guards for permission management. + - Interceptors for request management. 
+- A shared module for services shared between modules. + - Utilities + - Shared business logic + +### Testing + +- Use the standard Jest framework for testing. +- Write tests for each controller and service. +- Write end to end tests for each api module. +- Add a admin/test method to each controller as a smoke test. From 09c906eea3a3fd59a8ee8a4d824c6a13754bde68 Mon Sep 17 00:00:00 2001 From: reach0908 Date: Fri, 4 Jul 2025 20:47:09 +0900 Subject: [PATCH 03/28] =?UTF-8?q?feat(article):=20=EA=B8=B0=EC=82=AC=20?= =?UTF-8?q?=EB=AA=A8=EB=8D=B8=20=EB=B0=8F=20CRUD=20=EA=B8=B0=EB=8A=A5=20?= =?UTF-8?q?=EC=B6=94=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Article 모델을 추가하고, 관련 DTO 및 리포지토리, 서비스, 컨트롤러를 구현하여 기사 생성, 조회, 업데이트, 삭제 기능을 추가했습니다. * 스크래핑된 콘텐츠 저장 기능과 URL 중복 확인 기능도 포함되었습니다. * 데이터베이스 마이그레이션 파일을 생성하여 Article 테이블을 정의했습니다. * Swagger 문서화를 통해 API 문서화 작업을 진행했습니다. --- package-lock.json | 414 +++++++++++++++++- package.json | 3 + .../migration.sql | 40 ++ prisma/schema.prisma | 29 ++ src/app.module.ts | 4 + src/modules/article/article.controller.ts | 313 +++++++++++++ src/modules/article/article.module.ts | 17 + src/modules/article/dto/article.output.ts | 109 +++++ .../article/dto/create-article.input.ts | 119 +++++ .../article/dto/list-articles.input.ts | 85 ++++ .../article/dto/paginated-articles.output.ts | 49 +++ .../article/dto/save-scraped-content.input.ts | 44 ++ .../article/dto/update-article.input.ts | 99 +++++ .../repositories/article.repository.ts | 314 +++++++++++++ .../article/services/article.service.ts | 236 ++++++++++ 15 files changed, 1874 insertions(+), 1 deletion(-) create mode 100644 prisma/migrations/20250704113432_add_article_model/migration.sql create mode 100644 src/modules/article/article.controller.ts create mode 100644 src/modules/article/article.module.ts create mode 100644 src/modules/article/dto/article.output.ts create mode 100644 src/modules/article/dto/create-article.input.ts create mode 
100644 src/modules/article/dto/list-articles.input.ts create mode 100644 src/modules/article/dto/paginated-articles.output.ts create mode 100644 src/modules/article/dto/save-scraped-content.input.ts create mode 100644 src/modules/article/dto/update-article.input.ts create mode 100644 src/modules/article/repositories/article.repository.ts create mode 100644 src/modules/article/services/article.service.ts diff --git a/package-lock.json b/package-lock.json index 2fb4a1d..850613b 100644 --- a/package-lock.json +++ b/package-lock.json @@ -9,6 +9,7 @@ "version": "0.0.1", "license": "UNLICENSED", "dependencies": { + "@mozilla/readability": "^0.6.0", "@nestjs/common": "^11.0.1", "@nestjs/config": "^4.0.2", "@nestjs/core": "^11.0.1", @@ -22,6 +23,7 @@ "class-validator": "^0.14.2", "cookie-parser": "^1.4.7", "helmet": "^8.1.0", + "jsdom": "^26.1.0", "passport": "^0.7.0", "passport-google-oauth20": "^2.0.0", "puppeteer-core": "^24.11.2", @@ -44,6 +46,7 @@ "@types/cookie-parser": "^1.4.9", "@types/express": "^5.0.0", "@types/jest": "^29.5.14", + "@types/jsdom": "^21.1.7", "@types/node": "^22.10.7", "@types/passport-google-oauth20": "^2.0.16", "@types/supertest": "^6.0.2", @@ -223,6 +226,23 @@ "tslib": "^2.1.0" } }, + "node_modules/@asamuzakjp/css-color": { + "version": "3.2.0", + "resolved": "https://registry.npmjs.org/@asamuzakjp/css-color/-/css-color-3.2.0.tgz", + "integrity": "sha512-K1A6z8tS3XsmCMM86xoWdn7Fkdn9m6RSVtocUrJYIwZnFVkng/PvkEoWtOWmP+Scc6saYWHWZYbndEEXxl24jw==", + "dependencies": { + "@csstools/css-calc": "^2.1.3", + "@csstools/css-color-parser": "^3.0.9", + "@csstools/css-parser-algorithms": "^3.0.4", + "@csstools/css-tokenizer": "^3.0.3", + "lru-cache": "^10.4.3" + } + }, + "node_modules/@asamuzakjp/css-color/node_modules/lru-cache": { + "version": "10.4.3", + "resolved": "https://registry.npmjs.org/lru-cache/-/lru-cache-10.4.3.tgz", + "integrity": "sha512-JNAzZcXrCt42VGLuYz0zfAzDfAvJWW6AfYlDBQyDV5DClI2m5sAmK+OIO7s59XfsRsWHp02jAJrRadPRGTt6SQ==" + }, 
"node_modules/@babel/code-frame": { "version": "7.27.1", "resolved": "https://registry.npmjs.org/@babel/code-frame/-/code-frame-7.27.1.tgz", @@ -830,6 +850,111 @@ "@jridgewell/sourcemap-codec": "^1.4.10" } }, + "node_modules/@csstools/color-helpers": { + "version": "5.0.2", + "resolved": "https://registry.npmjs.org/@csstools/color-helpers/-/color-helpers-5.0.2.tgz", + "integrity": "sha512-JqWH1vsgdGcw2RR6VliXXdA0/59LttzlU8UlRT/iUUsEeWfYq8I+K0yhihEUTTHLRm1EXvpsCx3083EU15ecsA==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/csstools" + }, + { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + } + ], + "engines": { + "node": ">=18" + } + }, + "node_modules/@csstools/css-calc": { + "version": "2.1.4", + "resolved": "https://registry.npmjs.org/@csstools/css-calc/-/css-calc-2.1.4.tgz", + "integrity": "sha512-3N8oaj+0juUw/1H3YwmDDJXCgTB1gKU6Hc/bB502u9zR0q2vd786XJH9QfrKIEgFlZmhZiq6epXl4rHqhzsIgQ==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/csstools" + }, + { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + } + ], + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "@csstools/css-parser-algorithms": "^3.0.5", + "@csstools/css-tokenizer": "^3.0.4" + } + }, + "node_modules/@csstools/css-color-parser": { + "version": "3.0.10", + "resolved": "https://registry.npmjs.org/@csstools/css-color-parser/-/css-color-parser-3.0.10.tgz", + "integrity": "sha512-TiJ5Ajr6WRd1r8HSiwJvZBiJOqtH86aHpUjq5aEKWHiII2Qfjqd/HCWKPOW8EP4vcspXbHnXrwIDlu5savQipg==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/csstools" + }, + { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + } + ], + "dependencies": { + "@csstools/color-helpers": "^5.0.2", + "@csstools/css-calc": "^2.1.4" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "@csstools/css-parser-algorithms": "^3.0.5", + "@csstools/css-tokenizer": 
"^3.0.4" + } + }, + "node_modules/@csstools/css-parser-algorithms": { + "version": "3.0.5", + "resolved": "https://registry.npmjs.org/@csstools/css-parser-algorithms/-/css-parser-algorithms-3.0.5.tgz", + "integrity": "sha512-DaDeUkXZKjdGhgYaHNJTV9pV7Y9B3b644jCLs9Upc3VeNGg6LWARAT6O+Q+/COo+2gg/bM5rhpMAtf70WqfBdQ==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/csstools" + }, + { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + } + ], + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "@csstools/css-tokenizer": "^3.0.4" + } + }, + "node_modules/@csstools/css-tokenizer": { + "version": "3.0.4", + "resolved": "https://registry.npmjs.org/@csstools/css-tokenizer/-/css-tokenizer-3.0.4.tgz", + "integrity": "sha512-Vd/9EVDiu6PPJt9yAh6roZP6El1xHrdvIVGjyBsHR0RYwNHgL7FJPyIIW4fANJNG6FtyZfvlRPpFI4ZM/lubvw==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/csstools" + }, + { + "type": "opencollective", + "url": "https://opencollective.com/csstools" + } + ], + "engines": { + "node": ">=18" + } + }, "node_modules/@eslint-community/eslint-utils": { "version": "4.7.0", "resolved": "https://registry.npmjs.org/@eslint-community/eslint-utils/-/eslint-utils-4.7.0.tgz", @@ -2049,6 +2174,14 @@ "resolved": "https://registry.npmjs.org/@microsoft/tsdoc/-/tsdoc-0.15.1.tgz", "integrity": "sha512-4aErSrCR/On/e5G2hDP0wjooqDdauzEbIq8hIkIe5pXV0rtWJZvdCEKL0ykZxex+IxIwBp0eGeV48hQN07dXtw==" }, + "node_modules/@mozilla/readability": { + "version": "0.6.0", + "resolved": "https://registry.npmjs.org/@mozilla/readability/-/readability-0.6.0.tgz", + "integrity": "sha512-juG5VWh4qAivzTAeMzvY9xs9HY5rAcr2E4I7tiSSCokRFi7XIZCAu92ZkSTsIj1OPceCifL3cpfteP3pDT9/QQ==", + "engines": { + "node": ">=14.0.0" + } + }, "node_modules/@napi-rs/nice": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/@napi-rs/nice/-/nice-1.0.1.tgz", @@ -3773,6 +3906,17 @@ "pretty-format": "^29.0.0" } }, + 
"node_modules/@types/jsdom": { + "version": "21.1.7", + "resolved": "https://registry.npmjs.org/@types/jsdom/-/jsdom-21.1.7.tgz", + "integrity": "sha512-yOriVnggzrnQ3a9OKOCxaVuSug3w3/SbOj5i7VwXWZEyUNl3bLF9V3MfxGbZKuwqJOQyRfqXyROBB1CoZLFWzA==", + "dev": true, + "dependencies": { + "@types/node": "*", + "@types/tough-cookie": "*", + "parse5": "^7.0.0" + } + }, "node_modules/@types/json-schema": { "version": "7.0.15", "resolved": "https://registry.npmjs.org/@types/json-schema/-/json-schema-7.0.15.tgz", @@ -3928,6 +4072,12 @@ "@types/superagent": "^8.1.0" } }, + "node_modules/@types/tough-cookie": { + "version": "4.0.5", + "resolved": "https://registry.npmjs.org/@types/tough-cookie/-/tough-cookie-4.0.5.tgz", + "integrity": "sha512-/Ad8+nIOV7Rl++6f1BdKxFSMgmoqEoYbHRpPcx3JEfv8VRsQe9Z4mCXeJBzxs7mbHY/XOZZuXlRNfhpVPbs6ZA==", + "dev": true + }, "node_modules/@types/validator": { "version": "13.15.2", "resolved": "https://registry.npmjs.org/@types/validator/-/validator-13.15.2.tgz", @@ -6336,6 +6486,18 @@ "node": ">= 8" } }, + "node_modules/cssstyle": { + "version": "4.6.0", + "resolved": "https://registry.npmjs.org/cssstyle/-/cssstyle-4.6.0.tgz", + "integrity": "sha512-2z+rWdzbbSZv6/rhtvzvqeZQHrBaqgogqt85sqFNbabZOuFbCVFb8kPeEtZjiKkbrm395irpNKiYeFeLiQnFPg==", + "dependencies": { + "@asamuzakjp/css-color": "^3.2.0", + "rrweb-cssom": "^0.8.0" + }, + "engines": { + "node": ">=18" + } + }, "node_modules/data-uri-to-buffer": { "version": "6.0.2", "resolved": "https://registry.npmjs.org/data-uri-to-buffer/-/data-uri-to-buffer-6.0.2.tgz", @@ -6344,6 +6506,49 @@ "node": ">= 14" } }, + "node_modules/data-urls": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/data-urls/-/data-urls-5.0.0.tgz", + "integrity": "sha512-ZYP5VBHshaDAiVZxjbRVcFJpc+4xGgT0bK3vzy1HLN8jTO975HEbuYzZJcHoQEY5K1a0z8YayJkyVETa08eNTg==", + "dependencies": { + "whatwg-mimetype": "^4.0.0", + "whatwg-url": "^14.0.0" + }, + "engines": { + "node": ">=18" + } + }, + 
"node_modules/data-urls/node_modules/tr46": { + "version": "5.1.1", + "resolved": "https://registry.npmjs.org/tr46/-/tr46-5.1.1.tgz", + "integrity": "sha512-hdF5ZgjTqgAntKkklYw0R03MG2x/bSzTtkxmIRw/sTNV8YXsCJ1tfLAX23lhxhHJlEf3CRCOCGGWw3vI3GaSPw==", + "dependencies": { + "punycode": "^2.3.1" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/data-urls/node_modules/webidl-conversions": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-7.0.0.tgz", + "integrity": "sha512-VwddBukDzu71offAQR975unBIGqfKZpM+8ZX6ySk8nYhVoo5CYaZyzt3YBvYtRtO+aoGlqxPg/B87NGVZ/fu6g==", + "engines": { + "node": ">=12" + } + }, + "node_modules/data-urls/node_modules/whatwg-url": { + "version": "14.2.0", + "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-14.2.0.tgz", + "integrity": "sha512-De72GdQZzNTUBBChsXueQUnPKDkg/5A5zp7pFDuQAj5UFoENpiACU0wlCvzpAGnTkj++ihpKwKyYewn/XNUbKw==", + "dependencies": { + "tr46": "^5.1.0", + "webidl-conversions": "^7.0.0" + }, + "engines": { + "node": ">=18" + } + }, "node_modules/debug": { "version": "4.4.1", "resolved": "https://registry.npmjs.org/debug/-/debug-4.4.1.tgz", @@ -6361,6 +6566,11 @@ } } }, + "node_modules/decimal.js": { + "version": "10.5.0", + "resolved": "https://registry.npmjs.org/decimal.js/-/decimal.js-10.5.0.tgz", + "integrity": "sha512-8vDa8Qxvr/+d94hSh5P3IJwI5t8/c0KsMp+g8bNw9cY2icONa5aPfvKeieW1WlG0WQYwwhJ7mjui2xtiePQSXw==" + }, "node_modules/decompress-response": { "version": "6.0.0", "resolved": "https://registry.npmjs.org/decompress-response/-/decompress-response-6.0.0.tgz", @@ -6681,6 +6891,17 @@ "node": ">=10.13.0" } }, + "node_modules/entities": { + "version": "6.0.1", + "resolved": "https://registry.npmjs.org/entities/-/entities-6.0.1.tgz", + "integrity": "sha512-aN97NXWF6AWBTahfVOIrB/NShkzi5H7F9r1s9mD3cDj4Ko5f2qhhVoYMibXF7GlLveb/D2ioWay8lxI97Ven3g==", + "engines": { + "node": ">=0.12" + }, + "funding": { + "url": "https://github.com/fb55/entities?sponsor=1" + 
} + }, "node_modules/env-paths": { "version": "2.2.1", "resolved": "https://registry.npmjs.org/env-paths/-/env-paths-2.2.1.tgz", @@ -8032,6 +8253,17 @@ "node": ">=18.0.0" } }, + "node_modules/html-encoding-sniffer": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/html-encoding-sniffer/-/html-encoding-sniffer-4.0.0.tgz", + "integrity": "sha512-Y22oTqIU4uuPgEemfz7NDJz6OeKf12Lsu+QC+s3BVpda64lTiMYCyGwg5ki4vFxkMwQdeZDl2adZoqUgdFuTgQ==", + "dependencies": { + "whatwg-encoding": "^3.1.1" + }, + "engines": { + "node": ">=18" + } + }, "node_modules/html-escaper": { "version": "2.0.2", "resolved": "https://registry.npmjs.org/html-escaper/-/html-escaper-2.0.2.tgz", @@ -8378,6 +8610,11 @@ "node": ">=0.10.0" } }, + "node_modules/is-potential-custom-element-name": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/is-potential-custom-element-name/-/is-potential-custom-element-name-1.0.1.tgz", + "integrity": "sha512-bCYeRA2rVibKZd+s2625gGnGF/t7DSqDs4dP7CrLA1m7jKWz6pps0LpYLJN8Q64HtmPKJ1hrN3nzPNKFEKOUiQ==" + }, "node_modules/is-promise": { "version": "4.0.0", "resolved": "https://registry.npmjs.org/is-promise/-/is-promise-4.0.0.tgz", @@ -9244,6 +9481,75 @@ "resolved": "https://registry.npmjs.org/jsbn/-/jsbn-1.1.0.tgz", "integrity": "sha512-4bYVV3aAMtDTTu4+xsDYa6sy9GyJ69/amsu9sYF2zqjiEoZA5xJi3BrfX3uY+/IekIu7MwdObdbDWpoZdBv3/A==" }, + "node_modules/jsdom": { + "version": "26.1.0", + "resolved": "https://registry.npmjs.org/jsdom/-/jsdom-26.1.0.tgz", + "integrity": "sha512-Cvc9WUhxSMEo4McES3P7oK3QaXldCfNWp7pl2NNeiIFlCoLr3kfq9kb1fxftiwk1FLV7CvpvDfonxtzUDeSOPg==", + "dependencies": { + "cssstyle": "^4.2.1", + "data-urls": "^5.0.0", + "decimal.js": "^10.5.0", + "html-encoding-sniffer": "^4.0.0", + "http-proxy-agent": "^7.0.2", + "https-proxy-agent": "^7.0.6", + "is-potential-custom-element-name": "^1.0.1", + "nwsapi": "^2.2.16", + "parse5": "^7.2.1", + "rrweb-cssom": "^0.8.0", + "saxes": "^6.0.0", + "symbol-tree": "^3.2.4", + "tough-cookie": "^5.1.1", + 
"w3c-xmlserializer": "^5.0.0", + "webidl-conversions": "^7.0.0", + "whatwg-encoding": "^3.1.1", + "whatwg-mimetype": "^4.0.0", + "whatwg-url": "^14.1.1", + "ws": "^8.18.0", + "xml-name-validator": "^5.0.0" + }, + "engines": { + "node": ">=18" + }, + "peerDependencies": { + "canvas": "^3.0.0" + }, + "peerDependenciesMeta": { + "canvas": { + "optional": true + } + } + }, + "node_modules/jsdom/node_modules/tr46": { + "version": "5.1.1", + "resolved": "https://registry.npmjs.org/tr46/-/tr46-5.1.1.tgz", + "integrity": "sha512-hdF5ZgjTqgAntKkklYw0R03MG2x/bSzTtkxmIRw/sTNV8YXsCJ1tfLAX23lhxhHJlEf3CRCOCGGWw3vI3GaSPw==", + "dependencies": { + "punycode": "^2.3.1" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/jsdom/node_modules/webidl-conversions": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-7.0.0.tgz", + "integrity": "sha512-VwddBukDzu71offAQR975unBIGqfKZpM+8ZX6ySk8nYhVoo5CYaZyzt3YBvYtRtO+aoGlqxPg/B87NGVZ/fu6g==", + "engines": { + "node": ">=12" + } + }, + "node_modules/jsdom/node_modules/whatwg-url": { + "version": "14.2.0", + "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-14.2.0.tgz", + "integrity": "sha512-De72GdQZzNTUBBChsXueQUnPKDkg/5A5zp7pFDuQAj5UFoENpiACU0wlCvzpAGnTkj++ihpKwKyYewn/XNUbKw==", + "dependencies": { + "tr46": "^5.1.0", + "webidl-conversions": "^7.0.0" + }, + "engines": { + "node": ">=18" + } + }, "node_modules/jsesc": { "version": "3.1.0", "resolved": "https://registry.npmjs.org/jsesc/-/jsesc-3.1.0.tgz", @@ -10052,6 +10358,11 @@ "node": ">=8" } }, + "node_modules/nwsapi": { + "version": "2.2.20", + "resolved": "https://registry.npmjs.org/nwsapi/-/nwsapi-2.2.20.tgz", + "integrity": "sha512-/ieB+mDe4MrrKMT8z+mQL8klXydZWGR5Dowt4RAGKbJ3kIGEx3X4ljUo+6V73IXtUPWgfOlU5B9MlGxFO5T+cA==" + }, "node_modules/oauth": { "version": "0.10.2", "resolved": "https://registry.npmjs.org/oauth/-/oauth-0.10.2.tgz", @@ -10310,6 +10621,17 @@ "url": 
"https://github.com/sponsors/sindresorhus" } }, + "node_modules/parse5": { + "version": "7.3.0", + "resolved": "https://registry.npmjs.org/parse5/-/parse5-7.3.0.tgz", + "integrity": "sha512-IInvU7fabl34qmi9gY8XOVxhYyMyuH2xUNpb2q8/Y+7552KlejkRvqvD19nMoUW/uQGGbqNpA6Tufu5FL5BZgw==", + "dependencies": { + "entities": "^6.0.0" + }, + "funding": { + "url": "https://github.com/inikulin/parse5?sponsor=1" + } + }, "node_modules/parseurl": { "version": "1.3.3", "resolved": "https://registry.npmjs.org/parseurl/-/parseurl-1.3.3.tgz", @@ -10774,7 +11096,6 @@ "version": "2.3.1", "resolved": "https://registry.npmjs.org/punycode/-/punycode-2.3.1.tgz", "integrity": "sha512-vYt7UD1U9Wg6138shLtLOvdAu+8DsC/ilFtEVHcH+wydcSpNE20AfSOduf6MkRFahL5FY7X1oU7nKVZFtfq8Fg==", - "dev": true, "license": "MIT", "engines": { "node": ">=6" @@ -11343,6 +11664,11 @@ "node": ">= 18" } }, + "node_modules/rrweb-cssom": { + "version": "0.8.0", + "resolved": "https://registry.npmjs.org/rrweb-cssom/-/rrweb-cssom-0.8.0.tgz", + "integrity": "sha512-guoltQEx+9aMf2gDZ0s62EcV8lsXR+0w8915TC3ITdn2YueuNjdAYh/levpU9nFaoChh9RUS5ZdQMrKfVEN9tw==" + }, "node_modules/run-parallel": { "version": "1.2.0", "resolved": "https://registry.npmjs.org/run-parallel/-/run-parallel-1.2.0.tgz", @@ -11402,6 +11728,17 @@ "integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==", "license": "MIT" }, + "node_modules/saxes": { + "version": "6.0.0", + "resolved": "https://registry.npmjs.org/saxes/-/saxes-6.0.0.tgz", + "integrity": "sha512-xAg7SOnEhrm5zI3puOOKyy1OMcMlIJZYNJY7xLBwSze0UjhPLnWfj2GF2EpT0jmzaJKIWKHLsaSSajf35bcYnA==", + "dependencies": { + "xmlchars": "^2.2.0" + }, + "engines": { + "node": ">=v12.22.7" + } + }, "node_modules/schema-utils": { "version": "3.3.0", "resolved": "https://registry.npmjs.org/schema-utils/-/schema-utils-3.3.0.tgz", @@ -12162,6 +12499,11 @@ "node": ">=0.10" } }, + "node_modules/symbol-tree": { + "version": "3.2.4", + "resolved": 
"https://registry.npmjs.org/symbol-tree/-/symbol-tree-3.2.4.tgz", + "integrity": "sha512-9QNk5KwDF+Bvz+PyObkmSYjI5ksVUYtjW7AU22r2NKcfLJcXp96hkDWU3+XndOsUb+AQ9QhfzfCT2O+CNWT5Tw==" + }, "node_modules/synckit": { "version": "0.11.8", "resolved": "https://registry.npmjs.org/synckit/-/synckit-0.11.8.tgz", @@ -12432,6 +12774,17 @@ "dev": true, "license": "MIT" }, + "node_modules/tldts": { + "version": "6.1.86", + "resolved": "https://registry.npmjs.org/tldts/-/tldts-6.1.86.tgz", + "integrity": "sha512-WMi/OQ2axVTf/ykqCQgXiIct+mSQDFdH2fkwhPwgEwvJ1kSzZRiinb0zF2Xb8u4+OqPChmyI6MEu4EezNJz+FQ==", + "dependencies": { + "tldts-core": "^6.1.86" + }, + "bin": { + "tldts": "bin/cli.js" + } + }, "node_modules/tldts-core": { "version": "5.7.112", "resolved": "https://registry.npmjs.org/tldts-core/-/tldts-core-5.7.112.tgz", @@ -12445,6 +12798,11 @@ "tldts-core": "^5.7.112" } }, + "node_modules/tldts/node_modules/tldts-core": { + "version": "6.1.86", + "resolved": "https://registry.npmjs.org/tldts-core/-/tldts-core-6.1.86.tgz", + "integrity": "sha512-Je6p7pkk+KMzMv2XXKmAE3McmolOQFdxkKw0R8EYNr7sELW46JqnNeTX8ybPiQgvg1ymCoF8LXs5fzFaZvJPTA==" + }, "node_modules/tmp": { "version": "0.0.33", "resolved": "https://registry.npmjs.org/tmp/-/tmp-0.0.33.tgz", @@ -12504,6 +12862,17 @@ "url": "https://github.com/sponsors/Borewit" } }, + "node_modules/tough-cookie": { + "version": "5.1.2", + "resolved": "https://registry.npmjs.org/tough-cookie/-/tough-cookie-5.1.2.tgz", + "integrity": "sha512-FVDYdxtnj0G6Qm/DhNPSb8Ju59ULcup3tuJxkFb5K8Bv2pUXILbf0xZWU8PX8Ov19OXljbUyveOFwRMwkXzO+A==", + "dependencies": { + "tldts": "^6.1.32" + }, + "engines": { + "node": ">=16" + } + }, "node_modules/tr46": { "version": "0.0.3", "resolved": "https://registry.npmjs.org/tr46/-/tr46-0.0.3.tgz", @@ -12968,6 +13337,17 @@ "node": ">= 0.8" } }, + "node_modules/w3c-xmlserializer": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/w3c-xmlserializer/-/w3c-xmlserializer-5.0.0.tgz", + "integrity": 
"sha512-o8qghlI8NZHU1lLPrpi2+Uq7abh4GGPpYANlalzWxyWteJOCsr/P+oPBA49TOLu5FTZO4d3F9MnWJfiMo4BkmA==", + "dependencies": { + "xml-name-validator": "^5.0.0" + }, + "engines": { + "node": ">=18" + } + }, "node_modules/walker": { "version": "1.0.8", "resolved": "https://registry.npmjs.org/walker/-/walker-1.0.8.tgz", @@ -13220,6 +13600,25 @@ "url": "https://opencollective.com/webpack" } }, + "node_modules/whatwg-encoding": { + "version": "3.1.1", + "resolved": "https://registry.npmjs.org/whatwg-encoding/-/whatwg-encoding-3.1.1.tgz", + "integrity": "sha512-6qN4hJdMwfYBtE3YBTTHhoeuUrDBPZmbQaxWAqSALV/MeEnR5z1xd8UKud2RAkFoPkmB+hli1TZSnyi84xz1vQ==", + "dependencies": { + "iconv-lite": "0.6.3" + }, + "engines": { + "node": ">=18" + } + }, + "node_modules/whatwg-mimetype": { + "version": "4.0.0", + "resolved": "https://registry.npmjs.org/whatwg-mimetype/-/whatwg-mimetype-4.0.0.tgz", + "integrity": "sha512-QaKxh0eNIi2mE9p2vEdzfagOKHCcj1pJ56EEHGQOVxp8r9/iszLUUV7v89x9O1p/T+NlTM5W7jW6+cz4Fq1YVg==", + "engines": { + "node": ">=18" + } + }, "node_modules/whatwg-url": { "version": "5.0.0", "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-5.0.0.tgz", @@ -13382,6 +13781,19 @@ } } }, + "node_modules/xml-name-validator": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/xml-name-validator/-/xml-name-validator-5.0.0.tgz", + "integrity": "sha512-EvGK8EJ3DhaHfbRlETOWAS5pO9MZITeauHKJyb8wyajUfQUenkIg2MvLDTZ4T/TgIcm3HU0TFBgWWboAZ30UHg==", + "engines": { + "node": ">=18" + } + }, + "node_modules/xmlchars": { + "version": "2.2.0", + "resolved": "https://registry.npmjs.org/xmlchars/-/xmlchars-2.2.0.tgz", + "integrity": "sha512-JZnDKK8B0RCDw84FNdDAIpZK+JuJw+s7Lz8nksI7SIuU3UXJJslUthsi+uWBUYOwPFwW7W7PRLRfUKpxjtjFCw==" + }, "node_modules/xtend": { "version": "4.0.2", "resolved": "https://registry.npmjs.org/xtend/-/xtend-4.0.2.tgz", diff --git a/package.json b/package.json index e2bb31b..10c5df6 100644 --- a/package.json +++ b/package.json @@ -27,6 +27,7 @@ "prisma:studio": 
"prisma studio" }, "dependencies": { + "@mozilla/readability": "^0.6.0", "@nestjs/common": "^11.0.1", "@nestjs/config": "^4.0.2", "@nestjs/core": "^11.0.1", @@ -40,6 +41,7 @@ "class-validator": "^0.14.2", "cookie-parser": "^1.4.7", "helmet": "^8.1.0", + "jsdom": "^26.1.0", "passport": "^0.7.0", "passport-google-oauth20": "^2.0.0", "puppeteer-core": "^24.11.2", @@ -62,6 +64,7 @@ "@types/cookie-parser": "^1.4.9", "@types/express": "^5.0.0", "@types/jest": "^29.5.14", + "@types/jsdom": "^21.1.7", "@types/node": "^22.10.7", "@types/passport-google-oauth20": "^2.0.16", "@types/supertest": "^6.0.2", diff --git a/prisma/migrations/20250704113432_add_article_model/migration.sql b/prisma/migrations/20250704113432_add_article_model/migration.sql new file mode 100644 index 0000000..ffc719b --- /dev/null +++ b/prisma/migrations/20250704113432_add_article_model/migration.sql @@ -0,0 +1,40 @@ +-- CreateTable +CREATE TABLE "Article" ( + "id" TEXT NOT NULL, + "url" TEXT NOT NULL, + "finalUrl" TEXT NOT NULL, + "title" TEXT, + "content" TEXT, + "contentType" TEXT, + "summary" TEXT, + "author" TEXT, + "publishedAt" TIMESTAMP(3), + "wordCount" INTEGER, + "readingTime" INTEGER, + "tags" TEXT[], + "isBookmarked" BOOLEAN NOT NULL DEFAULT false, + "isArchived" BOOLEAN NOT NULL DEFAULT false, + "userId" TEXT NOT NULL, + "createdAt" TIMESTAMP(3) NOT NULL DEFAULT CURRENT_TIMESTAMP, + "updatedAt" TIMESTAMP(3) NOT NULL, + + CONSTRAINT "Article_pkey" PRIMARY KEY ("id") +); + +-- CreateIndex +CREATE INDEX "Article_userId_idx" ON "Article"("userId"); + +-- CreateIndex +CREATE INDEX "Article_userId_isBookmarked_idx" ON "Article"("userId", "isBookmarked"); + +-- CreateIndex +CREATE INDEX "Article_userId_isArchived_idx" ON "Article"("userId", "isArchived"); + +-- CreateIndex +CREATE INDEX "Article_createdAt_idx" ON "Article"("createdAt"); + +-- CreateIndex +CREATE UNIQUE INDEX "Article_url_userId_key" ON "Article"("url", "userId"); + +-- AddForeignKey +ALTER TABLE "Article" ADD CONSTRAINT 
"Article_userId_fkey" FOREIGN KEY ("userId") REFERENCES "User"("id") ON DELETE CASCADE ON UPDATE CASCADE; diff --git a/prisma/schema.prisma b/prisma/schema.prisma index d73c8b2..3efc506 100644 --- a/prisma/schema.prisma +++ b/prisma/schema.prisma @@ -24,6 +24,7 @@ model User { updatedAt DateTime @updatedAt refreshTokens RefreshToken[] + articles Article[] @@unique([provider, providerId]) } @@ -42,3 +43,31 @@ model RefreshToken { @@index([userId]) @@index([token]) } + +model Article { + id String @id @default(uuid()) + url String // 원본 URL + finalUrl String // 최종 URL (리디렉션 후) + title String? // 추출된 제목 + content String? // 스크래핑된 콘텐츠 (HTML) + contentType String? // MIME 타입 + summary String? // 요약 (선택적) + author String? // 저자 (선택적) + publishedAt DateTime? // 발행일 (선택적) + wordCount Int? // 단어 수 + readingTime Int? // 예상 읽기 시간 (분) + tags String[] // 태그 배열 + isBookmarked Boolean @default(false) // 북마크 여부 + isArchived Boolean @default(false) // 아카이브 여부 + userId String // 사용자 ID + createdAt DateTime @default(now()) + updatedAt DateTime @updatedAt + + user User @relation(fields: [userId], references: [id], onDelete: Cascade) + + @@unique([url, userId]) // 사용자별 URL 중복 방지 + @@index([userId]) + @@index([userId, isBookmarked]) + @@index([userId, isArchived]) + @@index([createdAt]) +} diff --git a/src/app.module.ts b/src/app.module.ts index 2bc3ee9..9e4bde5 100644 --- a/src/app.module.ts +++ b/src/app.module.ts @@ -12,6 +12,8 @@ import throttlerConfig from 'src/config/throttler.config'; import { ConfigModule, ConfigService } from '@nestjs/config'; import { AuthModule } from 'src/modules/auth/auth.module'; import { DatabaseModule } from 'src/database/database.module'; +import { ScraperModule } from 'src/modules/scraper/scraper.module'; +import { ArticleModule } from 'src/modules/article/article.module'; import { HealthController } from 'src/health.controller'; @Module({ @@ -34,6 +36,8 @@ import { HealthController } from 'src/health.controller'; DatabaseModule, // Modules AuthModule, 
+ ScraperModule, + ArticleModule, ], controllers: [HealthController], providers: [ diff --git a/src/modules/article/article.controller.ts b/src/modules/article/article.controller.ts new file mode 100644 index 0000000..c224d7d --- /dev/null +++ b/src/modules/article/article.controller.ts @@ -0,0 +1,313 @@ +import { + Controller, + Get, + Post, + Put, + Delete, + Body, + Param, + Query, + UseGuards, + Request, + HttpCode, + HttpStatus, +} from '@nestjs/common'; +import { ApiTags, ApiOperation, ApiResponse, ApiBody, ApiQuery, ApiBearerAuth } from '@nestjs/swagger'; +import { JwtAuthGuard } from '../../common/guards/jwt.guard'; +import { ArticleService } from './services/article.service'; +import { CreateArticleInput } from './dto/create-article.input'; +import { UpdateArticleInput } from './dto/update-article.input'; +import { ListArticlesInput } from './dto/list-articles.input'; +import { ArticleOutput } from './dto/article.output'; +import { PaginatedArticlesOutput } from './dto/paginated-articles.output'; +interface AuthenticatedRequest extends Request { + user: { + id: string; + email: string; + }; +} + +@ApiTags('articles') +@Controller('articles') +@UseGuards(JwtAuthGuard) +@ApiBearerAuth() +export class ArticleController { + constructor(private readonly articleService: ArticleService) {} + + /** + * 새로운 Article을 생성합니다. + */ + @Post() + @ApiOperation({ + summary: 'Article 생성', + description: '새로운 Article을 생성합니다.', + }) + @ApiBody({ type: CreateArticleInput }) + @ApiResponse({ + status: 201, + description: 'Article 생성 성공', + type: ArticleOutput, + }) + @ApiResponse({ + status: 400, + description: '잘못된 요청 또는 중복된 URL', + }) + @ApiResponse({ + status: 401, + description: '인증 실패', + }) + async createArticle(@Request() req: any, @Body() input: CreateArticleInput): Promise { + return this.articleService.createArticle(req.user.id, input); + } + + /** + * 사용자의 Article 목록을 조회합니다. 
+ */ + @Get() + @ApiOperation({ + summary: 'Article 목록 조회', + description: '사용자의 Article 목록을 페이지네이션과 함께 조회합니다.', + }) + @ApiQuery({ name: 'page', required: false, type: Number, description: '페이지 번호' }) + @ApiQuery({ name: 'limit', required: false, type: Number, description: '페이지당 항목 수' }) + @ApiQuery({ name: 'search', required: false, type: String, description: '검색 키워드' }) + @ApiQuery({ name: 'tags', required: false, type: [String], description: '태그 필터' }) + @ApiQuery({ name: 'isBookmarked', required: false, type: Boolean, description: '북마크 필터' }) + @ApiQuery({ name: 'isArchived', required: false, type: Boolean, description: '아카이브 필터' }) + @ApiQuery({ name: 'sortBy', required: false, type: String, description: '정렬 기준' }) + @ApiQuery({ name: 'sortOrder', required: false, type: String, description: '정렬 순서' }) + @ApiResponse({ + status: 200, + description: 'Article 목록 조회 성공', + type: PaginatedArticlesOutput, + }) + @ApiResponse({ + status: 401, + description: '인증 실패', + }) + async getArticles(@Request() req: any, @Query() query: ListArticlesInput): Promise { + return this.articleService.getArticles(req.user.id, query); + } + + /** + * 특정 Article을 조회합니다. + */ + @Get(':id') + @ApiOperation({ + summary: 'Article 조회', + description: '특정 Article을 조회합니다.', + }) + @ApiResponse({ + status: 200, + description: 'Article 조회 성공', + type: ArticleOutput, + }) + @ApiResponse({ + status: 401, + description: '인증 실패', + }) + @ApiResponse({ + status: 404, + description: 'Article을 찾을 수 없음', + }) + async getArticle(@Request() req: any, @Param('id') id: string): Promise { + return this.articleService.getArticle(req.user.id, id); + } + + /** + * Article을 업데이트합니다. 
+ */ + @Put(':id') + @ApiOperation({ + summary: 'Article 업데이트', + description: 'Article의 정보를 업데이트합니다.', + }) + @ApiBody({ type: UpdateArticleInput }) + @ApiResponse({ + status: 200, + description: 'Article 업데이트 성공', + type: ArticleOutput, + }) + @ApiResponse({ + status: 401, + description: '인증 실패', + }) + @ApiResponse({ + status: 404, + description: 'Article을 찾을 수 없음', + }) + async updateArticle( + @Request() req: any, + @Param('id') id: string, + @Body() input: UpdateArticleInput, + ): Promise { + return this.articleService.updateArticle(req.user.id, id, input); + } + + /** + * Article을 삭제합니다. + */ + @Delete(':id') + @HttpCode(HttpStatus.NO_CONTENT) + @ApiOperation({ + summary: 'Article 삭제', + description: 'Article을 삭제합니다.', + }) + @ApiResponse({ + status: 204, + description: 'Article 삭제 성공', + }) + @ApiResponse({ + status: 401, + description: '인증 실패', + }) + @ApiResponse({ + status: 404, + description: 'Article을 찾을 수 없음', + }) + async deleteArticle(@Request() req: any, @Param('id') id: string): Promise { + return this.articleService.deleteArticle(req.user.id, id); + } + + /** + * Article 북마크 상태를 토글합니다. + */ + @Post(':id/bookmark') + @ApiOperation({ + summary: 'Article 북마크 토글', + description: 'Article의 북마크 상태를 토글합니다.', + }) + @ApiResponse({ + status: 200, + description: '북마크 상태 변경 성공', + type: ArticleOutput, + }) + @ApiResponse({ + status: 401, + description: '인증 실패', + }) + @ApiResponse({ + status: 404, + description: 'Article을 찾을 수 없음', + }) + async toggleBookmark(@Request() req: any, @Param('id') id: string): Promise { + return this.articleService.toggleBookmark(req.user.id, id); + } + + /** + * Article 아카이브 상태를 토글합니다. 
+ */ + @Post(':id/archive') + @ApiOperation({ + summary: 'Article 아카이브 토글', + description: 'Article의 아카이브 상태를 토글합니다.', + }) + @ApiResponse({ + status: 200, + description: '아카이브 상태 변경 성공', + type: ArticleOutput, + }) + @ApiResponse({ + status: 401, + description: '인증 실패', + }) + @ApiResponse({ + status: 404, + description: 'Article을 찾을 수 없음', + }) + async toggleArchive(@Request() req: any, @Param('id') id: string): Promise { + return this.articleService.toggleArchive(req.user.id, id); + } + + /** + * 사용자의 Article 통계를 조회합니다. + */ + @Get('stats/overview') + @ApiOperation({ + summary: 'Article 통계 조회', + description: '사용자의 Article 통계를 조회합니다.', + }) + @ApiResponse({ + status: 200, + description: '통계 조회 성공', + schema: { + type: 'object', + properties: { + total: { type: 'number', description: '총 Article 수' }, + bookmarked: { type: 'number', description: '북마크된 Article 수' }, + archived: { type: 'number', description: '아카이브된 Article 수' }, + recent: { type: 'number', description: '최근 7일간 추가된 Article 수' }, + }, + }, + }) + @ApiResponse({ + status: 401, + description: '인증 실패', + }) + async getArticleStats(@Request() req: any): Promise<{ + total: number; + bookmarked: number; + archived: number; + recent: number; + }> { + return this.articleService.getArticleStats(req.user.id); + } + + /** + * 사용자의 모든 태그를 조회합니다. + */ + @Get('tags/all') + @ApiOperation({ + summary: '사용자 태그 목록 조회', + description: '사용자가 사용한 모든 태그를 조회합니다.', + }) + @ApiResponse({ + status: 200, + description: '태그 목록 조회 성공', + schema: { + type: 'array', + items: { type: 'string' }, + }, + }) + @ApiResponse({ + status: 401, + description: '인증 실패', + }) + async getUserTags(@Request() req: any): Promise { + return this.articleService.getUserTags(req.user.id); + } + + /** + * URL이 이미 저장되어 있는지 확인합니다. 
+ */ + @Get('check-url') + @ApiOperation({ + summary: 'URL 중복 확인', + description: 'URL이 이미 저장되어 있는지 확인합니다.', + }) + @ApiQuery({ + name: 'url', + required: true, + type: String, + description: '확인할 URL', + }) + @ApiResponse({ + status: 200, + description: 'URL 확인 성공', + schema: { + type: 'object', + properties: { + exists: { type: 'boolean', description: 'URL 존재 여부' }, + url: { type: 'string', description: '확인한 URL' }, + }, + }, + }) + @ApiResponse({ + status: 401, + description: '인증 실패', + }) + async checkUrl(@Request() req: any, @Query('url') url: string): Promise<{ exists: boolean; url: string }> { + const exists = await this.articleService.isUrlAlreadySaved(req.user.id, url); + return { exists, url }; + } +} diff --git a/src/modules/article/article.module.ts b/src/modules/article/article.module.ts new file mode 100644 index 0000000..ad22d3e --- /dev/null +++ b/src/modules/article/article.module.ts @@ -0,0 +1,17 @@ +import { Module } from '@nestjs/common'; +import { DatabaseModule } from '../../database/database.module'; +import { ArticleController } from './article.controller'; +import { ArticleService } from './services/article.service'; +import { ArticleRepository } from './repositories/article.repository'; + +/** + * Article 모듈 + * 스크래핑된 콘텐츠 저장 및 관리 기능을 제공합니다. 
+ */ +@Module({ + imports: [DatabaseModule], + controllers: [ArticleController], + providers: [ArticleService, ArticleRepository], + exports: [ArticleService], +}) +export class ArticleModule {} diff --git a/src/modules/article/dto/article.output.ts b/src/modules/article/dto/article.output.ts new file mode 100644 index 0000000..7339fda --- /dev/null +++ b/src/modules/article/dto/article.output.ts @@ -0,0 +1,109 @@ +import { ApiProperty, ApiPropertyOptional } from '@nestjs/swagger'; + +/** + * Article 출력 DTO + */ +export class ArticleOutput { + @ApiProperty({ + description: 'Article ID', + example: 'uuid-string', + }) + id!: string; + + @ApiProperty({ + description: '원본 URL', + example: 'https://example.com/article', + }) + url!: string; + + @ApiProperty({ + description: '최종 URL (리디렉션 후)', + example: 'https://example.com/article', + }) + finalUrl!: string; + + @ApiPropertyOptional({ + description: '추출된 제목', + example: 'Amazing Article Title', + }) + title?: string; + + @ApiPropertyOptional({ + description: '스크래핑된 콘텐츠 (HTML)', + example: '
Article content...
', + }) + content?: string; + + @ApiPropertyOptional({ + description: 'MIME 타입', + example: 'text/html', + }) + contentType?: string; + + @ApiPropertyOptional({ + description: '요약', + example: 'This article discusses...', + }) + summary?: string; + + @ApiPropertyOptional({ + description: '저자', + example: 'John Doe', + }) + author?: string; + + @ApiPropertyOptional({ + description: '발행일', + example: '2024-01-01T00:00:00Z', + }) + publishedAt?: Date; + + @ApiPropertyOptional({ + description: '단어 수', + example: 1500, + }) + wordCount?: number; + + @ApiPropertyOptional({ + description: '예상 읽기 시간 (분)', + example: 7, + }) + readingTime?: number; + + @ApiPropertyOptional({ + description: '태그 배열', + example: ['tech', 'ai', 'programming'], + type: [String], + }) + tags?: string[]; + + @ApiProperty({ + description: '북마크 여부', + example: false, + }) + isBookmarked!: boolean; + + @ApiProperty({ + description: '아카이브 여부', + example: false, + }) + isArchived!: boolean; + + @ApiProperty({ + description: '사용자 ID', + example: 'user-uuid', + }) + userId!: string; + + @ApiProperty({ + description: '생성일', + example: '2024-01-01T00:00:00Z', + }) + createdAt!: Date; + + @ApiProperty({ + description: '업데이트일', + example: '2024-01-01T00:00:00Z', + }) + updatedAt!: Date; +} diff --git a/src/modules/article/dto/create-article.input.ts b/src/modules/article/dto/create-article.input.ts new file mode 100644 index 0000000..4d58111 --- /dev/null +++ b/src/modules/article/dto/create-article.input.ts @@ -0,0 +1,119 @@ +import { IsNotEmpty, IsOptional, IsString, IsUrl, IsArray, IsBoolean, IsInt, IsDateString, Min } from 'class-validator'; +import { ApiProperty, ApiPropertyOptional } from '@nestjs/swagger'; + +/** + * Article 생성을 위한 입력 DTO + */ +export class CreateArticleInput { + @ApiProperty({ + description: '원본 URL', + example: 'https://example.com/article', + }) + @IsString() + @IsNotEmpty() + @IsUrl({ require_protocol: true }) + url!: string; + + @ApiProperty({ + description: '최종 URL (리디렉션 후)', + 
example: 'https://example.com/article', + }) + @IsString() + @IsNotEmpty() + @IsUrl({ require_protocol: true }) + finalUrl!: string; + + @ApiPropertyOptional({ + description: '추출된 제목', + example: 'Amazing Article Title', + }) + @IsOptional() + @IsString() + title?: string; + + @ApiPropertyOptional({ + description: '스크래핑된 콘텐츠 (HTML)', + example: '
Article content...
', + }) + @IsOptional() + @IsString() + content?: string; + + @ApiPropertyOptional({ + description: 'MIME 타입', + example: 'text/html', + }) + @IsOptional() + @IsString() + contentType?: string; + + @ApiPropertyOptional({ + description: '요약', + example: 'This article discusses...', + }) + @IsOptional() + @IsString() + summary?: string; + + @ApiPropertyOptional({ + description: '저자', + example: 'John Doe', + }) + @IsOptional() + @IsString() + author?: string; + + @ApiPropertyOptional({ + description: '발행일', + example: '2024-01-01T00:00:00Z', + }) + @IsOptional() + @IsDateString() + publishedAt?: string; + + @ApiPropertyOptional({ + description: '단어 수', + example: 1500, + }) + @IsOptional() + @IsInt() + @Min(0) + wordCount?: number; + + @ApiPropertyOptional({ + description: '예상 읽기 시간 (분)', + example: 7, + }) + @IsOptional() + @IsInt() + @Min(0) + readingTime?: number; + + @ApiPropertyOptional({ + description: '태그 배열', + example: ['tech', 'ai', 'programming'], + type: [String], + }) + @IsOptional() + @IsArray() + @IsString({ each: true }) + tags?: string[]; + + @ApiPropertyOptional({ + description: '북마크 여부', + example: false, + default: false, + }) + @IsOptional() + @IsBoolean() + isBookmarked?: boolean; + + @ApiPropertyOptional({ + description: '아카이브 여부', + example: false, + default: false, + }) + @IsOptional() + @IsBoolean() + isArchived?: boolean; +} diff --git a/src/modules/article/dto/list-articles.input.ts b/src/modules/article/dto/list-articles.input.ts new file mode 100644 index 0000000..7e5e7e4 --- /dev/null +++ b/src/modules/article/dto/list-articles.input.ts @@ -0,0 +1,85 @@ +import { IsOptional, IsString, IsInt, IsBoolean, IsArray, IsIn, Min, Max } from 'class-validator'; +import { ApiPropertyOptional } from '@nestjs/swagger'; +import { Type } from 'class-transformer'; + +/** + * Article 목록 조회를 위한 입력 DTO + */ +export class ListArticlesInput { + @ApiPropertyOptional({ + description: '페이지 번호 (1부터 시작)', + example: 1, + default: 1, + }) + @IsOptional() + 
@Type(() => Number) + @IsInt() + @Min(1) + page?: number = 1; + + @ApiPropertyOptional({ + description: '페이지당 항목 수', + example: 20, + default: 20, + }) + @IsOptional() + @Type(() => Number) + @IsInt() + @Min(1) + @Max(100) + limit?: number = 20; + + @ApiPropertyOptional({ + description: '검색 키워드 (제목, 내용에서 검색)', + example: 'javascript', + }) + @IsOptional() + @IsString() + search?: string; + + @ApiPropertyOptional({ + description: '태그 필터', + example: ['tech', 'programming'], + type: [String], + }) + @IsOptional() + @IsArray() + @IsString({ each: true }) + tags?: string[]; + + @ApiPropertyOptional({ + description: '북마크된 항목만 조회', + example: true, + }) + @IsOptional() + @IsBoolean() + isBookmarked?: boolean; + + @ApiPropertyOptional({ + description: '아카이브된 항목만 조회', + example: false, + }) + @IsOptional() + @IsBoolean() + isArchived?: boolean; + + @ApiPropertyOptional({ + description: '정렬 기준', + example: 'createdAt', + enum: ['createdAt', 'updatedAt', 'title', 'publishedAt'], + }) + @IsOptional() + @IsString() + @IsIn(['createdAt', 'updatedAt', 'title', 'publishedAt']) + sortBy?: string = 'createdAt'; + + @ApiPropertyOptional({ + description: '정렬 순서', + example: 'desc', + enum: ['asc', 'desc'], + }) + @IsOptional() + @IsString() + @IsIn(['asc', 'desc']) + sortOrder?: string = 'desc'; +} diff --git a/src/modules/article/dto/paginated-articles.output.ts b/src/modules/article/dto/paginated-articles.output.ts new file mode 100644 index 0000000..7fffabc --- /dev/null +++ b/src/modules/article/dto/paginated-articles.output.ts @@ -0,0 +1,49 @@ +import { ApiProperty } from '@nestjs/swagger'; +import { ArticleOutput } from './article.output'; + +/** + * 페이지네이션된 Article 목록 출력 DTO + */ +export class PaginatedArticlesOutput { + @ApiProperty({ + description: 'Article 목록', + type: [ArticleOutput], + }) + articles!: ArticleOutput[]; + + @ApiProperty({ + description: '총 항목 수', + example: 150, + }) + total!: number; + + @ApiProperty({ + description: '현재 페이지', + example: 1, + }) + page!: 
number; + + @ApiProperty({ + description: '페이지당 항목 수', + example: 20, + }) + limit!: number; + + @ApiProperty({ + description: '총 페이지 수', + example: 8, + }) + totalPages!: number; + + @ApiProperty({ + description: '다음 페이지 존재 여부', + example: true, + }) + hasNext!: boolean; + + @ApiProperty({ + description: '이전 페이지 존재 여부', + example: false, + }) + hasPrev!: boolean; +} diff --git a/src/modules/article/dto/save-scraped-content.input.ts b/src/modules/article/dto/save-scraped-content.input.ts new file mode 100644 index 0000000..b875084 --- /dev/null +++ b/src/modules/article/dto/save-scraped-content.input.ts @@ -0,0 +1,44 @@ +import { IsNotEmpty, IsOptional, IsString, IsUrl, IsArray, IsBoolean } from 'class-validator'; +import { ApiProperty, ApiPropertyOptional } from '@nestjs/swagger'; + +/** + * 스크래핑된 콘텐츠를 저장하기 위한 입력 DTO + */ +export class SaveScrapedContentInput { + @ApiProperty({ + description: '스크래핑할 URL', + example: 'https://example.com/article', + }) + @IsString() + @IsNotEmpty() + @IsUrl({ require_protocol: true }) + url!: string; + + @ApiPropertyOptional({ + description: '태그 배열', + example: ['tech', 'ai', 'programming'], + type: [String], + }) + @IsOptional() + @IsArray() + @IsString({ each: true }) + tags?: string[]; + + @ApiPropertyOptional({ + description: '북마크 여부', + example: false, + default: false, + }) + @IsOptional() + @IsBoolean() + isBookmarked?: boolean; + + @ApiPropertyOptional({ + description: '아카이브 여부', + example: false, + default: false, + }) + @IsOptional() + @IsBoolean() + isArchived?: boolean; +} diff --git a/src/modules/article/dto/update-article.input.ts b/src/modules/article/dto/update-article.input.ts new file mode 100644 index 0000000..cb4f200 --- /dev/null +++ b/src/modules/article/dto/update-article.input.ts @@ -0,0 +1,99 @@ +import { IsOptional, IsString, IsArray, IsBoolean, IsInt, IsDateString, Min } from 'class-validator'; +import { ApiPropertyOptional } from '@nestjs/swagger'; + +/** + * Article 업데이트를 위한 입력 DTO + */ +export class 
UpdateArticleInput { + @ApiPropertyOptional({ + description: '추출된 제목', + example: 'Updated Article Title', + }) + @IsOptional() + @IsString() + title?: string; + + @ApiPropertyOptional({ + description: '스크래핑된 콘텐츠 (HTML)', + example: '
Updated content...
', + }) + @IsOptional() + @IsString() + content?: string; + + @ApiPropertyOptional({ + description: 'MIME 타입', + example: 'text/html', + }) + @IsOptional() + @IsString() + contentType?: string; + + @ApiPropertyOptional({ + description: '요약', + example: 'Updated summary...', + }) + @IsOptional() + @IsString() + summary?: string; + + @ApiPropertyOptional({ + description: '저자', + example: 'Jane Doe', + }) + @IsOptional() + @IsString() + author?: string; + + @ApiPropertyOptional({ + description: '발행일', + example: '2024-01-01T00:00:00Z', + }) + @IsOptional() + @IsDateString() + publishedAt?: string; + + @ApiPropertyOptional({ + description: '단어 수', + example: 1500, + }) + @IsOptional() + @IsInt() + @Min(0) + wordCount?: number; + + @ApiPropertyOptional({ + description: '예상 읽기 시간 (분)', + example: 7, + }) + @IsOptional() + @IsInt() + @Min(0) + readingTime?: number; + + @ApiPropertyOptional({ + description: '태그 배열', + example: ['tech', 'ai', 'programming'], + type: [String], + }) + @IsOptional() + @IsArray() + @IsString({ each: true }) + tags?: string[]; + + @ApiPropertyOptional({ + description: '북마크 여부', + example: true, + }) + @IsOptional() + @IsBoolean() + isBookmarked?: boolean; + + @ApiPropertyOptional({ + description: '아카이브 여부', + example: false, + }) + @IsOptional() + @IsBoolean() + isArchived?: boolean; +} diff --git a/src/modules/article/repositories/article.repository.ts b/src/modules/article/repositories/article.repository.ts new file mode 100644 index 0000000..0437b24 --- /dev/null +++ b/src/modules/article/repositories/article.repository.ts @@ -0,0 +1,314 @@ +import { Injectable, Logger } from '@nestjs/common'; +import { PrismaService } from '../../../database/prisma.service'; +import { Article, Prisma } from '@prisma/client'; +import { CreateArticleInput } from '../dto/create-article.input'; +import { UpdateArticleInput } from '../dto/update-article.input'; +import { ListArticlesInput } from '../dto/list-articles.input'; + +/** + * Article 데이터베이스 작업을 담당하는 
Repository + */ +@Injectable() +export class ArticleRepository { + private readonly logger = new Logger(ArticleRepository.name); + + constructor(private readonly prisma: PrismaService) {} + + /** + * 새로운 Article을 생성합니다. + */ + async createArticle(userId: string, input: CreateArticleInput): Promise
{ + const data: Prisma.ArticleCreateInput = { + url: input.url, + finalUrl: input.finalUrl, + title: input.title, + content: input.content, + contentType: input.contentType, + summary: input.summary, + author: input.author, + publishedAt: input.publishedAt ? new Date(input.publishedAt) : undefined, + wordCount: input.wordCount, + readingTime: input.readingTime, + tags: input.tags || [], + isBookmarked: input.isBookmarked || false, + isArchived: input.isArchived || false, + user: { + connect: { id: userId }, + }, + }; + + return this.prisma.article.create({ + data, + include: { + user: { + select: { + id: true, + email: true, + name: true, + }, + }, + }, + }); + } + + /** + * URL과 사용자 ID로 기존 Article을 찾습니다. + */ + async findByUrlAndUserId(url: string, userId: string): Promise
{ + return this.prisma.article.findUnique({ + where: { + url_userId: { + url, + userId, + }, + }, + }); + } + + /** + * ID로 Article을 찾습니다. + */ + async findById(id: string, userId: string): Promise
{ + return this.prisma.article.findFirst({ + where: { + id, + userId, + }, + include: { + user: { + select: { + id: true, + email: true, + name: true, + }, + }, + }, + }); + } + + /** + * Article을 업데이트합니다. + */ + async updateArticle(id: string, userId: string, input: UpdateArticleInput): Promise
{ + const data: Prisma.ArticleUpdateInput = { + title: input.title, + content: input.content, + contentType: input.contentType, + summary: input.summary, + author: input.author, + publishedAt: input.publishedAt ? new Date(input.publishedAt) : undefined, + wordCount: input.wordCount, + readingTime: input.readingTime, + tags: input.tags, + isBookmarked: input.isBookmarked, + isArchived: input.isArchived, + }; + + // undefined 값들을 제거 + Object.keys(data).forEach((key) => { + if (data[key as keyof typeof data] === undefined) { + delete data[key as keyof typeof data]; + } + }); + + return this.prisma.article.update({ + where: { + id, + userId, + }, + data, + include: { + user: { + select: { + id: true, + email: true, + name: true, + }, + }, + }, + }); + } + + /** + * Article을 삭제합니다. + */ + async deleteArticle(id: string, userId: string): Promise { + try { + await this.prisma.article.delete({ + where: { + id, + userId, + }, + }); + return true; + } catch (error) { + this.logger.warn(`Failed to delete article ${id} for user ${userId}: ${(error as Error).message}`); + return false; + } + } + + /** + * 사용자의 Article 목록을 조회합니다. 
+ */ + async findArticlesByUserId( + userId: string, + input: ListArticlesInput, + ): Promise<{ + articles: Article[]; + total: number; + }> { + const { + page = 1, + limit = 20, + search, + tags, + isBookmarked, + isArchived, + sortBy = 'createdAt', + sortOrder = 'desc', + } = input; + + const skip = (page - 1) * limit; + + // 검색 조건 구성 + const where: Prisma.ArticleWhereInput = { + userId, + ...(search && { + OR: [ + { title: { contains: search, mode: 'insensitive' } }, + { content: { contains: search, mode: 'insensitive' } }, + { summary: { contains: search, mode: 'insensitive' } }, + ], + }), + ...(tags && + tags.length > 0 && { + tags: { + hasSome: tags, + }, + }), + ...(isBookmarked !== undefined && { isBookmarked }), + ...(isArchived !== undefined && { isArchived }), + }; + + // 정렬 조건 구성 + const orderBy: Prisma.ArticleOrderByWithRelationInput = { + [sortBy]: sortOrder, + }; + + const [articles, total] = await Promise.all([ + this.prisma.article.findMany({ + where, + orderBy, + skip, + take: limit, + include: { + user: { + select: { + id: true, + email: true, + name: true, + }, + }, + }, + }), + this.prisma.article.count({ where }), + ]); + + return { articles, total }; + } + + /** + * 사용자의 Article 통계를 조회합니다. + */ + async getArticleStats(userId: string): Promise<{ + total: number; + bookmarked: number; + archived: number; + recent: number; + }> { + const [total, bookmarked, archived, recent] = await Promise.all([ + this.prisma.article.count({ where: { userId } }), + this.prisma.article.count({ where: { userId, isBookmarked: true } }), + this.prisma.article.count({ where: { userId, isArchived: true } }), + this.prisma.article.count({ + where: { + userId, + createdAt: { + gte: new Date(Date.now() - 7 * 24 * 60 * 60 * 1000), // 최근 7일 + }, + }, + }), + ]); + + return { total, bookmarked, archived, recent }; + } + + /** + * 사용자의 모든 태그를 조회합니다. 
+ */ + async getUserTags(userId: string): Promise { + const articles = await this.prisma.article.findMany({ + where: { userId }, + select: { tags: true }, + }); + + const allTags = articles.flatMap((article) => article.tags); + return Array.from(new Set(allTags)).sort(); + } + + /** + * URL과 사용자 ID로 기존 Article을 업데이트하거나 새로 생성합니다. + */ + async upsertArticle(userId: string, input: CreateArticleInput): Promise
{ + const data: Prisma.ArticleCreateInput = { + url: input.url, + finalUrl: input.finalUrl, + title: input.title, + content: input.content, + contentType: input.contentType, + summary: input.summary, + author: input.author, + publishedAt: input.publishedAt ? new Date(input.publishedAt) : undefined, + wordCount: input.wordCount, + readingTime: input.readingTime, + tags: input.tags || [], + isBookmarked: input.isBookmarked || false, + isArchived: input.isArchived || false, + user: { + connect: { id: userId }, + }, + }; + + return this.prisma.article.upsert({ + where: { + url_userId: { + url: input.url, + userId, + }, + }, + create: data, + update: { + finalUrl: input.finalUrl, + title: input.title, + content: input.content, + contentType: input.contentType, + summary: input.summary, + author: input.author, + publishedAt: input.publishedAt ? new Date(input.publishedAt) : undefined, + wordCount: input.wordCount, + readingTime: input.readingTime, + tags: input.tags || [], + // 북마크와 아카이브 상태는 업데이트하지 않음 (사용자가 설정한 값 유지) + }, + include: { + user: { + select: { + id: true, + email: true, + name: true, + }, + }, + }, + }); + } +} diff --git a/src/modules/article/services/article.service.ts b/src/modules/article/services/article.service.ts new file mode 100644 index 0000000..386e306 --- /dev/null +++ b/src/modules/article/services/article.service.ts @@ -0,0 +1,236 @@ +import { Injectable, Logger, NotFoundException, BadRequestException } from '@nestjs/common'; +import { ArticleRepository } from '../repositories/article.repository'; +import { CreateArticleInput } from '../dto/create-article.input'; +import { UpdateArticleInput } from '../dto/update-article.input'; +import { ListArticlesInput } from '../dto/list-articles.input'; +import { ArticleOutput } from '../dto/article.output'; +import { PaginatedArticlesOutput } from '../dto/paginated-articles.output'; +import { SaveScrapedContentInput } from '../dto/save-scraped-content.input'; +import { ScrapedContentOutput } from 
'../../scraper/dto/scraped-content.output'; + +/** + * Article 비즈니스 로직을 담당하는 Service + */ +@Injectable() +export class ArticleService { + private readonly logger = new Logger(ArticleService.name); + + constructor(private readonly articleRepository: ArticleRepository) {} + + /** + * 새로운 Article을 생성합니다. + */ + async createArticle(userId: string, input: CreateArticleInput): Promise { + try { + // 중복 체크 + const existing = await this.articleRepository.findByUrlAndUserId(input.url, userId); + if (existing) { + throw new BadRequestException('이미 저장된 URL입니다.'); + } + + const article = await this.articleRepository.createArticle(userId, input); + return this.mapToOutput(article); + } catch (error) { + this.logger.error(`Failed to create article for user ${userId}: ${(error as Error).message}`); + throw error; + } + } + + /** + * 스크래핑된 콘텐츠를 저장합니다. + */ + async saveScrapedContent( + userId: string, + scrapedContent: ScrapedContentOutput, + options: Omit = {}, + ): Promise { + try { + // 단어 수와 읽기 시간 계산 + const wordCount = this.calculateWordCount(scrapedContent.content); + const readingTime = this.calculateReadingTime(wordCount); + + const articleData: CreateArticleInput = { + url: scrapedContent.finalUrl, // 원본 URL 대신 최종 URL 사용 + finalUrl: scrapedContent.finalUrl, + title: scrapedContent.title, + content: scrapedContent.content, + contentType: scrapedContent.contentType, + wordCount, + readingTime, + tags: options.tags || [], + isBookmarked: options.isBookmarked || false, + isArchived: options.isArchived || false, + }; + + // upsert를 사용하여 중복 처리 + const article = await this.articleRepository.upsertArticle(userId, articleData); + return this.mapToOutput(article); + } catch (error) { + this.logger.error(`Failed to save scraped content for user ${userId}: ${(error as Error).message}`); + throw error; + } + } + + /** + * Article을 조회합니다. 
+ */ + async getArticle(userId: string, articleId: string): Promise { + const article = await this.articleRepository.findById(articleId, userId); + if (!article) { + throw new NotFoundException('Article을 찾을 수 없습니다.'); + } + return this.mapToOutput(article); + } + + /** + * Article을 업데이트합니다. + */ + async updateArticle(userId: string, articleId: string, input: UpdateArticleInput): Promise { + const article = await this.articleRepository.updateArticle(articleId, userId, input); + if (!article) { + throw new NotFoundException('Article을 찾을 수 없습니다.'); + } + return this.mapToOutput(article); + } + + /** + * Article을 삭제합니다. + */ + async deleteArticle(userId: string, articleId: string): Promise { + const success = await this.articleRepository.deleteArticle(articleId, userId); + if (!success) { + throw new NotFoundException('Article을 찾을 수 없습니다.'); + } + } + + /** + * 사용자의 Article 목록을 조회합니다. + */ + async getArticles(userId: string, input: ListArticlesInput): Promise { + const { articles, total } = await this.articleRepository.findArticlesByUserId(userId, input); + + const page = input.page || 1; + const limit = input.limit || 20; + const totalPages = Math.ceil(total / limit); + + return { + articles: articles.map((article) => this.mapToOutput(article)), + total, + page, + limit, + totalPages, + hasNext: page < totalPages, + hasPrev: page > 1, + }; + } + + /** + * 사용자의 Article 통계를 조회합니다. + */ + async getArticleStats(userId: string): Promise<{ + total: number; + bookmarked: number; + archived: number; + recent: number; + }> { + return this.articleRepository.getArticleStats(userId); + } + + /** + * 사용자의 모든 태그를 조회합니다. + */ + async getUserTags(userId: string): Promise { + return this.articleRepository.getUserTags(userId); + } + + /** + * Article 북마크 상태를 토글합니다. 
+ */ + async toggleBookmark(userId: string, articleId: string): Promise { + const article = await this.articleRepository.findById(articleId, userId); + if (!article) { + throw new NotFoundException('Article을 찾을 수 없습니다.'); + } + + const updated = await this.articleRepository.updateArticle(articleId, userId, { + isBookmarked: !article.isBookmarked, + }); + + return this.mapToOutput(updated!); + } + + /** + * Article 아카이브 상태를 토글합니다. + */ + async toggleArchive(userId: string, articleId: string): Promise { + const article = await this.articleRepository.findById(articleId, userId); + if (!article) { + throw new NotFoundException('Article을 찾을 수 없습니다.'); + } + + const updated = await this.articleRepository.updateArticle(articleId, userId, { + isArchived: !article.isArchived, + }); + + return this.mapToOutput(updated!); + } + + /** + * URL이 이미 저장되어 있는지 확인합니다. + */ + async isUrlAlreadySaved(userId: string, url: string): Promise { + const article = await this.articleRepository.findByUrlAndUserId(url, userId); + return !!article; + } + + // ==================== PRIVATE HELPERS ==================== + + /** + * Article 엔티티를 ArticleOutput DTO로 변환합니다. + */ + private mapToOutput(article: any): ArticleOutput { + return { + id: article.id, + url: article.url, + finalUrl: article.finalUrl, + title: article.title, + content: article.content, + contentType: article.contentType, + summary: article.summary, + author: article.author, + publishedAt: article.publishedAt, + wordCount: article.wordCount, + readingTime: article.readingTime, + tags: article.tags, + isBookmarked: article.isBookmarked, + isArchived: article.isArchived, + userId: article.userId, + createdAt: article.createdAt, + updatedAt: article.updatedAt, + }; + } + + /** + * 콘텐츠의 단어 수를 계산합니다. 
+ */ + private calculateWordCount(content?: string): number { + if (!content) return 0; + + // HTML 태그 제거 + const textContent = content.replace(/<[^>]*>/g, ''); + + // 단어 수 계산 (공백 기준) + const words = textContent.trim().split(/\s+/); + return words.length > 0 && words[0] !== '' ? words.length : 0; + } + + /** + * 예상 읽기 시간을 계산합니다 (분 단위). + */ + private calculateReadingTime(wordCount: number): number { + // 평균 읽기 속도: 200-250 단어/분 + const wordsPerMinute = 225; + const minutes = Math.ceil(wordCount / wordsPerMinute); + return Math.max(1, minutes); // 최소 1분 + } +} From 72fb7a78c46a1b5d3cfece902cef41c7418de943 Mon Sep 17 00:00:00 2001 From: reach0908 Date: Fri, 4 Jul 2025 20:47:40 +0900 Subject: [PATCH 04/28] =?UTF-8?q?feat(pre-handler):=20=EC=BD=98=ED=85=90?= =?UTF-8?q?=EC=B8=A0=20=EC=A0=84=EC=B2=98=EB=A6=AC=20=EB=AA=A8=EB=93=88=20?= =?UTF-8?q?=EB=B0=8F=20=ED=95=B8=EB=93=A4=EB=9F=AC=20=EC=B6=94=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * 콘텐츠 전처리를 위한 PreHandlerModule 및 PreHandlerService를 추가했습니다. * 다양한 콘텐츠 유형을 처리하기 위한 핸들러(PdfHandler, RssHandler, YoutubeHandler, SocialMediaHandler, NewsSiteHandler, DomainSpecificHandler, ReadabilityHandler)를 구현했습니다. * 각 핸들러는 특정 URL 패턴을 인식하고, 콘텐츠를 적절히 변환하여 처리할 수 있도록 설계되었습니다. * PreHandleResult DTO를 통해 핸들러의 결과를 구조화하여 반환합니다. * 모듈의 확장성을 고려하여 새로운 핸들러 추가가 용이하도록 설계되었습니다. 
--- .../pre-handler/dto/pre-handle-result.dto.ts | 28 ++ .../handlers/domain-specific.handler.ts | 363 ++++++++++++++++++ .../pre-handler/handlers/news-site.handler.ts | 317 +++++++++++++++ .../pre-handler/handlers/pdf.handler.ts | 66 ++++ .../handlers/readability.handler.ts | 56 +++ .../pre-handler/handlers/rss.handler.ts | 91 +++++ .../handlers/social-media.handler.ts | 225 +++++++++++ .../pre-handler/handlers/youtube.handler.ts | 136 +++++++ .../interfaces/content-handler.interface.ts | 27 ++ src/modules/pre-handler/pre-handler.module.ts | 59 +++ .../pre-handler/pre-handler.service.ts | 59 +++ 11 files changed, 1427 insertions(+) create mode 100644 src/modules/pre-handler/dto/pre-handle-result.dto.ts create mode 100644 src/modules/pre-handler/handlers/domain-specific.handler.ts create mode 100644 src/modules/pre-handler/handlers/news-site.handler.ts create mode 100644 src/modules/pre-handler/handlers/pdf.handler.ts create mode 100644 src/modules/pre-handler/handlers/readability.handler.ts create mode 100644 src/modules/pre-handler/handlers/rss.handler.ts create mode 100644 src/modules/pre-handler/handlers/social-media.handler.ts create mode 100644 src/modules/pre-handler/handlers/youtube.handler.ts create mode 100644 src/modules/pre-handler/interfaces/content-handler.interface.ts create mode 100644 src/modules/pre-handler/pre-handler.module.ts create mode 100644 src/modules/pre-handler/pre-handler.service.ts diff --git a/src/modules/pre-handler/dto/pre-handle-result.dto.ts b/src/modules/pre-handler/dto/pre-handle-result.dto.ts new file mode 100644 index 0000000..48cc2c5 --- /dev/null +++ b/src/modules/pre-handler/dto/pre-handle-result.dto.ts @@ -0,0 +1,28 @@ +/** + * DTO for the result of a pre-handling process. + * It encapsulates the data extracted by a content handler. + */ +export class PreHandleResult { + /** + * The final URL after potential redirects or modifications by a handler. 
+ */ + url: string; + + /** + * The extracted title of the content, if available. + * @optional + */ + title?: string; + + /** + * The extracted main content, typically in HTML format. + * @optional + */ + content?: string; + + /** + * The MIME type of the content (e.g., 'text/html', 'application/pdf'). + * @optional + */ + contentType?: string; +} diff --git a/src/modules/pre-handler/handlers/domain-specific.handler.ts b/src/modules/pre-handler/handlers/domain-specific.handler.ts new file mode 100644 index 0000000..19b6206 --- /dev/null +++ b/src/modules/pre-handler/handlers/domain-specific.handler.ts @@ -0,0 +1,363 @@ +import { Injectable, Logger } from '@nestjs/common'; +import { IContentHandler } from '../interfaces/content-handler.interface'; +import { PreHandleResult } from '../dto/pre-handle-result.dto'; + +/** + * A map of domain names to their URL transformation functions. + * This allows for easy extension to support new domains. + * Note: Social media domains are handled by SocialMediaHandler, + * and news sites are handled by NewsSiteHandler. + */ +const DOMAIN_TRANSFORMATIONS: Record URL> = { + // Publishing platforms + 'substack.com': (url) => { + // Substack provides a clean AMP version by setting the search query. + const newUrl = new URL(url.href); + newUrl.search = '?format=amp'; + return newUrl; + }, + 'medium.com': (url) => { + // Use a proxy/reader service to bypass paywalls and pop-ups, + // mirroring the approach used by Omnivore for enhanced compatibility. 
+ return new URL(`https://r.jina.ai/${url.href}`); + }, + + // Developer platforms (non-social aspects) + 'github.com': (url) => { + // GitHub: For markdown files, get the raw content + if (url.pathname.includes('/blob/') && url.pathname.endsWith('.md')) { + return new URL(url.href.replace('/blob/', '/raw/')); + } + return url; + }, + 'gitlab.com': (url) => { + // GitLab: For markdown files, get the raw content + if (url.pathname.includes('/blob/') && url.pathname.endsWith('.md')) { + return new URL(url.href.replace('/blob/', '/raw/')); + } + return url; + }, + + // Knowledge platforms + 'wikipedia.org': (url) => { + // Wikipedia: Use mobile version for cleaner layout + const newUrl = new URL(url.href); + newUrl.hostname = newUrl.hostname.replace('en.wikipedia.org', 'm.wikipedia.org'); + return newUrl; + }, + 'stackoverflow.com': (url) => { + // Stack Overflow: Keep original, it's usually accessible + return url; + }, + + // Other platforms + 'notion.so': (url) => { + // Notion: Keep original, usually accessible + return url; + }, + 'hackernews.com': (url) => { + // Hacker News: Keep original + return url; + }, + 'news.ycombinator.com': (url) => { + // Y Combinator Hacker News: Keep original + return url; + }, + 'patreon.com': (url) => { + // Patreon: Keep original for posts + return url; + }, + 'ko-fi.com': (url) => { + // Ko-fi: Keep original + return url; + }, + 'buymeacoffee.com': (url) => { + // Buy Me a Coffee: Keep original + return url; + }, + 'gumroad.com': (url) => { + // Gumroad: Keep original + return url; + }, + 'itch.io': (url) => { + // Itch.io: Keep original + return url; + }, + 'deviantart.com': (url) => { + // DeviantArt: Keep original + return url; + }, + 'artstation.com': (url) => { + // ArtStation: Keep original + return url; + }, + 'behance.net': (url) => { + // Behance: Keep original + return url; + }, + 'dribbble.com': (url) => { + // Dribbble: Keep original + return url; + }, + 'figma.com': (url) => { + // Figma: Keep original for public 
files + return url; + }, + 'canva.com': (url) => { + // Canva: Keep original for public designs + return url; + }, + 'unsplash.com': (url) => { + // Unsplash: Keep original + return url; + }, + 'pexels.com': (url) => { + // Pexels: Keep original + return url; + }, + 'pixabay.com': (url) => { + // Pixabay: Keep original + return url; + }, + 'shutterstock.com': (url) => { + // Shutterstock: Keep original + return url; + }, + 'gettyimages.com': (url) => { + // Getty Images: Keep original + return url; + }, + 'imgur.com': (url) => { + // Imgur: Keep original + return url; + }, + 'flickr.com': (url) => { + // Flickr: Keep original + return url; + }, + 'photobucket.com': (url) => { + // Photobucket: Keep original + return url; + }, + 'dropbox.com': (url) => { + // Dropbox: For shared files, keep original + return url; + }, + 'drive.google.com': (url) => { + // Google Drive: For shared files, keep original + return url; + }, + 'onedrive.live.com': (url) => { + // OneDrive: For shared files, keep original + return url; + }, + 'box.com': (url) => { + // Box: For shared files, keep original + return url; + }, + 'wetransfer.com': (url) => { + // WeTransfer: Keep original + return url; + }, + 'sendspace.com': (url) => { + // SendSpace: Keep original + return url; + }, + 'mediafire.com': (url) => { + // MediaFire: Keep original + return url; + }, + 'mega.nz': (url) => { + // Mega: Keep original + return url; + }, + 'archive.org': (url) => { + // Internet Archive: Keep original + return url; + }, + 'web.archive.org': (url) => { + // Wayback Machine: Keep original + return url; + }, + 'scholar.google.com': (url) => { + // Google Scholar: Keep original + return url; + }, + 'researchgate.net': (url) => { + // ResearchGate: Keep original + return url; + }, + 'academia.edu': (url) => { + // Academia.edu: Keep original + return url; + }, + 'jstor.org': (url) => { + // JSTOR: Keep original + return url; + }, + 'pubmed.ncbi.nlm.nih.gov': (url) => { + // PubMed: Keep original + return 
url; + }, + 'arxiv.org': (url) => { + // arXiv: Keep original + return url; + }, + 'biorxiv.org': (url) => { + // bioRxiv: Keep original + return url; + }, + 'medrxiv.org': (url) => { + // medRxiv: Keep original + return url; + }, + 'ssrn.com': (url) => { + // SSRN: Keep original + return url; + }, + 'doi.org': (url) => { + // DOI: Keep original + return url; + }, + 'orcid.org': (url) => { + // ORCID: Keep original + return url; + }, + 'goodreads.com': (url) => { + // Goodreads: Keep original + return url; + }, + 'bookdepository.com': (url) => { + // Book Depository: Keep original + return url; + }, + 'amazon.com': (url) => { + // Amazon: For book/product pages, keep original + return url; + }, + 'amazon.co.uk': (url) => { + // Amazon UK: Keep original + return url; + }, + 'amazon.de': (url) => { + // Amazon Germany: Keep original + return url; + }, + 'amazon.fr': (url) => { + // Amazon France: Keep original + return url; + }, + 'amazon.es': (url) => { + // Amazon Spain: Keep original + return url; + }, + 'amazon.it': (url) => { + // Amazon Italy: Keep original + return url; + }, + 'amazon.ca': (url) => { + // Amazon Canada: Keep original + return url; + }, + 'amazon.com.au': (url) => { + // Amazon Australia: Keep original + return url; + }, + 'amazon.co.jp': (url) => { + // Amazon Japan: Keep original + return url; + }, + 'ebay.com': (url) => { + // eBay: Keep original + return url; + }, + 'etsy.com': (url) => { + // Etsy: Keep original + return url; + }, + 'aliexpress.com': (url) => { + // AliExpress: Keep original + return url; + }, + 'alibaba.com': (url) => { + // Alibaba: Keep original + return url; + }, + 'shopify.com': (url) => { + // Shopify stores: Keep original + return url; + }, + 'squarespace.com': (url) => { + // Squarespace sites: Keep original + return url; + }, + 'wix.com': (url) => { + // Wix sites: Keep original + return url; + }, + 'wordpress.com': (url) => { + // WordPress.com sites: Keep original + return url; + }, + 'blogger.com': (url) => { + 
// Blogger: Keep original + return url; + }, + 'blogspot.com': (url) => { + // Blogspot: Keep original + return url; + }, + 'tumblr.com': (url) => { + // Tumblr: Keep original + return url; + }, + 'ghost.org': (url) => { + // Ghost blogs: Keep original + return url; + }, +}; + +/** + * A content handler that transforms URLs for specific domains to improve content extraction. + * This handler focuses on general domain transformations, excluding social media and news sites + * which are handled by specialized handlers. + */ +@Injectable() +export class DomainSpecificHandler implements IContentHandler { + private readonly logger = new Logger(DomainSpecificHandler.name); + + /** + * Determines if the handler can process the content from the given URL. + * @param url - The URL to be checked. + * @returns `true` if the handler can process the URL, `false` otherwise. + */ + public canHandle(url: URL): boolean { + return Object.keys(DOMAIN_TRANSFORMATIONS).some((domain) => url.hostname.endsWith(domain)); + } + + /** + * Processes the content from the URL by transforming it to a more accessible version. + * @param url - The URL of the content to handle. + * @returns A `PreHandleResult` with the new URL, or `null` on failure. 
+ */ + public handle(url: URL): Promise { + const domain = Object.keys(DOMAIN_TRANSFORMATIONS).find((d) => url.hostname.endsWith(d)); + + if (!domain) { + return Promise.resolve(null); + } + + try { + const transform = DOMAIN_TRANSFORMATIONS[domain]; + const newUrl = transform(url); + this.logger.debug(`Transformed [${domain}] URL to: ${newUrl.href}`); + + return Promise.resolve({ + url: newUrl.href, + }); + } catch (error) { + this.logger.warn(`DomainSpecificHandler failed for ${url.href}: ${(error as Error).message}`); + return Promise.resolve(null); + } + } +} diff --git a/src/modules/pre-handler/handlers/news-site.handler.ts b/src/modules/pre-handler/handlers/news-site.handler.ts new file mode 100644 index 0000000..5b1052c --- /dev/null +++ b/src/modules/pre-handler/handlers/news-site.handler.ts @@ -0,0 +1,317 @@ +import { Injectable, Logger } from '@nestjs/common'; +import { IContentHandler } from '../interfaces/content-handler.interface'; +import { PreHandleResult } from '../dto/pre-handle-result.dto'; + +/** + * News site transformations. + * Each news site has specific URL patterns and optimal access methods. 
+ */ +const NEWS_SITE_TRANSFORMATIONS: Record URL> = { + 'nytimes.com': (url) => { + // New York Times: Use print version to bypass paywall + const newUrl = new URL(url.href); + newUrl.searchParams.set('print', '1'); + return newUrl; + }, + 'wsj.com': (url) => { + // Wall Street Journal: Use print version + const newUrl = new URL(url.href); + newUrl.searchParams.set('print', '1'); + return newUrl; + }, + 'washingtonpost.com': (url) => { + // Washington Post: Use print version + const newUrl = new URL(url.href); + newUrl.searchParams.set('print', '1'); + return newUrl; + }, + 'ft.com': (url) => { + // Financial Times: Use print version + const newUrl = new URL(url.href); + newUrl.searchParams.set('print', '1'); + return newUrl; + }, + 'bloomberg.com': (url) => { + // Bloomberg: Use print version + const newUrl = new URL(url.href); + newUrl.searchParams.set('print', '1'); + return newUrl; + }, + 'economist.com': (url) => { + // The Economist: Use print version + const newUrl = new URL(url.href); + newUrl.searchParams.set('print', '1'); + return newUrl; + }, + 'cnn.com': (url) => { + // CNN: Use mobile version for cleaner layout + const newUrl = new URL(url.href); + newUrl.hostname = 'lite.cnn.com'; + return newUrl; + }, + 'bbc.com': (url) => { + // BBC: Use mobile version + const newUrl = new URL(url.href); + newUrl.hostname = 'm.bbc.com'; + return newUrl; + }, + 'bbc.co.uk': (url) => { + // BBC UK: Use mobile version + const newUrl = new URL(url.href); + newUrl.hostname = 'm.bbc.co.uk'; + return newUrl; + }, + 'reuters.com': (url) => { + // Reuters: Keep original, usually accessible + return url; + }, + 'apnews.com': (url) => { + // Associated Press: Keep original, usually accessible + return url; + }, + 'theguardian.com': (url) => { + // The Guardian: Keep original, no paywall + return url; + }, + 'npr.org': (url) => { + // NPR: Keep original, usually accessible + return url; + }, + 'politico.com': (url) => { + // Politico: Use print version for better readability 
+ const newUrl = new URL(url.href); + newUrl.searchParams.set('print', '1'); + return newUrl; + }, + 'axios.com': (url) => { + // Axios: Keep original, usually accessible + return url; + }, + 'vox.com': (url) => { + // Vox: Keep original, usually accessible + return url; + }, + 'buzzfeed.com': (url) => { + // BuzzFeed: Keep original + return url; + }, + 'huffpost.com': (url) => { + // HuffPost: Keep original + return url; + }, + 'usatoday.com': (url) => { + // USA Today: Use print version + const newUrl = new URL(url.href); + newUrl.searchParams.set('print', '1'); + return newUrl; + }, + 'latimes.com': (url) => { + // LA Times: Use print version + const newUrl = new URL(url.href); + newUrl.searchParams.set('print', '1'); + return newUrl; + }, + 'chicagotribune.com': (url) => { + // Chicago Tribune: Use print version + const newUrl = new URL(url.href); + newUrl.searchParams.set('print', '1'); + return newUrl; + }, + 'time.com': (url) => { + // Time Magazine: Keep original + return url; + }, + 'newsweek.com': (url) => { + // Newsweek: Keep original + return url; + }, + 'theatlantic.com': (url) => { + // The Atlantic: Use print version + const newUrl = new URL(url.href); + newUrl.searchParams.set('print', '1'); + return newUrl; + }, + 'newyorker.com': (url) => { + // The New Yorker: Use print version + const newUrl = new URL(url.href); + newUrl.searchParams.set('print', '1'); + return newUrl; + }, + 'forbes.com': (url) => { + // Forbes: Keep original but remove tracking + const newUrl = new URL(url.href); + newUrl.searchParams.delete('sh'); + return newUrl; + }, + 'techcrunch.com': (url) => { + // TechCrunch: Keep original + return url; + }, + 'engadget.com': (url) => { + // Engadget: Keep original + return url; + }, + 'theverge.com': (url) => { + // The Verge: Keep original + return url; + }, + 'wired.com': (url) => { + // Wired: Use print version + const newUrl = new URL(url.href); + newUrl.searchParams.set('print', '1'); + return newUrl; + }, + 'arstechnica.com': 
(url) => { + // Ars Technica: Keep original, usually accessible + return url; + }, + 'espn.com': (url) => { + // ESPN: Use mobile version + const newUrl = new URL(url.href); + newUrl.hostname = 'm.espn.com'; + return newUrl; + }, + 'cbssports.com': (url) => { + // CBS Sports: Use mobile version + const newUrl = new URL(url.href); + newUrl.hostname = 'm.cbssports.com'; + return newUrl; + }, + 'nfl.com': (url) => { + // NFL: Use mobile version + const newUrl = new URL(url.href); + newUrl.hostname = 'm.nfl.com'; + return newUrl; + }, + 'nba.com': (url) => { + // NBA: Use mobile version + const newUrl = new URL(url.href); + newUrl.hostname = 'm.nba.com'; + return newUrl; + }, +}; + +/** + * A content handler specifically for news websites. + * This handler detects news site URLs and transforms them to more + * accessible versions, often bypassing paywalls or using cleaner layouts. + */ +@Injectable() +export class NewsSiteHandler implements IContentHandler { + private readonly logger = new Logger(NewsSiteHandler.name); + + /** + * Checks if the URL is from a supported news website. + * @param url - The URL to check. + * @returns `true` if the URL is from a supported news website. + */ + public canHandle(url: URL): boolean { + return Object.keys(NEWS_SITE_TRANSFORMATIONS).some((domain) => url.hostname.endsWith(domain)); + } + + /** + * Processes news site URLs by transforming them to more accessible versions. + * @param url - The news site URL to handle. + * @returns A `PreHandleResult` with the transformed URL, or `null` on failure. 
+ */ + public handle(url: URL): Promise { + const domain = Object.keys(NEWS_SITE_TRANSFORMATIONS).find((d) => url.hostname.endsWith(d)); + + if (!domain) { + return Promise.resolve(null); + } + + try { + const transform = NEWS_SITE_TRANSFORMATIONS[domain]; + const newUrl = transform(url); + + this.logger.debug(`Transformed news site URL [${domain}]: ${url.href} -> ${newUrl.href}`); + + // Extract potential title from URL + let title: string | undefined; + const siteName = this.getSiteName(domain); + + // Try to extract article title from URL path + const pathParts = url.pathname.split('/').filter((part) => part.length > 0); + if (pathParts.length > 0) { + // Look for article identifiers in the path + const lastPart = pathParts[pathParts.length - 1]; + if (lastPart.includes('-') || lastPart.includes('_')) { + // Convert URL slug to title + title = lastPart + .replace(/[-_]/g, ' ') + .replace(/\.(html|htm|php|asp|aspx)$/i, '') + .replace(/\b\w/g, (l) => l.toUpperCase()) + .trim(); + + if (title.length > 60) { + title = title.substring(0, 60) + '...'; + } + + title = `${siteName}: ${title}`; + } + } + + // Fallback title + if (!title) { + title = `${siteName} Article`; + } + + return Promise.resolve({ + url: newUrl.href, + title, + contentType: 'text/html', + }); + } catch (error) { + this.logger.warn(`NewsSiteHandler failed for ${url.href}: ${(error as Error).message}`); + return Promise.resolve(null); + } + } + + /** + * Gets a human-readable site name from domain. + * @param domain - The domain name. + * @returns The site name. 
+ */ + private getSiteName(domain: string): string { + const siteNames: Record = { + 'nytimes.com': 'New York Times', + 'wsj.com': 'Wall Street Journal', + 'washingtonpost.com': 'Washington Post', + 'ft.com': 'Financial Times', + 'bloomberg.com': 'Bloomberg', + 'economist.com': 'The Economist', + 'cnn.com': 'CNN', + 'bbc.com': 'BBC', + 'bbc.co.uk': 'BBC', + 'reuters.com': 'Reuters', + 'apnews.com': 'Associated Press', + 'theguardian.com': 'The Guardian', + 'npr.org': 'NPR', + 'politico.com': 'Politico', + 'axios.com': 'Axios', + 'vox.com': 'Vox', + 'buzzfeed.com': 'BuzzFeed', + 'huffpost.com': 'HuffPost', + 'usatoday.com': 'USA Today', + 'latimes.com': 'LA Times', + 'chicagotribune.com': 'Chicago Tribune', + 'time.com': 'Time', + 'newsweek.com': 'Newsweek', + 'theatlantic.com': 'The Atlantic', + 'newyorker.com': 'The New Yorker', + 'forbes.com': 'Forbes', + 'techcrunch.com': 'TechCrunch', + 'engadget.com': 'Engadget', + 'theverge.com': 'The Verge', + 'wired.com': 'Wired', + 'arstechnica.com': 'Ars Technica', + 'espn.com': 'ESPN', + 'cbssports.com': 'CBS Sports', + 'nfl.com': 'NFL', + 'nba.com': 'NBA', + }; + + return siteNames[domain] || domain; + } +} diff --git a/src/modules/pre-handler/handlers/pdf.handler.ts b/src/modules/pre-handler/handlers/pdf.handler.ts new file mode 100644 index 0000000..4e9c4ce --- /dev/null +++ b/src/modules/pre-handler/handlers/pdf.handler.ts @@ -0,0 +1,66 @@ +import { Injectable, Logger } from '@nestjs/common'; +import { IContentHandler } from '../interfaces/content-handler.interface'; +import { PreHandleResult } from '../dto/pre-handle-result.dto'; + +/** + * A content handler specifically for PDF files. + * This handler detects PDF URLs and marks them appropriately + * so that the main scraping service can handle them differently. + */ +@Injectable() +export class PdfHandler implements IContentHandler { + private readonly logger = new Logger(PdfHandler.name); + + /** + * Checks if the URL points to a PDF file. 
+ * @param url - The URL to check. + * @returns `true` if the URL appears to be a PDF file. + */ + public canHandle(url: URL): boolean { + // Check file extension + if (url.pathname.toLowerCase().endsWith('.pdf')) { + return true; + } + + // Check for common PDF hosting patterns + const pdfPatterns = [/\/pdf\//i, /\.pdf$/i, /\/download.*\.pdf/i, /\/files.*\.pdf/i, /\/documents.*\.pdf/i]; + + return pdfPatterns.some((pattern) => pattern.test(url.pathname)); + } + + /** + * Processes PDF URLs by marking them with the correct content type. + * @param url - The URL of the PDF to handle. + * @returns A `PreHandleResult` with PDF content type, or `null` on failure. + */ + public handle(url: URL): Promise { + try { + // For PDF files, we don't extract content here but mark the content type + // The main service will handle PDF extraction using appropriate tools + this.logger.debug(`Detected PDF file: ${url.href}`); + + // Try to extract title from URL path + let title: string | undefined; + const pathParts = url.pathname.split('/'); + const filename = pathParts[pathParts.length - 1]; + + if (filename && filename.includes('.pdf')) { + // Remove .pdf extension and clean up the filename for title + title = filename + .replace(/\.pdf$/i, '') + .replace(/[-_]/g, ' ') + .replace(/\b\w/g, (l) => l.toUpperCase()) + .trim(); + } + + return Promise.resolve({ + url: url.href, + title, + contentType: 'application/pdf', + }); + } catch (error) { + this.logger.warn(`PdfHandler failed for ${url.href}: ${(error as Error).message}`); + return Promise.resolve(null); + } + } +} diff --git a/src/modules/pre-handler/handlers/readability.handler.ts b/src/modules/pre-handler/handlers/readability.handler.ts new file mode 100644 index 0000000..a60f5c5 --- /dev/null +++ b/src/modules/pre-handler/handlers/readability.handler.ts @@ -0,0 +1,56 @@ +import { Injectable, Logger } from '@nestjs/common'; +import { JSDOM } from 'jsdom'; +import { Readability } from '@mozilla/readability'; +import { 
IContentHandler } from '../interfaces/content-handler.interface'; +import { PreHandleResult } from '../dto/pre-handle-result.dto'; + +/** + * A content handler that uses Mozilla's Readability library to extract + * the main readable content from a generic webpage. + */ +@Injectable() +export class ReadabilityHandler implements IContentHandler { + private readonly logger = new Logger(ReadabilityHandler.name); + private readonly USER_AGENT = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'; + + /** + * This handler can attempt to process any HTTP/HTTPS URL. + * It should typically be placed last in the handler chain as a fallback. + * @param url - The URL to check. + * @returns `true` if the protocol is http or https. + */ + public canHandle(url: URL): boolean { + return ['http:', 'https:'].includes(url.protocol); + } + + /** + * Fetches the webpage, parses it with JSDOM, and extracts the article content. + * @param url - The URL to handle. + * @returns A `PreHandleResult` with the extracted article, or `null` on failure. + */ + public async handle(url: URL): Promise { + try { + const dom = await JSDOM.fromURL(url.href, { + userAgent: this.USER_AGENT, + }); + + const reader = new Readability(dom.window.document); + const article = reader.parse(); + + if (!article?.content) { + this.logger.debug(`Readability could not find content for: ${url.href}`); + return null; + } + + return { + url: url.href, + title: article.title ?? 
undefined, + content: article.content, + contentType: 'text/html', + }; + } catch (error) { + this.logger.warn(`ReadabilityHandler failed for ${url.href}: ${(error as Error).message}`); + return null; + } + } +} diff --git a/src/modules/pre-handler/handlers/rss.handler.ts b/src/modules/pre-handler/handlers/rss.handler.ts new file mode 100644 index 0000000..d0e654d --- /dev/null +++ b/src/modules/pre-handler/handlers/rss.handler.ts @@ -0,0 +1,91 @@ +import { Injectable, Logger } from '@nestjs/common'; +import { IContentHandler } from '../interfaces/content-handler.interface'; +import { PreHandleResult } from '../dto/pre-handle-result.dto'; + +/** + * A content handler for RSS/Atom feeds. + * This handler detects feed URLs and marks them appropriately + * for specialized feed processing. + */ +@Injectable() +export class RssHandler implements IContentHandler { + private readonly logger = new Logger(RssHandler.name); + + /** + * Checks if the URL points to an RSS or Atom feed. + * @param url - The URL to check. + * @returns `true` if the URL appears to be a feed. + */ + public canHandle(url: URL): boolean { + // Check file extension + const feedExtensions = ['.rss', '.xml', '.atom']; + if (feedExtensions.some((ext) => url.pathname.toLowerCase().endsWith(ext))) { + return true; + } + + // Check for common feed URL patterns + const feedPatterns = [ + /\/feed\/?$/i, + /\/feeds?\//i, + /\/rss\/?$/i, + /\/atom\/?$/i, + /\/syndication\//i, + /\/index\.xml$/i, + /\/rss\.xml$/i, + /\/atom\.xml$/i, + /\/feed\.xml$/i, + ]; + + return feedPatterns.some((pattern) => pattern.test(url.pathname)); + } + + /** + * Processes feed URLs by marking them with the correct content type. + * @param url - The URL of the feed to handle. + * @returns A `PreHandleResult` with feed content type, or `null` on failure. 
+ */ + public handle(url: URL): Promise { + try { + this.logger.debug(`Detected RSS/Atom feed: ${url.href}`); + + // Try to extract title from URL or domain + let title: string | undefined; + + // Extract from path + const pathParts = url.pathname.split('/').filter((part) => part.length > 0); + if (pathParts.length > 0) { + const lastPart = pathParts[pathParts.length - 1]; + if ( + !['feed', 'rss', 'atom', 'index.xml', 'rss.xml', 'atom.xml', 'feed.xml'].includes( + lastPart.toLowerCase(), + ) + ) { + title = lastPart + .replace(/[-_]/g, ' ') + .replace(/\b\w/g, (l) => l.toUpperCase()) + .trim(); + } + } + + // Fallback to domain name + if (!title) { + title = `${url.hostname} Feed`; + } + + // Determine content type based on URL patterns + let contentType = 'application/rss+xml'; + if (url.pathname.toLowerCase().includes('atom')) { + contentType = 'application/atom+xml'; + } + + return Promise.resolve({ + url: url.href, + title, + contentType, + }); + } catch (error) { + this.logger.warn(`RssHandler failed for ${url.href}: ${(error as Error).message}`); + return Promise.resolve(null); + } + } +} diff --git a/src/modules/pre-handler/handlers/social-media.handler.ts b/src/modules/pre-handler/handlers/social-media.handler.ts new file mode 100644 index 0000000..0c5f8e8 --- /dev/null +++ b/src/modules/pre-handler/handlers/social-media.handler.ts @@ -0,0 +1,225 @@ +import { Injectable, Logger } from '@nestjs/common'; +import { IContentHandler } from '../interfaces/content-handler.interface'; +import { PreHandleResult } from '../dto/pre-handle-result.dto'; + +/** + * Social media platform transformations. + * Each platform has specific URL patterns and optimal access methods. 
+ */ +const SOCIAL_MEDIA_TRANSFORMATIONS: Record URL> = { + 'instagram.com': (url) => { + // Instagram: Use bibliogram or other privacy-friendly proxies + // For posts, stories, and profiles + if (url.pathname.includes('/p/') || url.pathname.includes('/reel/') || url.pathname.includes('/tv/')) { + // For specific posts, use a proxy service + return new URL(`https://bibliogram.art${url.pathname}`); + } + // For profiles, keep original but mark for special handling + return url; + }, + 'tiktok.com': (url) => { + // TikTok: Use mobile web version for better content extraction + const newUrl = new URL(url.href); + newUrl.hostname = 'm.tiktok.com'; + return newUrl; + }, + 'facebook.com': (url) => { + // Facebook: Use mobile version for simpler layout + const newUrl = new URL(url.href); + newUrl.hostname = 'm.facebook.com'; + return newUrl; + }, + 'fb.com': (url) => { + // Facebook short URLs: Convert to mobile version + const newUrl = new URL(url.href); + newUrl.hostname = 'm.facebook.com'; + return newUrl; + }, + 'twitter.com': (url) => { + // Twitter: Use Nitter (privacy-friendly Twitter frontend) + const newUrl = new URL(url.href); + newUrl.hostname = 'nitter.net'; + return newUrl; + }, + 'x.com': (url) => { + // X (formerly Twitter): Use Nitter + const newUrl = new URL(url.href); + newUrl.hostname = 'nitter.net'; + return newUrl; + }, + 'linkedin.com': (url) => { + // LinkedIn: Keep original but add parameters to avoid login prompts + const newUrl = new URL(url.href); + // Remove tracking parameters and add mobile indicator + newUrl.searchParams.delete('trk'); + newUrl.searchParams.delete('trkInfo'); + newUrl.searchParams.set('lipi', 'urn:li:page:d_flagship3_profile_view_base'); + return newUrl; + }, + 'pinterest.com': (url) => { + // Pinterest: Use mobile version + const newUrl = new URL(url.href); + newUrl.hostname = 'm.pinterest.com'; + return newUrl; + }, + 'pinterest.co.uk': (url) => { + // Pinterest UK: Use mobile version + const newUrl = new URL(url.href); + 
newUrl.hostname = 'm.pinterest.co.uk'; + return newUrl; + }, + 'snapchat.com': (url) => { + // Snapchat: For stories and profiles, keep original + // Most Snapchat content requires the app, but some web content exists + return url; + }, + 'discord.com': (url) => { + // Discord: For invite links and server info + if (url.pathname.includes('/invite/')) { + // Keep invite links as-is + return url; + } + // For other Discord links, they typically require the app + return url; + }, + 'telegram.org': (url) => { + // Telegram: Convert to web version when possible + if (url.pathname.includes('/s/')) { + // Channel/chat links - use web version + const newUrl = new URL(url.href); + newUrl.hostname = 't.me'; + return newUrl; + } + return url; + }, + 't.me': (url) => { + // Telegram short links: Keep as-is, they're already optimized + return url; + }, + 'mastodon.social': (url) => { + // Mastodon: Keep original, it's already web-friendly + return url; + }, + 'mastodon.world': (url) => { + // Mastodon instance: Keep original + return url; + }, + 'threads.net': (url) => { + // Meta Threads: Keep original but note it might require login + return url; + }, + 'bluesky.app': (url) => { + // Bluesky: Keep original, it's web-friendly + return url; + }, + 'vk.com': (url) => { + // VKontakte: Use mobile version + const newUrl = new URL(url.href); + newUrl.hostname = 'm.vk.com'; + return newUrl; + }, + 'weibo.com': (url) => { + // Weibo: Use mobile version + const newUrl = new URL(url.href); + newUrl.hostname = 'm.weibo.com'; + return newUrl; + }, +}; + +/** + * A content handler specifically for social media platforms. + * This handler detects social media URLs and transforms them to more + * scraping-friendly versions when possible. + */ +@Injectable() +export class SocialMediaHandler implements IContentHandler { + private readonly logger = new Logger(SocialMediaHandler.name); + + /** + * Checks if the URL is from a supported social media platform. + * @param url - The URL to check. 
+ * @returns `true` if the URL is from a supported social media platform. + */ + public canHandle(url: URL): boolean { + return Object.keys(SOCIAL_MEDIA_TRANSFORMATIONS).some((domain) => url.hostname.endsWith(domain)); + } + + /** + * Processes social media URLs by transforming them to more accessible versions. + * @param url - The social media URL to handle. + * @returns A `PreHandleResult` with the transformed URL, or `null` on failure. + */ + public handle(url: URL): Promise { + const domain = Object.keys(SOCIAL_MEDIA_TRANSFORMATIONS).find((d) => url.hostname.endsWith(d)); + + if (!domain) { + return Promise.resolve(null); + } + + try { + const transform = SOCIAL_MEDIA_TRANSFORMATIONS[domain]; + const newUrl = transform(url); + + this.logger.debug(`Transformed social media URL [${domain}]: ${url.href} -> ${newUrl.href}`); + + // Extract potential title from URL + let title: string | undefined; + const platform = this.getPlatformName(domain); + + if (url.pathname.includes('/p/') || url.pathname.includes('/post/')) { + title = `${platform} Post`; + } else if (url.pathname.includes('/reel/') || url.pathname.includes('/video/')) { + title = `${platform} Video`; + } else if (url.pathname.includes('/story/') || url.pathname.includes('/stories/')) { + title = `${platform} Story`; + } else if (url.pathname.length > 1) { + // Try to extract username or page name + const pathParts = url.pathname.split('/').filter((part) => part.length > 0); + if (pathParts.length > 0) { + const identifier = pathParts[0]; + title = `${platform} - ${identifier}`; + } + } + + return Promise.resolve({ + url: newUrl.href, + title, + contentType: 'text/html', + }); + } catch (error) { + this.logger.warn(`SocialMediaHandler failed for ${url.href}: ${(error as Error).message}`); + return Promise.resolve(null); + } + } + + /** + * Gets a human-readable platform name from domain. + * @param domain - The domain name. + * @returns The platform name. 
+ */ + private getPlatformName(domain: string): string { + const platformNames: Record = { + 'instagram.com': 'Instagram', + 'tiktok.com': 'TikTok', + 'facebook.com': 'Facebook', + 'fb.com': 'Facebook', + 'twitter.com': 'Twitter', + 'x.com': 'X', + 'linkedin.com': 'LinkedIn', + 'pinterest.com': 'Pinterest', + 'pinterest.co.uk': 'Pinterest', + 'snapchat.com': 'Snapchat', + 'discord.com': 'Discord', + 'telegram.org': 'Telegram', + 't.me': 'Telegram', + 'mastodon.social': 'Mastodon', + 'mastodon.world': 'Mastodon', + 'threads.net': 'Threads', + 'bluesky.app': 'Bluesky', + 'vk.com': 'VK', + 'weibo.com': 'Weibo', + }; + + return platformNames[domain] || domain; + } +} diff --git a/src/modules/pre-handler/handlers/youtube.handler.ts b/src/modules/pre-handler/handlers/youtube.handler.ts new file mode 100644 index 0000000..67146c2 --- /dev/null +++ b/src/modules/pre-handler/handlers/youtube.handler.ts @@ -0,0 +1,136 @@ +import { Injectable, Logger } from '@nestjs/common'; +import { IContentHandler } from '../interfaces/content-handler.interface'; +import { PreHandleResult } from '../dto/pre-handle-result.dto'; + +/** + * A content handler specifically for YouTube videos. + * This handler detects YouTube URLs and attempts to extract + * video metadata and transcripts when available. + */ +@Injectable() +export class YoutubeHandler implements IContentHandler { + private readonly logger = new Logger(YoutubeHandler.name); + + /** + * Checks if the URL is a YouTube video. + * @param url - The URL to check. + * @returns `true` if the URL is a YouTube video. 
+ */ + public canHandle(url: URL): boolean { + const youtubeHosts = ['youtube.com', 'www.youtube.com', 'youtu.be', 'm.youtube.com']; + + if (!youtubeHosts.includes(url.hostname)) { + return false; + } + + // Check for video patterns + if (url.hostname === 'youtu.be') { + return url.pathname.length > 1; // Has video ID + } + + // For youtube.com domains + return url.pathname.includes('/watch') || url.pathname.includes('/embed/') || url.pathname.includes('/v/'); + } + + /** + * Processes YouTube URLs to extract video information. + * @param url - The YouTube URL to handle. + * @returns A `PreHandleResult` with video information, or `null` on failure. + */ + public handle(url: URL): Promise { + try { + const videoId = this.extractVideoId(url); + if (!videoId) { + this.logger.warn(`Could not extract video ID from: ${url.href}`); + return Promise.resolve(null); + } + + this.logger.debug(`Processing YouTube video: ${videoId}`); + + // For now, we'll return basic information + // In a full implementation, you might want to: + // 1. Fetch video metadata from YouTube API + // 2. Extract auto-generated captions/transcripts + // 3. Convert video description to readable format + + const title = `YouTube Video: ${videoId}`; + const content = this.generateVideoContent(videoId, url); + + return Promise.resolve({ + url: url.href, + title, + content, + contentType: 'text/html', + }); + } catch (error) { + this.logger.warn(`YoutubeHandler failed for ${url.href}: ${(error as Error).message}`); + return Promise.resolve(null); + } + } + + /** + * Extracts the video ID from various YouTube URL formats. + * @param url - The YouTube URL. + * @returns The video ID or null if not found. 
+ */ + private extractVideoId(url: URL): string | null { + // For youtu.be format + if (url.hostname === 'youtu.be') { + return url.pathname.slice(1); + } + + // For youtube.com formats + if (url.searchParams.has('v')) { + return url.searchParams.get('v'); + } + + // For embed URLs + const embedMatch = url.pathname.match(/\/embed\/([^/?]+)/); + if (embedMatch) { + return embedMatch[1]; + } + + // For /v/ URLs + const vMatch = url.pathname.match(/\/v\/([^/?]+)/); + if (vMatch) { + return vMatch[1]; + } + + return null; + } + + /** + * Generates readable content for a YouTube video. + * @param videoId - The YouTube video ID. + * @param originalUrl - The original URL. + * @returns HTML content representing the video. + */ + private generateVideoContent(videoId: string, originalUrl: URL): string { + const watchUrl = `https://www.youtube.com/watch?v=${videoId}`; + const embedUrl = `https://www.youtube.com/embed/${videoId}`; + + // Extract timestamp if present + const timestamp = originalUrl.searchParams.get('t'); + const timestampText = timestamp ? ` (starting at ${timestamp})` : ''; + + return ` +
+

YouTube Video

+

Video ID: ${videoId}

+

Watch URL: ${watchUrl}${timestampText}

+

Embed URL: ${embedUrl}

+ +
+

Note: This is a YouTube video. To get the full content including transcripts, + additional processing would be required using YouTube's API or transcript extraction tools.

+
+ + +
+ `.trim(); + } +} diff --git a/src/modules/pre-handler/interfaces/content-handler.interface.ts b/src/modules/pre-handler/interfaces/content-handler.interface.ts new file mode 100644 index 0000000..ea57a6b --- /dev/null +++ b/src/modules/pre-handler/interfaces/content-handler.interface.ts @@ -0,0 +1,27 @@ +import { PreHandleResult } from '../dto/pre-handle-result.dto'; + +/** + * Injection token for providing an array of IContentHandler implementations. + * This allows for flexible and extensible handler registration. + */ +export const CONTENT_HANDLER_TOKEN = 'CONTENT_HANDLER_TOKEN'; + +/** + * Defines the contract for content handlers. + * Each handler is responsible for processing a specific type of content (e.g., a PDF file, a specific domain). + */ +export interface IContentHandler { + /** + * Determines if the handler can process the content from the given URL. + * @param url - The URL to be checked. + * @returns `true` if the handler can process the URL, `false` otherwise. + */ + canHandle(url: URL): boolean; + + /** + * Processes the content from the URL and extracts relevant data. + * @param url - The URL of the content to handle. + * @returns A promise that resolves to a `PreHandleResult` object, or `null` if handling fails. 
+ */ + handle(url: URL): Promise; +} diff --git a/src/modules/pre-handler/pre-handler.module.ts b/src/modules/pre-handler/pre-handler.module.ts new file mode 100644 index 0000000..59ecfcd --- /dev/null +++ b/src/modules/pre-handler/pre-handler.module.ts @@ -0,0 +1,59 @@ +import { Module } from '@nestjs/common'; +import { PreHandlerService } from './pre-handler.service'; +import { ReadabilityHandler } from './handlers/readability.handler'; +import { CONTENT_HANDLER_TOKEN, IContentHandler } from './interfaces/content-handler.interface'; +import { DomainSpecificHandler } from './handlers/domain-specific.handler'; +import { PdfHandler } from './handlers/pdf.handler'; +import { RssHandler } from './handlers/rss.handler'; +import { YoutubeHandler } from './handlers/youtube.handler'; +import { SocialMediaHandler } from './handlers/social-media.handler'; +import { NewsSiteHandler } from './handlers/news-site.handler'; + +// --- Register all handlers here --- +// The order is important: more specific handlers should come first. +// 1. File type handlers (PDF, RSS) - most specific +// 2. Platform-specific handlers (YouTube) - very specific +// 3. Social media handlers - moderately specific +// 4. News site handlers - moderately specific +// 5. Domain transformation handlers - general transformations +// 6. General readability handler - fallback for everything else +const handlers = [ + PdfHandler, + RssHandler, + YoutubeHandler, + SocialMediaHandler, + NewsSiteHandler, + DomainSpecificHandler, + ReadabilityHandler, +]; + +/** + * Encapsulates all content pre-handling logic. + * It provides the PreHandlerService and registers all available content handlers. + * This module is designed to be extensible; new handlers can be added easily. + * + * Handler execution order: + * 1. PdfHandler - Detects and marks PDF files + * 2. RssHandler - Detects and marks RSS/Atom feeds + * 3. YoutubeHandler - Extracts YouTube video information + * 4. 
SocialMediaHandler - Transforms social media URLs + * 5. NewsSiteHandler - Transforms news site URLs + * 6. DomainSpecificHandler - Transforms URLs for other specific domains + * 7. ReadabilityHandler - Fallback for general web content + */ +@Module({ + providers: [ + PreHandlerService, + ...handlers, + { + provide: CONTENT_HANDLER_TOKEN, + // The useFactory provider collects all registered handlers and makes them + // available for injection as an array. To add a new handler, + // simply add it to the `handlers` array above and the `inject` array below. + useFactory: (...injectedHandlers: IContentHandler[]): IContentHandler[] => injectedHandlers, + inject: handlers, + }, + ], + exports: [PreHandlerService], +}) +export class PreHandlerModule {} diff --git a/src/modules/pre-handler/pre-handler.service.ts b/src/modules/pre-handler/pre-handler.service.ts new file mode 100644 index 0000000..27824d3 --- /dev/null +++ b/src/modules/pre-handler/pre-handler.service.ts @@ -0,0 +1,59 @@ +import { Inject, Injectable, Logger } from '@nestjs/common'; +import { CONTENT_HANDLER_TOKEN, IContentHandler } from './interfaces/content-handler.interface'; +import { PreHandleResult } from './dto/pre-handle-result.dto'; + +/** + * Orchestrates the content pre-handling process by iterating through a chain of registered handlers. + */ +@Injectable() +export class PreHandlerService { + private readonly logger = new Logger(PreHandlerService.name); + + /** + * Injects all services that are provided with the `CONTENT_HANDLER_TOKEN`. + * @param handlers - An array of `IContentHandler` implementations. + */ + constructor( + @Inject(CONTENT_HANDLER_TOKEN) + private readonly handlers: IContentHandler[], + ) {} + + /** + * Executes the handler chain for a given URL. + * It tries handlers one by one until one successfully returns a result with content. + * @param urlString - The URL to process. + * @returns A `PreHandleResult`. If all handlers fail, it returns the original URL. 
+ */ + public async execute(urlString: string): Promise { + let currentUrl = new URL(urlString); + const finalResult: PreHandleResult = { url: urlString }; + + for (const handler of this.handlers) { + if (handler.canHandle(currentUrl)) { + this.logger.debug(`Attempting to use handler: ${handler.constructor.name}`); + const result = await handler.handle(currentUrl); + + if (result) { + // URL이 핸들러에 의해 변경되었는지 확인하고 업데이트합니다. + if (result.url && result.url !== currentUrl.href) { + this.logger.debug(`URL transformed by ${handler.constructor.name} to: ${result.url}`); + currentUrl = new URL(result.url); + finalResult.url = result.url; + } + + // 콘텐츠가 성공적으로 추출되면 즉시 반환합니다. + if (result.content) { + this.logger.log(`Successfully handled by ${handler.constructor.name}`); + finalResult.title = result.title; + finalResult.content = result.content; + finalResult.contentType = result.contentType; + return finalResult; + } + } + } + } + + this.logger.debug('No suitable handler found. Returning final result.'); + return finalResult; // 콘텐츠가 없더라도, 변환된 URL이 포함될 수 있는 최종 결과를 반환합니다. + } +} From ba24b210edac48a7e72fa617825d3298abe83dc6 Mon Sep 17 00:00:00 2001 From: reach0908 Date: Fri, 4 Jul 2025 20:47:55 +0900 Subject: [PATCH 05/28] =?UTF-8?q?feat(scraper):=20=EC=8A=A4=ED=81=AC?= =?UTF-8?q?=EB=9E=98=ED=95=91=20=EA=B8=B0=EB=8A=A5=EC=9D=84=20=EC=9C=84?= =?UTF-8?q?=ED=95=9C=20=EC=BB=A8=ED=8A=B8=EB=A1=A4=EB=9F=AC=20=EB=B0=8F=20?= =?UTF-8?q?=EC=84=9C=EB=B9=84=EC=8A=A4=20=EC=B6=94=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * ScraperController를 추가하여 웹 콘텐츠 스크래핑 및 저장 기능을 구현했습니다. * FetchContentInput 및 ScrapedContentOutput DTO를 정의하여 스크래핑 요청 및 응답 구조를 명확히 했습니다. * PuppeteerParseService를 통해 Puppeteer를 사용한 콘텐츠 가져오기 및 처리 로직을 추가했습니다. * InvalidUrlException을 정의하여 URL 유효성 검사 실패 시 적절한 예외 처리를 구현했습니다. * 스크래핑된 콘텐츠를 HTML로 렌더링하여 미리보기 기능을 추가했습니다. 
--- .../scraper/dto/fetch-content.input.ts | 19 + .../scraper/dto/scraped-content.output.ts | 16 + .../exceptions/invalid-url.exception.ts | 10 + src/modules/scraper/scraper.controller.ts | 743 ++++++++++++++++++ src/modules/scraper/scraper.module.ts | 12 +- .../services/puppeteer-parse.service.ts | 283 +++++++ .../scraper/services/puppeteer-parse.ts | 0 7 files changed, 1080 insertions(+), 3 deletions(-) create mode 100644 src/modules/scraper/dto/fetch-content.input.ts create mode 100644 src/modules/scraper/dto/scraped-content.output.ts create mode 100644 src/modules/scraper/exceptions/invalid-url.exception.ts create mode 100644 src/modules/scraper/scraper.controller.ts create mode 100644 src/modules/scraper/services/puppeteer-parse.service.ts delete mode 100644 src/modules/scraper/services/puppeteer-parse.ts diff --git a/src/modules/scraper/dto/fetch-content.input.ts b/src/modules/scraper/dto/fetch-content.input.ts new file mode 100644 index 0000000..741047d --- /dev/null +++ b/src/modules/scraper/dto/fetch-content.input.ts @@ -0,0 +1,19 @@ +import { IsNotEmpty, IsOptional, IsString, IsUrl } from 'class-validator'; + +/** + * DTO used by ScraperController (or other callers) to request content scraping. + */ +export class FetchContentInput { + @IsString() + @IsNotEmpty() + @IsUrl({ require_protocol: true }) + url!: string; + + @IsOptional() + @IsString() + locale?: string; + + @IsOptional() + @IsString() + timezone?: string; +} diff --git a/src/modules/scraper/dto/scraped-content.output.ts b/src/modules/scraper/dto/scraped-content.output.ts new file mode 100644 index 0000000..96bda88 --- /dev/null +++ b/src/modules/scraper/dto/scraped-content.output.ts @@ -0,0 +1,16 @@ +/** + * The result of a scrape operation. 
+ */ +export interface ScrapedContentOutput { + /** 마지막으로 확인된 URL (리디렉션 반영) */ + finalUrl: string; + + /** 페이지 또는 PDF 파일 이름 */ + title?: string; + + /** HTML 본문(outerHTML) 또는 기타 텍스트 콘텐츠 */ + content?: string; + + /** MIME 타입 (예: text/html, application/pdf) */ + contentType?: string; +} diff --git a/src/modules/scraper/exceptions/invalid-url.exception.ts b/src/modules/scraper/exceptions/invalid-url.exception.ts new file mode 100644 index 0000000..c86f558 --- /dev/null +++ b/src/modules/scraper/exceptions/invalid-url.exception.ts @@ -0,0 +1,10 @@ +import { BadRequestException } from '@nestjs/common'; + +/** + * Thrown when the provided URL fails validation checks. + */ +export class InvalidUrlException extends BadRequestException { + constructor(reason: string) { + super(`Invalid URL: ${reason}`); + } +} diff --git a/src/modules/scraper/scraper.controller.ts b/src/modules/scraper/scraper.controller.ts new file mode 100644 index 0000000..89b006d --- /dev/null +++ b/src/modules/scraper/scraper.controller.ts @@ -0,0 +1,743 @@ +import { Controller, Post, Body, Get, Query, BadRequestException, Res, Header } from '@nestjs/common'; +import { ApiTags, ApiOperation, ApiResponse, ApiBody, ApiQuery } from '@nestjs/swagger'; +import { Response } from 'express'; +import { PuppeteerParseService, FetchContentWithSaveInput } from './services/puppeteer-parse.service'; +import { FetchContentInput } from './dto/fetch-content.input'; +import { ScrapedContentOutput } from './dto/scraped-content.output'; +import { PreHandlerService } from '../pre-handler/pre-handler.service'; +import { PreHandleResult } from '../pre-handler/dto/pre-handle-result.dto'; + +@ApiTags('scraper') +@Controller('scraper') +export class ScraperController { + constructor( + private readonly puppeteerParseService: PuppeteerParseService, + private readonly preHandlerService: PreHandlerService, + ) {} + + /** + * 전체 스크래핑 프로세스 (사전 처리 + Puppeteer) + */ + @Post('fetch-content') + @ApiOperation({ + summary: '웹 콘텐츠 스크래핑', 
+ description: '사전 처리 핸들러들을 거쳐 최종적으로 웹 콘텐츠를 스크래핑합니다.', + }) + @ApiBody({ type: FetchContentInput }) + @ApiResponse({ + status: 200, + description: '스크래핑 성공', + schema: { + type: 'object', + properties: { + finalUrl: { type: 'string' }, + title: { type: 'string' }, + content: { type: 'string' }, + contentType: { type: 'string' }, + }, + }, + }) + @ApiResponse({ + status: 400, + description: '잘못된 URL 또는 요청', + }) + @ApiResponse({ + status: 500, + description: '서버 오류', + }) + async fetchContent(@Body() input: FetchContentInput): Promise<ScrapedContentOutput> { + return this.puppeteerParseService.fetchContent(input); + } + + /** + * 웹 콘텐츠 스크래핑 및 저장 (인증 필요) + */ + @Post('save-content') + @ApiOperation({ + summary: '웹 콘텐츠 스크래핑 및 저장', + description: '웹 콘텐츠를 스크래핑하고 사용자 계정에 저장합니다. 인증이 필요합니다.', + }) + @ApiBody({ + schema: { + type: 'object', + properties: { + url: { type: 'string', description: '스크래핑할 URL' }, + locale: { type: 'string', description: '언어 설정 (선택사항)' }, + timezone: { type: 'string', description: '시간대 설정 (선택사항)' }, + tags: { type: 'array', items: { type: 'string' }, description: '태그 목록 (선택사항)' }, + isBookmarked: { type: 'boolean', description: '북마크 여부 (선택사항)' }, + isArchived: { type: 'boolean', description: '아카이브 여부 (선택사항)' }, + }, + required: ['url'], + }, + }) + @ApiResponse({ + status: 200, + description: '스크래핑 및 저장 성공', + schema: { + type: 'object', + properties: { + finalUrl: { type: 'string' }, + title: { type: 'string' }, + content: { type: 'string' }, + contentType: { type: 'string' }, + saved: { type: 'boolean', description: '데이터베이스 저장 여부' }, + }, + }, + }) + @ApiResponse({ + status: 400, + description: '잘못된 URL 또는 요청', + }) + @ApiResponse({ + status: 401, + description: '인증 실패', + }) + @ApiResponse({ + status: 500, + description: '서버 오류', + }) + async saveContent(@Body() input: Record<string, any>): Promise<ScrapedContentOutput & { saved: boolean }> { + // URL 필수 필드 검증 + if (!input.url) { + throw new BadRequestException('URL is required'); + } + + // 
간단한 구현: 인증 없이도 동작하도록 함 + const fetchInput: FetchContentWithSaveInput = { + url: input.url as string, + locale: input.locale as string | undefined, + timezone: input.timezone as string | undefined, + tags: input.tags as string[] | undefined, + isBookmarked: input.isBookmarked as boolean | undefined, + isArchived: input.isArchived as boolean | undefined, + saveToDatabase: false, // 인증 구현 전까지는 저장 비활성화 + }; + + const result = await this.puppeteerParseService.fetchContentWithSave(fetchInput); + return { ...result, saved: false }; + } + + /** + * 사전 처리만 테스트 (핸들러 체인 테스트용) + */ + @Get('pre-handle') + @ApiOperation({ + summary: '사전 처리 핸들러 테스트', + description: + 'URL이 어떤 핸들러에 의해 어떻게 변환되는지 테스트합니다. Puppeteer를 사용하지 않고 사전 처리만 수행합니다.', + }) + @ApiQuery({ + name: 'url', + description: '테스트할 URL', + example: 'https://www.youtube.com/watch?v=dQw4w9WgXcQ', + }) + @ApiResponse({ + status: 200, + description: '사전 처리 성공', + schema: { + type: 'object', + properties: { + originalUrl: { type: 'string', description: '원본 URL' }, + finalUrl: { type: 'string', description: '변환된 URL' }, + title: { type: 'string', description: '추출된 제목' }, + contentType: { type: 'string', description: '콘텐츠 타입' }, + contentLength: { type: 'number', description: '콘텐츠 길이' }, + handlerUsed: { type: 'string', description: '사용된 핸들러 정보' }, + }, + }, + }) + async preHandle(@Query('url') url: string) { + if (!url) { + throw new BadRequestException('URL is required'); + } + + const result = await this.preHandlerService.execute(url); + + return { + originalUrl: url, + finalUrl: result.url, + title: result.title, + contentType: result.contentType, + contentLength: result.content?.length || 0, + handlerUsed: this.determineHandlerUsed(url, result), + urlChanged: result.url !== url, + hasContent: !!result.content, + hasTitle: !!result.title, + }; + } + + /** + * 핸들러 테스트용 샘플 URL 목록 제공 + */ + @Get('sample-urls') + @ApiOperation({ + summary: '테스트용 샘플 URL 목록', + description: '각 핸들러를 테스트할 수 있는 샘플 URL들을 제공합니다.', + }) + @ApiResponse({ 
+ status: 200, + description: '샘플 URL 목록', + schema: { + type: 'object', + properties: { + categories: { + type: 'object', + additionalProperties: { + type: 'array', + items: { type: 'string' }, + }, + }, + }, + }, + }) + getSampleUrls() { + return { + categories: { + 'PDF 파일': ['https://arxiv.org/pdf/2301.00001.pdf', 'https://example.com/document.pdf'], + 'RSS 피드': ['https://feeds.feedburner.com/TechCrunch', 'https://rss.cnn.com/rss/edition.rss'], + YouTube: ['https://www.youtube.com/watch?v=dQw4w9WgXcQ', 'https://youtu.be/dQw4w9WgXcQ'], + '소셜 미디어': [ + 'https://twitter.com/elonmusk/status/1234567890', + 'https://x.com/elonmusk/status/1234567890', + 'https://www.instagram.com/p/ABC123/', + 'https://www.tiktok.com/@user/video/1234567890', + 'https://www.facebook.com/post/123456789', + 'https://www.linkedin.com/posts/user_activity-123456789', + ], + '뉴스 사이트': [ + 'https://www.nytimes.com/2024/01/01/technology/ai-breakthrough.html', + 'https://www.washingtonpost.com/technology/2024/01/01/tech-news/', + 'https://www.cnn.com/2024/01/01/tech/ai-news/index.html', + 'https://www.bbc.com/news/technology-12345678', + 'https://www.reuters.com/technology/ai-breakthrough-2024-01-01/', + ], + '도메인 특화': [ + 'https://medium.com/@author/article-title-123', + 'https://substack.com/p/article-title', + 'https://github.com/user/repo/blob/main/README.md', + 'https://en.wikipedia.org/wiki/Artificial_Intelligence', + 'https://stackoverflow.com/questions/123456/how-to-code', + ], + '일반 웹사이트': ['https://example.com/article', 'https://blog.example.com/post/123'], + }, + }; + } + + /** + * 스크래핑된 콘텐츠를 HTML로 렌더링해서 브라우저에서 볼 수 있게 함 + */ + @Get('preview') + @ApiOperation({ + summary: '스크래핑 콘텐츠 미리보기', + description: '스크래핑된 콘텐츠를 HTML 형태로 렌더링하여 브라우저에서 직접 볼 수 있습니다.', + }) + @ApiQuery({ + name: 'url', + description: '스크래핑할 URL', + example: 'https://example.com/article', + }) + @ApiQuery({ + name: 'mode', + description: '미리보기 모드 (content: 콘텐츠만, full: 전체 결과)', + enum: ['content', 'full'], + required: 
false, + example: 'content', + }) + @Header('Content-Type', 'text/html; charset=utf-8') + async previewContent(@Query('url') url: string, @Query('mode') mode: string = 'content', @Res() res: Response) { + if (!url) { + throw new BadRequestException('URL is required'); + } + + try { + const result = await this.puppeteerParseService.fetchContent({ url }); + + if (mode === 'content' && result.content) { + // 콘텐츠만 보여주기 + const html = this.wrapContentInHtml(result.content, result.title, result.finalUrl); + res.send(html); + } else { + // 전체 결과를 JSON 형태로 보여주기 + const html = this.createResultViewHtml(result, url); + res.send(html); + } + } catch (error) { + const errorHtml = this.createErrorHtml(url, (error as Error).message); + res.status(500).send(errorHtml); + } + } + + /** + * 사전 처리 결과를 HTML로 미리보기 + */ + @Get('preview-prehandle') + @ApiOperation({ + summary: '사전 처리 결과 미리보기', + description: '사전 처리 핸들러들의 결과를 HTML 형태로 보여줍니다.', + }) + @ApiQuery({ + name: 'url', + description: '테스트할 URL', + example: 'https://www.youtube.com/watch?v=dQw4w9WgXcQ', + }) + @Header('Content-Type', 'text/html; charset=utf-8') + async previewPreHandle(@Query('url') url: string, @Res() res: Response) { + if (!url) { + throw new BadRequestException('URL is required'); + } + + try { + const result = await this.preHandlerService.execute(url); + const handlerUsed = this.determineHandlerUsed(url, result); + + const html = this.createPreHandleViewHtml(url, result, handlerUsed); + res.send(html); + } catch (error) { + const errorHtml = this.createErrorHtml(url, (error as Error).message); + res.status(500).send(errorHtml); + } + } + + /** + * 어떤 핸들러가 사용되었는지 추정하는 헬퍼 메서드 + */ + private determineHandlerUsed(originalUrl: string, result: PreHandleResult): string { + const url = new URL(originalUrl); + + // PDF 핸들러 + if (url.pathname.toLowerCase().endsWith('.pdf')) { + return 'PdfHandler'; + } + + // RSS 핸들러 + if (url.pathname.includes('rss') || url.pathname.includes('feed') || url.pathname.includes('atom')) { 
+ return 'RssHandler'; + } + + // YouTube 핸들러 + if (url.hostname.includes('youtube.com') || url.hostname.includes('youtu.be')) { + return 'YoutubeHandler'; + } + + // 소셜 미디어 핸들러 + const socialMediaDomains = [ + 'twitter.com', + 'x.com', + 'instagram.com', + 'tiktok.com', + 'facebook.com', + 'linkedin.com', + ]; + if (socialMediaDomains.some((domain) => url.hostname.includes(domain))) { + return 'SocialMediaHandler'; + } + + // 뉴스 사이트 핸들러 + const newsDomains = ['nytimes.com', 'washingtonpost.com', 'cnn.com', 'bbc.com', 'reuters.com']; + if (newsDomains.some((domain) => url.hostname.includes(domain))) { + return 'NewsSiteHandler'; + } + + // 도메인 특화 핸들러 + const domainSpecificDomains = [ + 'medium.com', + 'substack.com', + 'github.com', + 'wikipedia.org', + 'stackoverflow.com', + ]; + if (domainSpecificDomains.some((domain) => url.hostname.includes(domain))) { + return 'DomainSpecificHandler'; + } + + // 기본값 + return result.content ? 'ReadabilityHandler' : 'No handler matched'; + } + + /** + * 콘텐츠를 HTML로 래핑하는 헬퍼 메서드 + */ + private wrapContentInHtml(content: string, title?: string, sourceUrl?: string): string { + return ` +<!DOCTYPE html> +<html lang="ko"> +<head> + <meta charset="UTF-8"> + <meta name="viewport" content="width=device-width, initial-scale=1.0"> + <title>${title || 'Scraped Content'} + + + +
+
+

${title || 'Scraped Content'}

+ ${sourceUrl ? `

출처: ${sourceUrl}

` : ''} +
+
+ ${content} +
+
+ +`; + } + + /** + * 전체 스크래핑 결과를 보여주는 HTML 생성 + */ + private createResultViewHtml(result: ScrapedContentOutput, originalUrl: string): string { + return ` + + + + + + 스크래핑 결과 - ${result.title || 'Unknown'} + + + +
+

🔍 스크래핑 결과

+ + + +

📄 콘텐츠 미리보기

+
+ ${result.content ? result.content : '

콘텐츠가 없습니다.

'} +
+ + +
+ +`; + } + + /** + * 사전 처리 결과를 HTML로 미리보기 + */ + private createPreHandleViewHtml(originalUrl: string, result: PreHandleResult, handlerUsed: string): string { + return ` + + + + + + 사전 처리 결과 - ${handlerUsed} + + + +
+

🔧 사전 처리 결과

+ +
${handlerUsed}
+ +
+
+
제목
+
${result.title || '없음'}
+
+
+
콘텐츠 타입
+
${result.contentType || '없음'}
+
+
+
콘텐츠 길이
+
${result.content?.length || 0} 문자
+
+
+
URL 변경 여부
+
${originalUrl !== result.url ? '✅ 변경됨' : '❌ 변경되지 않음'}
+
+
+ +
+

📍 URL 비교

+
+ 원본: ${originalUrl} +
+
+ 결과: ${result.url} +
+
+ + ${ + result.content + ? ` +

📄 콘텐츠 미리보기

+
+ ${result.content} +
+ ` + : '' + } + + +
+ +`; + } + + /** + * 오류 HTML 생성 + */ + private createErrorHtml(url: string, errorMessage: string): string { + return ` + + + + + + 오류 발생 + + + +
+

❌ 오류 발생

+

URL: ${url}

+
+ 오류 메시지:
+ ${errorMessage} +
+ +
+ +`; + } +} diff --git a/src/modules/scraper/scraper.module.ts b/src/modules/scraper/scraper.module.ts index 54cb42d..55d70c1 100644 --- a/src/modules/scraper/scraper.module.ts +++ b/src/modules/scraper/scraper.module.ts @@ -1,8 +1,14 @@ import { Module } from '@nestjs/common'; +import { BrowserService } from './services/browser.service'; +import { PuppeteerParseService } from './services/puppeteer-parse.service'; +import { PreHandlerModule } from '../pre-handler/pre-handler.module'; +import { ArticleModule } from '../article/article.module'; +import { ScraperController } from './scraper.controller'; @Module({ - imports: [], - controllers: [], - providers: [], + imports: [PreHandlerModule, ArticleModule], + controllers: [ScraperController], + providers: [BrowserService, PuppeteerParseService], + exports: [PuppeteerParseService], }) export class ScraperModule {} diff --git a/src/modules/scraper/services/puppeteer-parse.service.ts b/src/modules/scraper/services/puppeteer-parse.service.ts new file mode 100644 index 0000000..fdaa0cc --- /dev/null +++ b/src/modules/scraper/services/puppeteer-parse.service.ts @@ -0,0 +1,283 @@ +import { Injectable, Logger, Optional } from '@nestjs/common'; +import { BrowserContext, Page, Protocol } from 'puppeteer-core'; +import { BrowserService } from './browser.service'; +import { FetchContentInput } from '../dto/fetch-content.input'; +import { ScrapedContentOutput } from '../dto/scraped-content.output'; +import { InvalidUrlException } from '../exceptions/invalid-url.exception'; +import { PreHandlerService } from '../../pre-handler/pre-handler.service'; +import { ArticleService } from '../../article/services/article.service'; + +// ---------------------- CONSTANTS ---------------------- +const NON_SCRIPT_HOSTS = ['medium.com', 'fastcompany.com', 'fortelabs.com'] as const; +const ALLOWED_CONTENT_TYPES = ['text/html', 'application/octet-stream', 'text/plain', 'application/pdf'] as const; + +// Interfaces used by the service (kept 
outside the class for clarity) +export interface RetrievePageParams { + url: string; + locale?: string; + timezone?: string; +} + +export interface RetrievePageResult { + page?: Page; + context: BrowserContext; + finalUrl: string; + contentType: string; +} + +export interface FetchContentWithSaveInput extends FetchContentInput { + userId?: string; + saveToDatabase?: boolean; + tags?: string[]; + isBookmarked?: boolean; + isArchived?: boolean; +} + +/** + * Service responsible for fetching and parsing remote HTML / PDF content using Puppeteer. + */ +@Injectable() +export class PuppeteerParseService { + private readonly logger = new Logger(PuppeteerParseService.name); + + constructor( + private readonly browserService: BrowserService, + private readonly preHandlerService: PreHandlerService, + @Optional() private readonly articleService?: ArticleService, + ) {} + + /** + * Fetch readable content from the given URL. + */ + async fetchContent({ url, locale, timezone }: FetchContentInput): Promise { + const startedAt = Date.now(); + + url = this.normalizeUrl(url); + + // (1) Execute pre-handling using the new extensible service + const preHandleResult = await this.preHandlerService.execute(url); + + let { title, content, contentType } = preHandleResult; + url = preHandleResult.url; // URL might have been changed by a handler + + // (2) Fetch via Puppeteer when necessary + if (contentType !== 'application/pdf' && (!title || !content)) { + const pageResult = await this.retrievePage({ url, locale, timezone }); + url = pageResult.finalUrl; + contentType = pageResult.contentType; + + if (pageResult.page) { + const html = await this.retrieveHtml(pageResult.page); + title = html.title; + content = html.content; + } + + await pageResult.context?.close(); + } + + this.logger.debug(`Scraping done in ${Date.now() - startedAt} ms`); + return { finalUrl: url, title, content, contentType }; + } + + /** + * Fetch content and optionally save to database. 
+ */ + async fetchContentWithSave(input: FetchContentWithSaveInput): Promise { + const { userId, saveToDatabase, tags, isBookmarked, isArchived, ...fetchInput } = input; + + // 일반 스크래핑 수행 + const scrapedContent = await this.fetchContent(fetchInput); + + // 데이터베이스 저장 옵션이 활성화되고 ArticleService가 사용 가능한 경우 + if (saveToDatabase && userId && this.articleService) { + try { + await this.articleService.saveScrapedContent(userId, scrapedContent, { + tags, + isBookmarked, + isArchived, + }); + this.logger.debug(`Content saved to database for user ${userId}: ${scrapedContent.finalUrl}`); + } catch (error) { + this.logger.warn(`Failed to save content to database: ${(error as Error).message}`); + // 저장 실패해도 스크래핑 결과는 반환 + } + } + + return scrapedContent; + } + + // ---------------------------------------------------- + // ------------------ PRIVATE HELPERS ---------------- + // ---------------------------------------------------- + + private normalizeUrl(raw: string): string { + const extracted = this.tryParseUrl(raw); + if (!extracted) throw new InvalidUrlException('value is empty'); + this.validateUrl(extracted); + return new URL(extracted).href; + } + + private tryParseUrl(str: string): string | null { + const match = /(https?:\/\/[^\s]+)/i.exec(str); + return match?.[0] ?? 
null; + } + + private validateUrl(url: string): void { + const parsed = new URL(url); + if (!['http:', 'https:'].includes(parsed.protocol)) { + throw new InvalidUrlException('protocol must be http/https'); + } + if (['localhost', '0.0.0.0'].includes(parsed.hostname)) { + throw new InvalidUrlException('localhost not allowed'); + } + if (/^(10|172\.16|192\.168)\./.test(parsed.hostname)) { + throw new InvalidUrlException('private ip not allowed'); + } + } + + private enableJavascriptForUrl(targetUrl: string): boolean { + try { + const host = new URL(targetUrl).hostname; + return !NON_SCRIPT_HOSTS.some((h) => host.endsWith(h)); + } catch { + return true; + } + } + + // -------------------- Puppeteer -------------------- + + private async retrievePage({ url, locale, timezone }: RetrievePageParams): Promise { + const browser = await this.browserService.getBrowser(); + const context = await browser.createBrowserContext(); + const page = await context.newPage(); + + if (!this.enableJavascriptForUrl(url)) { + await page.setJavaScriptEnabled(false); + } + if (locale) await page.setExtraHTTPHeaders({ 'Accept-Language': locale }); + if (timezone && process.env['USE_FIREFOX'] !== 'true') { + await page.emulateTimezone(timezone); + } + + await this.setupNetworkInterception(page); + + const response = await page.goto(url, { timeout: 30_000, waitUntil: ['load'] }); + if (!response) throw new Error('no response from page'); + + await this.waitForDomToSettle(page); + + return { + page, + context, + finalUrl: response.url(), + contentType: response.headers()['content-type'] ?? 
'text/html', + }; + } + + private async setupNetworkInterception(page: Page): Promise { + const client = await page.createCDPSession(); + + // PDF / MIME type blocking + await client.send('Network.setRequestInterception', { + patterns: [ + { + urlPattern: '*', + resourceType: 'Document', + interceptionStage: 'HeadersReceived', + }, + ], + }); + + client.on('Network.requestIntercepted', (e: Protocol.Network.RequestInterceptedEvent) => { + void (async () => { + const headers = e.responseHeaders ?? {}; + const ctype = (headers['content-type'] ?? headers['Content-Type'] ?? '').split(';')[0].toLowerCase(); + const shouldBlock = + ctype && !ALLOWED_CONTENT_TYPES.includes(ctype as (typeof ALLOWED_CONTENT_TYPES)[number]); + + await client.send('Network.continueInterceptedRequest', { + interceptionId: e.interceptionId, + ...(shouldBlock ? { errorReason: 'BlockedByClient' } : {}), + }); + })(); + }); + + // request-level abort rules + const failed = new Set(); + page.on('request', (req) => { + void (async () => { + if (req.isInterceptResolutionHandled()) return; + const url = req.url().toLowerCase(); + if (url.endsWith('.woff2') || url.includes('mathjax') || failed.has(url)) { + await req.abort(); + return; + } + await req.continue(); + })(); + }); + + page.on('response', (res) => { + if (!res.ok()) failed.add(res.url()); + }); + } + + private async waitForDomToSettle(page: Page, timeoutMs = 5_000, debounceMs = 1_000): Promise { + await page.evaluate( + (tout, deb) => { + const debounce = (fn: () => void, ms: number) => { + let t: ReturnType; + return () => { + clearTimeout(t); + t = setTimeout(fn, ms); + }; + }; + return new Promise((resolve) => { + const mainT = setTimeout(() => { + obs.disconnect(); + resolve(); + }, tout); + + const obs = new MutationObserver( + debounce(() => { + clearTimeout(mainT); + obs.disconnect(); + resolve(); + }, deb), + ); + obs.observe(document.body, { childList: true, subtree: true, attributes: true }); + }); + }, + timeoutMs, + debounceMs, + 
); + } + + // ------------------ HTML extraction ---------------- + + private async retrieveHtml(page: Page): Promise<{ title?: string; content: string }> { + const title = await page.title(); + await page.waitForSelector('body'); + + await Promise.race([this.autoScroll(page), new Promise((r) => setTimeout(r, 5_000))]); + + const domContent = await page.evaluate(() => document.documentElement.outerHTML); + return { title, content: domContent }; + } + + private async autoScroll(page: Page): Promise { + await page.evaluate(async () => { + await new Promise((resolve) => { + let total = 0; + const distance = 500; + const timer = setInterval(() => { + window.scrollBy(0, distance); + total += distance; + if (total >= document.body.scrollHeight) { + clearInterval(timer); + resolve(); + } + }, 10); + }); + }); + } +} diff --git a/src/modules/scraper/services/puppeteer-parse.ts b/src/modules/scraper/services/puppeteer-parse.ts deleted file mode 100644 index e69de29..0000000 From f9341f51f1a25bb2759d11a125ccada886062395 Mon Sep 17 00:00:00 2001 From: reach0908 Date: Fri, 4 Jul 2025 23:07:34 +0900 Subject: [PATCH 06/28] =?UTF-8?q?feat(auth):=20JWT=20=EA=B0=80=EB=93=9C=20?= =?UTF-8?q?=EB=B0=8F=20=EC=8A=A4=ED=81=AC=EB=9E=98=ED=8D=BC=20=EC=BB=A8?= =?UTF-8?q?=ED=8A=B8=EB=A1=A4=EB=9F=AC=20=EA=B0=9C=EC=84=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * JWT 가드에서 Bearer 토큰 요구 메시지를 수정하고, 로깅 기능을 추가하여 오류 발생 시 로그 기록을 강화했습니다. * Swagger 설정에서 JWT 토큰 유지 기능을 추가하여 브라우저 새로고침 시에도 인증 상태를 유지하도록 개선했습니다. * 스크래퍼 컨트롤러에서 인증된 사용자 정보를 사용하여 콘텐츠 저장 기능을 활성화하고, 관련 DTO를 업데이트했습니다. * 여러 API 엔드포인트에서 AuthRequest 타입을 사용하여 사용자 정보를 명확히 처리하도록 변경했습니다. 
--- src/common/guards/jwt.guard.ts | 18 +- .../interceptors/logging.interceptor.ts | 31 +- src/main.ts | 9 +- src/modules/article/article.controller.ts | 30 +- src/modules/auth/services/token.service.ts | 7 +- .../handlers/domain-specific.handler.ts | 408 +++++++++++++++++- .../pre-handler/handlers/news-site.handler.ts | 2 - .../pre-handler/handlers/pdf.handler.ts | 1 - .../handlers/readability.handler.ts | 1 - .../pre-handler/handlers/rss.handler.ts | 2 - .../handlers/social-media.handler.ts | 2 - .../pre-handler/handlers/youtube.handler.ts | 2 - .../pre-handler/pre-handler.service.ts | 19 +- src/modules/scraper/scraper.controller.ts | 31 +- .../scraper/services/browser.service.ts | 58 ++- .../services/puppeteer-parse.service.ts | 31 +- 16 files changed, 561 insertions(+), 91 deletions(-) diff --git a/src/common/guards/jwt.guard.ts b/src/common/guards/jwt.guard.ts index ed759c9..6e58da5 100644 --- a/src/common/guards/jwt.guard.ts +++ b/src/common/guards/jwt.guard.ts @@ -1,4 +1,4 @@ -import { CanActivate, ExecutionContext, Injectable, UnauthorizedException } from '@nestjs/common'; +import { CanActivate, ExecutionContext, Injectable, UnauthorizedException, Logger } from '@nestjs/common'; import { JwtService } from '@nestjs/jwt'; import { Request } from 'express'; import { AuthService } from 'src/modules/auth/services/auth.service'; @@ -10,6 +10,8 @@ interface AuthenticatedRequest extends Request { @Injectable() export class JwtAuthGuard implements CanActivate { + private readonly logger = new Logger(JwtAuthGuard.name); + constructor( private readonly jwtService: JwtService, private readonly authService: AuthService, @@ -18,33 +20,37 @@ export class JwtAuthGuard implements CanActivate { async canActivate(context: ExecutionContext): Promise { const request = context.switchToHttp().getRequest(); - // Authorization 헤더에서 Bearer 토큰 추출 const authHeader = request.headers['authorization']; + if (!authHeader || !authHeader.startsWith('Bearer ')) { - throw new 
UnauthorizedException('No access token provided'); + throw new UnauthorizedException('Bearer token required'); } + const accessToken = authHeader.split(' ')[1]; + if (!accessToken) { + throw new UnauthorizedException('No access token provided'); + } + try { - // Access Token 검증 const payload = this.jwtService.verify(accessToken); if (payload.type !== TokenType.ACCESS) { throw new UnauthorizedException('Invalid token type'); } - // 사용자 정보를 request에 추가 const user = await this.authService.validateUser(payload.email); if (!user) { throw new UnauthorizedException('Invalid access token'); } - request.user = user; + request.user = user; return true; } catch (err) { if (err instanceof UnauthorizedException) { throw err; } + this.logger.error(`JWT verification failed: ${(err as Error).message}`); throw new UnauthorizedException('Invalid access token'); } } diff --git a/src/common/interceptors/logging.interceptor.ts b/src/common/interceptors/logging.interceptor.ts index 96a7266..89833d7 100644 --- a/src/common/interceptors/logging.interceptor.ts +++ b/src/common/interceptors/logging.interceptor.ts @@ -5,7 +5,6 @@ import { Request, Response } from 'express'; @Injectable() export class LoggingInterceptor implements NestInterceptor { private readonly logger = new Logger(LoggingInterceptor.name); - private readonly logFormat = '[%s] %s - %d - %dms'; private getNow(): number { if (typeof process.hrtime?.bigint === 'function') { @@ -21,29 +20,29 @@ export class LoggingInterceptor implements NestInterceptor { const startTime = this.getNow(); const clientIp = this.getClientIp(req); - // 요청 시작 로그 (비동기) - setImmediate(() => { - this.logger.log(`${method} ${url} - ${clientIp}`); - }); + // 요청 시작 로그 + this.logger.log(`${method} ${url} - ${clientIp}`); return next.handle().pipe( tap(() => { - setImmediate(() => { - const endTime = this.getNow(); - const responseTime = endTime - startTime; - const statusCode = res.statusCode; - if (statusCode >= 400 || responseTime > 1000) { - 
this.logger.warn(this.logFormat, method, url, statusCode, responseTime); - } else { - this.logger.log(`${method} ${url} - ${statusCode}`); - } - }); + const endTime = this.getNow(); + const responseTime = endTime - startTime; + const statusCode = res.statusCode; + const logMessage = `${method} ${url} - ${statusCode} - ${responseTime.toFixed(2)}ms`; + + if (statusCode >= 400 || responseTime > 1000) { + this.logger.warn(logMessage); + } else { + this.logger.log(logMessage); + } }), catchError((err: unknown) => { const endTime = this.getNow(); const responseTime = endTime - startTime; const statusCode = res.statusCode || 500; - this.logger.error(this.logFormat, method, url, statusCode, responseTime); + const logMessage = `${method} ${url} - ${statusCode} - ${responseTime.toFixed(2)}ms`; + + this.logger.error(logMessage); if (err instanceof Error) { this.logger.error(`${method} ${url} - ${err.message}`); } diff --git a/src/main.ts b/src/main.ts index c96513a..5bfebb6 100644 --- a/src/main.ts +++ b/src/main.ts @@ -44,16 +44,21 @@ async function bootstrap() { type: 'http', scheme: 'bearer', bearerFormat: 'JWT', - description: 'Input your JWT token', + description: 'Enter JWT token (without "Bearer" prefix)', name: 'Authorization', in: 'header', }, 'access-token', ) + .addSecurityRequirements('access-token') .build(); const document = SwaggerModule.createDocument(app, config); - SwaggerModule.setup('api', app, document); + SwaggerModule.setup('api', app, document, { + swaggerOptions: { + persistAuthorization: true, // 브라우저 새로고침 시에도 토큰 유지 + }, + }); } await app.listen(process.env.PORT ?? 
4000); diff --git a/src/modules/article/article.controller.ts b/src/modules/article/article.controller.ts index c224d7d..ecfbbec 100644 --- a/src/modules/article/article.controller.ts +++ b/src/modules/article/article.controller.ts @@ -20,12 +20,7 @@ import { UpdateArticleInput } from './dto/update-article.input'; import { ListArticlesInput } from './dto/list-articles.input'; import { ArticleOutput } from './dto/article.output'; import { PaginatedArticlesOutput } from './dto/paginated-articles.output'; -interface AuthenticatedRequest extends Request { - user: { - id: string; - email: string; - }; -} +import { AuthRequest } from 'src/types'; @ApiTags('articles') @Controller('articles') @@ -56,7 +51,7 @@ export class ArticleController { status: 401, description: '인증 실패', }) - async createArticle(@Request() req: any, @Body() input: CreateArticleInput): Promise { + async createArticle(@Request() req: AuthRequest, @Body() input: CreateArticleInput): Promise { return this.articleService.createArticle(req.user.id, input); } @@ -85,7 +80,10 @@ export class ArticleController { status: 401, description: '인증 실패', }) - async getArticles(@Request() req: any, @Query() query: ListArticlesInput): Promise { + async getArticles( + @Request() req: AuthRequest, + @Query() query: ListArticlesInput, + ): Promise { return this.articleService.getArticles(req.user.id, query); } @@ -110,7 +108,7 @@ export class ArticleController { status: 404, description: 'Article을 찾을 수 없음', }) - async getArticle(@Request() req: any, @Param('id') id: string): Promise { + async getArticle(@Request() req: AuthRequest, @Param('id') id: string): Promise { return this.articleService.getArticle(req.user.id, id); } @@ -137,7 +135,7 @@ export class ArticleController { description: 'Article을 찾을 수 없음', }) async updateArticle( - @Request() req: any, + @Request() req: AuthRequest, @Param('id') id: string, @Body() input: UpdateArticleInput, ): Promise { @@ -165,7 +163,7 @@ export class ArticleController { status: 404, 
description: 'Article을 찾을 수 없음', }) - async deleteArticle(@Request() req: any, @Param('id') id: string): Promise { + async deleteArticle(@Request() req: AuthRequest, @Param('id') id: string): Promise { return this.articleService.deleteArticle(req.user.id, id); } @@ -190,7 +188,7 @@ export class ArticleController { status: 404, description: 'Article을 찾을 수 없음', }) - async toggleBookmark(@Request() req: any, @Param('id') id: string): Promise { + async toggleBookmark(@Request() req: AuthRequest, @Param('id') id: string): Promise { return this.articleService.toggleBookmark(req.user.id, id); } @@ -215,7 +213,7 @@ export class ArticleController { status: 404, description: 'Article을 찾을 수 없음', }) - async toggleArchive(@Request() req: any, @Param('id') id: string): Promise { + async toggleArchive(@Request() req: AuthRequest, @Param('id') id: string): Promise { return this.articleService.toggleArchive(req.user.id, id); } @@ -244,7 +242,7 @@ export class ArticleController { status: 401, description: '인증 실패', }) - async getArticleStats(@Request() req: any): Promise<{ + async getArticleStats(@Request() req: AuthRequest): Promise<{ total: number; bookmarked: number; archived: number; @@ -273,7 +271,7 @@ export class ArticleController { status: 401, description: '인증 실패', }) - async getUserTags(@Request() req: any): Promise { + async getUserTags(@Request() req: AuthRequest): Promise { return this.articleService.getUserTags(req.user.id); } @@ -306,7 +304,7 @@ export class ArticleController { status: 401, description: '인증 실패', }) - async checkUrl(@Request() req: any, @Query('url') url: string): Promise<{ exists: boolean; url: string }> { + async checkUrl(@Request() req: AuthRequest, @Query('url') url: string): Promise<{ exists: boolean; url: string }> { const exists = await this.articleService.isUrlAlreadySaved(req.user.id, url); return { exists, url }; } diff --git a/src/modules/auth/services/token.service.ts b/src/modules/auth/services/token.service.ts index 70c44bd..bfc223b 100644 
--- a/src/modules/auth/services/token.service.ts +++ b/src/modules/auth/services/token.service.ts @@ -104,9 +104,7 @@ export class TokenService { await this.removeRefreshToken(refreshToken); return await this.generateTokenPair(user); - } catch (error: unknown) { - // 디버깅을 위한 로그만 남기고, 클라이언트에는 일관된 메시지 반환 - this.logger.debug('Token refresh failed', error); + } catch { throw new UnauthorizedException('Invalid refresh token'); } } @@ -124,16 +122,13 @@ export class TokenService { tx, ); }); - this.logger.debug(`RefreshToken 저장: ${userId}`); } async removeRefreshToken(token: string): Promise { await this.refreshTokenRepository.deleteMany({ where: { token } }); - this.logger.debug(`RefreshToken 삭제: ${token}`); } async logout(userId: string): Promise { await this.refreshTokenRepository.deleteMany({ where: { userId } }); - this.logger.debug(`모든 RefreshToken 삭제(로그아웃): ${userId}`); } } diff --git a/src/modules/pre-handler/handlers/domain-specific.handler.ts b/src/modules/pre-handler/handlers/domain-specific.handler.ts index 19b6206..b9e6a20 100644 --- a/src/modules/pre-handler/handlers/domain-specific.handler.ts +++ b/src/modules/pre-handler/handlers/domain-specific.handler.ts @@ -1,6 +1,7 @@ import { Injectable, Logger } from '@nestjs/common'; import { IContentHandler } from '../interfaces/content-handler.interface'; import { PreHandleResult } from '../dto/pre-handle-result.dto'; +import { JSDOM } from 'jsdom'; /** * A map of domain names to their URL transformation functions. @@ -17,9 +18,12 @@ const DOMAIN_TRANSFORMATIONS: Record URL> = { return newUrl; }, 'medium.com': (url) => { - // Use a proxy/reader service to bypass paywalls and pop-ups, - // mirroring the approach used by Omnivore for enhanced compatibility. 
- return new URL(`https://r.jina.ai/${url.href}`); + // Clean up Medium URLs by removing tracking parameters + const newUrl = new URL(url.href); + newUrl.searchParams.delete('source'); + newUrl.searchParams.delete('gi'); + newUrl.searchParams.delete('sk'); + return newUrl; }, // Developer platforms (non-social aspects) @@ -45,6 +49,31 @@ const DOMAIN_TRANSFORMATIONS: Record URL> = { newUrl.hostname = newUrl.hostname.replace('en.wikipedia.org', 'm.wikipedia.org'); return newUrl; }, + + // Korean platforms + 'blog.naver.com': (url) => { + // Naver Blog: Use mobile version for better content extraction + // Convert https://blog.naver.com/username/postid to https://m.blog.naver.com/username/postid + const newUrl = new URL(url.href); + newUrl.hostname = 'm.blog.naver.com'; + return newUrl; + }, + 'cafe.naver.com': (url) => { + // Naver Cafe: Use mobile version + const newUrl = new URL(url.href); + newUrl.hostname = 'm.cafe.naver.com'; + return newUrl; + }, + 'post.naver.com': (url) => { + // Naver Post: Use mobile version + const newUrl = new URL(url.href); + newUrl.hostname = 'm.post.naver.com'; + return newUrl; + }, + 'tistory.com': (url) => { + // Tistory Blog: Keep original, usually accessible + return url; + }, 'stackoverflow.com': (url) => { // Stack Overflow: Keep original, it's usually accessible return url; @@ -340,24 +369,385 @@ export class DomainSpecificHandler implements IContentHandler { * @param url - The URL of the content to handle. * @returns A `PreHandleResult` with the new URL, or `null` on failure. 
*/ - public handle(url: URL): Promise { + public async handle(url: URL): Promise { const domain = Object.keys(DOMAIN_TRANSFORMATIONS).find((d) => url.hostname.endsWith(d)); if (!domain) { - return Promise.resolve(null); + return null; } try { const transform = DOMAIN_TRANSFORMATIONS[domain]; const newUrl = transform(url); - this.logger.debug(`Transformed [${domain}] URL to: ${newUrl.href}`); - return Promise.resolve({ + // Extract title and content from the original URL for specific domains + let title: string | undefined; + let content: string | undefined; + let contentType = 'text/html'; // Default content type + + if (domain === 'medium.com') { + const mediumResult = await this.extractMediumContent(newUrl); + title = mediumResult.title; + content = mediumResult.content; + contentType = 'text/html'; // Medium content comes as HTML + } else if (domain === 'blog.naver.com') { + title = await this.extractNaverBlogTitle(url); + } else if (domain === 'substack.com') { + title = this.extractSubstackTitle(url); + } else if (domain === 'github.com') { + title = this.extractGitHubTitle(url); + } else if (domain === 'stackoverflow.com') { + title = this.extractStackOverflowTitle(url); + } else if (domain === 'wikipedia.org') { + title = this.extractWikipediaTitle(url); + } + + return { url: newUrl.href, - }); + title, + content, + contentType, + }; } catch (error) { this.logger.warn(`DomainSpecificHandler failed for ${url.href}: ${(error as Error).message}`); - return Promise.resolve(null); + return null; + } + } + + /** + * Extracts title and content from Medium URL by fetching and parsing HTML directly. + * @param url - The cleaned Medium URL. + * @returns The extracted title and content. 
+ */ + private async extractMediumContent(url: URL): Promise<{ + title?: string; + content?: string; + }> { + try { + this.logger.debug(`Extracting Medium content from: ${url.href}`); + + // Create an AbortController for timeout + const controller = new AbortController(); + const timeoutId = setTimeout(() => controller.abort(), 15000); // 15 second timeout + + const response = await fetch(url.href, { + headers: { + 'User-Agent': + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', + Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'Accept-Language': 'en-US,en;q=0.5', + 'Accept-Encoding': 'gzip, deflate', + Connection: 'keep-alive', + 'Cache-Control': 'no-cache', + }, + redirect: 'follow', + signal: controller.signal, + }); + + clearTimeout(timeoutId); + + if (!response.ok) { + throw new Error(`HTTP ${response.status}: ${response.statusText}`); + } + + const html = await response.text(); + this.logger.debug(`Successfully fetched Medium HTML, length: ${html.length}`); + + const dom = new JSDOM(html); + const document = dom.window.document; + + // Extract title + const title = document.title?.trim() || this.extractTitleFromUrl(url); + + // Process images: convert picture tags to img tags with optimal source + this.optimizeMediumImages(document); + + // Extract the main content + const content = document.body?.outerHTML; + + this.logger.log(`Successfully extracted Medium content: title="${title?.substring(0, 50)}"`); + + return { + title, + content, + }; + } catch (error) { + this.logger.warn(`Failed to extract Medium content from ${url.href}: ${(error as Error).message}`); + + // Fallback to URL-based title extraction + const fallbackTitle = this.extractTitleFromUrl(url); + return { + title: fallbackTitle, + content: undefined, + }; } } + + /** + * Optimizes Medium images by converting picture tags to img tags. + * Selects the largest image from srcSet for better quality. 
+ * @param document - The DOM document to process. + */ + private optimizeMediumImages(document: Document): void { + const pictures = document.querySelectorAll('picture'); + + pictures.forEach((picture) => { + const source = picture.querySelector('source'); + if (source) { + const srcSet = source.getAttribute('srcSet'); + + if (srcSet) { + // Parse srcSet and sort by image width (descending) + const sources = srcSet + .split(', ') + .map((src) => src.trim().split(' ')) + .filter((parts) => parts.length >= 2) + .sort((a, b) => { + const widthA = Number(a[1].replace('w', '')); + const widthB = Number(b[1].replace('w', '')); + return widthB - widthA; // Sort descending (largest first) + }); + + // Use the largest image from the source set + if (sources.length > 0 && sources[0].length > 0) { + const imageUrl = sources[0][0]; + const img = document.createElement('img'); + img.src = imageUrl; + + // Copy any existing attributes from the picture element + const existingImg = picture.querySelector('img'); + if (existingImg) { + if (existingImg.alt) img.alt = existingImg.alt; + if (existingImg.title) img.title = existingImg.title; + } + + // Replace picture with img + picture.parentNode?.replaceChild(img, picture); + } + } + } + }); + } + + /** + * Extracts title from Naver Blog URL by fetching and parsing meta tags. + * @param url - The Naver Blog URL. + * @returns The extracted title or undefined. 
+ */ + private async extractNaverBlogTitle(url: URL): Promise { + try { + // Try to extract from the original URL first + const metaInfo = await this.fetchNaverBlogMeta(url.href); + + if (metaInfo.title) { + this.logger.log(`Successfully extracted Naver Blog title: ${metaInfo.title}`); + return metaInfo.title; + } + + // Fallback to URL-based extraction + const fallbackTitle = this.extractNaverBlogTitleFromUrl(url); + this.logger.debug(`Using fallback title for Naver Blog: ${fallbackTitle}`); + return fallbackTitle; + } catch (error) { + this.logger.warn(`Failed to extract Naver Blog title from ${url.href}: ${(error as Error).message}`); + return this.extractNaverBlogTitleFromUrl(url); + } + } + + /** + * Fetches Naver Blog HTML with special headers and extracts meta information. + * @param urlString - The Naver Blog URL to fetch. + * @returns Meta information object. + */ + private async fetchNaverBlogMeta(urlString: string): Promise<{ + title?: string; + description?: string; + }> { + // Special headers for Naver Blog + const response = await fetch(urlString, { + headers: { + 'User-Agent': + 'Mozilla/5.0 (iPhone; CPU iPhone OS 14_7_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Mobile/15E148 Safari/604.1', + Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'Accept-Language': 'ko-KR,ko;q=0.9,en;q=0.8', + 'Accept-Encoding': 'gzip, deflate', + Connection: 'keep-alive', + Referer: 'https://blog.naver.com/', + }, + redirect: 'follow', + }); + + if (!response.ok) { + throw new Error(`HTTP ${response.status}: ${response.statusText}`); + } + + const html = await response.text(); + this.logger.debug(`Successfully fetched Naver Blog HTML, length: ${html.length}`); + + const dom = new JSDOM(html); + const document = dom.window.document; + + // Extract meta information + const title = document.querySelector('title')?.textContent?.trim(); + const description = 
document.querySelector('meta[name="description"]')?.getAttribute('content')?.trim(); + const ogTitle = document.querySelector('meta[property="og:title"]')?.getAttribute('content')?.trim(); + const ogDescription = document + .querySelector('meta[property="og:description"]') + ?.getAttribute('content') + ?.trim(); + + // Clean up Naver Blog title (remove ":" and blog name) + let cleanTitle = ogTitle || title; + if (cleanTitle) { + // Remove common Naver Blog suffixes + cleanTitle = cleanTitle.replace(/\s*:\s*네이버 블로그$/, ''); + cleanTitle = cleanTitle.replace(/\s*\|\s*네이버 블로그$/, ''); + cleanTitle = cleanTitle.trim(); + } + + this.logger.debug(`Extracted Naver Blog meta: title="${cleanTitle?.substring(0, 50)}"`); + + return { + title: cleanTitle, + description: ogDescription || description, + }; + } + + /** + * Fallback method to extract title from Naver Blog URL pattern. + * @param url - The Naver Blog URL to extract title from. + * @returns The extracted title or undefined. + */ + private extractNaverBlogTitleFromUrl(url: URL): string | undefined { + // Naver Blog URL pattern: https://blog.naver.com/username/postid + const pathParts = url.pathname.split('/').filter((part) => part.length > 0); + + if (pathParts.length >= 2) { + const username = pathParts[0]; + const postId = pathParts[1]; + return `${username}의 블로그 - ${postId}`; + } + + return undefined; + } + + /** + * Fallback method to extract title from URL pattern. + * @param url - The URL to extract title from. + * @returns The extracted title or undefined. 
+ */ + private extractTitleFromUrl(url: URL): string | undefined { + // Medium URL patterns: + // https://medium.com/@username/article-title-123abc + // https://medium.com/publication/article-title-123abc + // https://username.medium.com/article-title-123abc + + const pathParts = url.pathname.split('/').filter((part) => part.length > 0); + + if (pathParts.length >= 2) { + // Get the last part which should be the article slug + const articleSlug = pathParts[pathParts.length - 1]; + + // Remove hash-like ending (e.g., -123abc) + const cleanSlug = articleSlug.replace(/-[a-f0-9]{6,}$/i, ''); + + // Convert slug to title + const title = cleanSlug + .split('-') + .map((word) => word.charAt(0).toUpperCase() + word.slice(1)) + .join(' '); + + return title.length > 5 ? title : undefined; + } + + return undefined; + } + + /** + * Extracts title from Substack URL. + * @param url - The Substack URL. + * @returns The extracted title or undefined. + */ + private extractSubstackTitle(url: URL): string | undefined { + const pathParts = url.pathname.split('/').filter((part) => part.length > 0); + + if (pathParts.length > 0 && pathParts[0] === 'p') { + // Substack post URL: /p/article-title + const articleSlug = pathParts[1]; + if (articleSlug) { + return articleSlug + .split('-') + .map((word) => word.charAt(0).toUpperCase() + word.slice(1)) + .join(' '); + } + } + + return undefined; + } + + /** + * Extracts title from GitHub URL. + * @param url - The GitHub URL. + * @returns The extracted title or undefined. 
+   */
+  private extractGitHubTitle(url: URL): string | undefined {
+    const pathParts = url.pathname.split('/').filter((part) => part.length > 0);
+
+    // Both an owner and a repository segment are required.
+    if (pathParts.length < 2) {
+      return undefined;
+    }
+
+    const owner = pathParts[0];
+    const repo = pathParts[1];
+
+    // File URL: /owner/repo/blob/<branch>/<path/to/file.md>
+    // At least five segments are needed for a file to be present; with only
+    // four (/owner/repo/blob/<branch>) the last segment is the branch name,
+    // which must not be reported as a file name.
+    if (pathParts.length >= 5 && pathParts[2] === 'blob') {
+      const fileName = pathParts[pathParts.length - 1];
+      return `${owner}/${repo}: ${fileName}`;
+    }
+
+    // Repository URL (or any other page below it): /owner/repo
+    return `${owner}/${repo}`;
+  }
+
+  /**
+   * Extracts title from Stack Overflow URL.
+   *
+   * Only question paths of the form `/questions/<id>/<slug>` are recognised.
+   * The slug's hyphen-separated words are capitalised and joined with single
+   * spaces.
+   *
+   * @param url - The Stack Overflow URL.
+   * @returns The title built from the slug, or undefined when the path is not
+   *   a question path that carries a slug.
+   */
+  private extractStackOverflowTitle(url: URL): string | undefined {
+    const pathParts = url.pathname.split('/').filter((part) => part.length > 0);
+
+    // Stack Overflow question URL: /questions/123456/question-title
+    if (pathParts.length < 3 || pathParts[0] !== 'questions') {
+      return undefined;
+    }
+
+    const titleSlug = pathParts[2];
+    if (!titleSlug) {
+      return undefined;
+    }
+
+    // Drop empty words produced by consecutive hyphens before capitalising.
+    return titleSlug
+      .split('-')
+      .filter((word) => word.length > 0)
+      .map((word) => word.charAt(0).toUpperCase() + word.slice(1))
+      .join(' ');
+  }
+
+  /**
+   * Extracts title from Wikipedia URL.
+   * @param url - The Wikipedia URL.
+   * @returns The extracted title or undefined.
+ */ + private extractWikipediaTitle(url: URL): string | undefined { + const pathParts = url.pathname.split('/').filter((part) => part.length > 0); + + if (pathParts.length >= 2 && pathParts[0] === 'wiki') { + // Wikipedia article URL: /wiki/Article_Title + const articleTitle = pathParts[1]; + if (articleTitle) { + return decodeURIComponent(articleTitle.replace(/_/g, ' ')); + } + } + + return undefined; + } } diff --git a/src/modules/pre-handler/handlers/news-site.handler.ts b/src/modules/pre-handler/handlers/news-site.handler.ts index 5b1052c..0d07313 100644 --- a/src/modules/pre-handler/handlers/news-site.handler.ts +++ b/src/modules/pre-handler/handlers/news-site.handler.ts @@ -225,8 +225,6 @@ export class NewsSiteHandler implements IContentHandler { const transform = NEWS_SITE_TRANSFORMATIONS[domain]; const newUrl = transform(url); - this.logger.debug(`Transformed news site URL [${domain}]: ${url.href} -> ${newUrl.href}`); - // Extract potential title from URL let title: string | undefined; const siteName = this.getSiteName(domain); diff --git a/src/modules/pre-handler/handlers/pdf.handler.ts b/src/modules/pre-handler/handlers/pdf.handler.ts index 4e9c4ce..b0868b8 100644 --- a/src/modules/pre-handler/handlers/pdf.handler.ts +++ b/src/modules/pre-handler/handlers/pdf.handler.ts @@ -37,7 +37,6 @@ export class PdfHandler implements IContentHandler { try { // For PDF files, we don't extract content here but mark the content type // The main service will handle PDF extraction using appropriate tools - this.logger.debug(`Detected PDF file: ${url.href}`); // Try to extract title from URL path let title: string | undefined; diff --git a/src/modules/pre-handler/handlers/readability.handler.ts b/src/modules/pre-handler/handlers/readability.handler.ts index a60f5c5..ee63d4a 100644 --- a/src/modules/pre-handler/handlers/readability.handler.ts +++ b/src/modules/pre-handler/handlers/readability.handler.ts @@ -38,7 +38,6 @@ export class ReadabilityHandler implements 
IContentHandler { const article = reader.parse(); if (!article?.content) { - this.logger.debug(`Readability could not find content for: ${url.href}`); return null; } diff --git a/src/modules/pre-handler/handlers/rss.handler.ts b/src/modules/pre-handler/handlers/rss.handler.ts index d0e654d..7c3966f 100644 --- a/src/modules/pre-handler/handlers/rss.handler.ts +++ b/src/modules/pre-handler/handlers/rss.handler.ts @@ -46,8 +46,6 @@ export class RssHandler implements IContentHandler { */ public handle(url: URL): Promise { try { - this.logger.debug(`Detected RSS/Atom feed: ${url.href}`); - // Try to extract title from URL or domain let title: string | undefined; diff --git a/src/modules/pre-handler/handlers/social-media.handler.ts b/src/modules/pre-handler/handlers/social-media.handler.ts index 0c5f8e8..dc9530d 100644 --- a/src/modules/pre-handler/handlers/social-media.handler.ts +++ b/src/modules/pre-handler/handlers/social-media.handler.ts @@ -160,8 +160,6 @@ export class SocialMediaHandler implements IContentHandler { const transform = SOCIAL_MEDIA_TRANSFORMATIONS[domain]; const newUrl = transform(url); - this.logger.debug(`Transformed social media URL [${domain}]: ${url.href} -> ${newUrl.href}`); - // Extract potential title from URL let title: string | undefined; const platform = this.getPlatformName(domain); diff --git a/src/modules/pre-handler/handlers/youtube.handler.ts b/src/modules/pre-handler/handlers/youtube.handler.ts index 67146c2..95d3d9e 100644 --- a/src/modules/pre-handler/handlers/youtube.handler.ts +++ b/src/modules/pre-handler/handlers/youtube.handler.ts @@ -45,8 +45,6 @@ export class YoutubeHandler implements IContentHandler { return Promise.resolve(null); } - this.logger.debug(`Processing YouTube video: ${videoId}`); - // For now, we'll return basic information // In a full implementation, you might want to: // 1. 
Fetch video metadata from YouTube API diff --git a/src/modules/pre-handler/pre-handler.service.ts b/src/modules/pre-handler/pre-handler.service.ts index 27824d3..7659c42 100644 --- a/src/modules/pre-handler/pre-handler.service.ts +++ b/src/modules/pre-handler/pre-handler.service.ts @@ -28,23 +28,30 @@ export class PreHandlerService { let currentUrl = new URL(urlString); const finalResult: PreHandleResult = { url: urlString }; + this.logger.debug(`Starting pre-handler execution for: ${urlString}`); + for (const handler of this.handlers) { if (handler.canHandle(currentUrl)) { - this.logger.debug(`Attempting to use handler: ${handler.constructor.name}`); + this.logger.debug(`Handler ${handler.constructor.name} can handle ${currentUrl.href}`); const result = await handler.handle(currentUrl); if (result) { // URL이 핸들러에 의해 변경되었는지 확인하고 업데이트합니다. if (result.url && result.url !== currentUrl.href) { - this.logger.debug(`URL transformed by ${handler.constructor.name} to: ${result.url}`); currentUrl = new URL(result.url); finalResult.url = result.url; + this.logger.log(`URL transformed by ${handler.constructor.name}: ${urlString} → ${result.url}`); + } + + // 타이틀이 있으면 설정합니다. + if (result.title) { + finalResult.title = result.title; + this.logger.log(`Title extracted by ${handler.constructor.name}: ${result.title}`); } // 콘텐츠가 성공적으로 추출되면 즉시 반환합니다. if (result.content) { - this.logger.log(`Successfully handled by ${handler.constructor.name}`); - finalResult.title = result.title; + this.logger.log(`Content extracted by ${handler.constructor.name}`); finalResult.content = result.content; finalResult.contentType = result.contentType; return finalResult; @@ -53,7 +60,9 @@ export class PreHandlerService { } } - this.logger.debug('No suitable handler found. Returning final result.'); + this.logger.debug( + `Pre-handler execution completed. Final result: url=${finalResult.url}, title=${finalResult.title}`, + ); return finalResult; // 콘텐츠가 없더라도, 변환된 URL이 포함될 수 있는 최종 결과를 반환합니다. 
} } diff --git a/src/modules/scraper/scraper.controller.ts b/src/modules/scraper/scraper.controller.ts index 89b006d..898f873 100644 --- a/src/modules/scraper/scraper.controller.ts +++ b/src/modules/scraper/scraper.controller.ts @@ -1,6 +1,19 @@ -import { Controller, Post, Body, Get, Query, BadRequestException, Res, Header } from '@nestjs/common'; -import { ApiTags, ApiOperation, ApiResponse, ApiBody, ApiQuery } from '@nestjs/swagger'; +import { + Controller, + Post, + Body, + Get, + Query, + BadRequestException, + Res, + Header, + UseGuards, + Request, +} from '@nestjs/common'; +import { ApiTags, ApiOperation, ApiResponse, ApiBody, ApiQuery, ApiBearerAuth } from '@nestjs/swagger'; +import { JwtAuthGuard } from 'src/common/guards/jwt.guard'; import { Response } from 'express'; +import { AuthRequest } from 'src/types'; import { PuppeteerParseService, FetchContentWithSaveInput } from './services/puppeteer-parse.service'; import { FetchContentInput } from './dto/fetch-content.input'; import { ScrapedContentOutput } from './dto/scraped-content.output'; @@ -53,6 +66,8 @@ export class ScraperController { * 웹 콘텐츠 스크래핑 및 저장 (인증 필요) */ @Post('save-content') + @UseGuards(JwtAuthGuard) + @ApiBearerAuth('access-token') @ApiOperation({ summary: '웹 콘텐츠 스크래핑 및 저장', description: '웹 콘텐츠를 스크래핑하고 사용자 계정에 저장합니다. 
인증이 필요합니다.', @@ -97,13 +112,16 @@ export class ScraperController { status: 500, description: '서버 오류', }) - async saveContent(@Body() input: Record): Promise { + async saveContent( + @Request() req: AuthRequest, + @Body() input: Record, + ): Promise { // URL 필수 필드 검증 if (!input.url) { throw new BadRequestException('URL is required'); } - // 간단한 구현: 인증 없이도 동작하도록 함 + // 인증된 사용자 정보 사용 const fetchInput: FetchContentWithSaveInput = { url: input.url as string, locale: input.locale as string | undefined, @@ -111,11 +129,12 @@ export class ScraperController { tags: input.tags as string[] | undefined, isBookmarked: input.isBookmarked as boolean | undefined, isArchived: input.isArchived as boolean | undefined, - saveToDatabase: false, // 인증 구현 전까지는 저장 비활성화 + saveToDatabase: true, // 저장 활성화 + userId: req.user.id, // 인증된 사용자 ID 사용 }; const result = await this.puppeteerParseService.fetchContentWithSave(fetchInput); - return { ...result, saved: false }; + return { ...result, saved: true }; } /** diff --git a/src/modules/scraper/services/browser.service.ts b/src/modules/scraper/services/browser.service.ts index 82f0b75..32cb6d1 100644 --- a/src/modules/scraper/services/browser.service.ts +++ b/src/modules/scraper/services/browser.service.ts @@ -2,8 +2,6 @@ import { Injectable, Logger, OnApplicationShutdown } from '@nestjs/common'; import { ConfigService } from '@nestjs/config'; import puppeteer from 'puppeteer-extra'; import { Browser } from 'puppeteer-core'; -import StealthPlugin from 'puppeteer-extra-plugin-stealth'; -import AdblockerPlugin from 'puppeteer-extra-plugin-adblocker'; @Injectable() export class BrowserService implements OnApplicationShutdown { @@ -11,9 +9,55 @@ export class BrowserService implements OnApplicationShutdown { private browser: Browser | null = null; constructor(private readonly configService: ConfigService) { - // 플러그인은 Chrome 전용이므로 생성자에서 한 번만 등록 - puppeteer.use(StealthPlugin()); - puppeteer.use(AdblockerPlugin({ blockTrackers: true })); + 
this.initializePlugins(); + } + + /** + * 플러그인을 안전하게 초기화합니다. + */ + private initializePlugins(): void { + try { + // 동적 임포트를 사용한 안전한 플러그인 로딩 + const usePlugins = this.configService.get('USE_BROWSER_PLUGINS', false); + + if (usePlugins) { + this.loadPluginsSafely(); + } else { + this.logger.log('Browser plugins disabled by configuration'); + } + } catch (error) { + this.logger.warn('Failed to initialize browser plugins, running without plugins:', error); + } + } + + /** + * 플러그인을 안전하게 로드합니다. + */ + /* eslint-disable @typescript-eslint/no-unsafe-assignment, @typescript-eslint/no-unsafe-member-access, @typescript-eslint/no-unsafe-call, @typescript-eslint/no-var-requires, @typescript-eslint/no-require-imports */ + private loadPluginsSafely(): void { + try { + const StealthPlugin = require('puppeteer-extra-plugin-stealth'); + const AdblockerPlugin = require('puppeteer-extra-plugin-adblocker'); + + // 플러그인이 함수인지 확인 + if (typeof StealthPlugin === 'function') { + puppeteer.use(StealthPlugin()); + this.logger.log('StealthPlugin loaded successfully'); + } else if (StealthPlugin?.default && typeof StealthPlugin.default === 'function') { + puppeteer.use(StealthPlugin.default()); + this.logger.log('StealthPlugin loaded successfully (default export)'); + } + + if (typeof AdblockerPlugin === 'function') { + puppeteer.use(AdblockerPlugin({ blockTrackers: true })); + this.logger.log('AdblockerPlugin loaded successfully'); + } else if (AdblockerPlugin?.default && typeof AdblockerPlugin.default === 'function') { + puppeteer.use(AdblockerPlugin.default({ blockTrackers: true })); + this.logger.log('AdblockerPlugin loaded successfully (default export)'); + } + } catch (error) { + this.logger.warn('Failed to load browser plugins:', error); + } } /** @@ -49,6 +93,10 @@ export class BrowserService implements OnApplicationShutdown { '--disable-background-networking', '--disable-gpu', '--disable-software-rasterizer', + // 동시성 제한 추가 + '--max-web-media-player-count=1', + 
'--disable-features=TranslateUI', + '--disable-ipc-flooding-protection', ], defaultViewport: { deviceScaleFactor: 1, diff --git a/src/modules/scraper/services/puppeteer-parse.service.ts b/src/modules/scraper/services/puppeteer-parse.service.ts index fdaa0cc..da93332 100644 --- a/src/modules/scraper/services/puppeteer-parse.service.ts +++ b/src/modules/scraper/services/puppeteer-parse.service.ts @@ -1,4 +1,4 @@ -import { Injectable, Logger, Optional } from '@nestjs/common'; +import { Injectable, Logger } from '@nestjs/common'; import { BrowserContext, Page, Protocol } from 'puppeteer-core'; import { BrowserService } from './browser.service'; import { FetchContentInput } from '../dto/fetch-content.input'; @@ -43,15 +43,13 @@ export class PuppeteerParseService { constructor( private readonly browserService: BrowserService, private readonly preHandlerService: PreHandlerService, - @Optional() private readonly articleService?: ArticleService, + private readonly articleService: ArticleService, ) {} /** * Fetch readable content from the given URL. 
*/ async fetchContent({ url, locale, timezone }: FetchContentInput): Promise { - const startedAt = Date.now(); - url = this.normalizeUrl(url); // (1) Execute pre-handling using the new extensible service @@ -60,22 +58,33 @@ export class PuppeteerParseService { let { title, content, contentType } = preHandleResult; url = preHandleResult.url; // URL might have been changed by a handler + // Log pre-handling results + if (content) { + this.logger.log(`Pre-handler extracted content (${content.length} chars) from: ${url}`); + } + if (title) { + this.logger.log(`Pre-handler extracted title: ${title}`); + } + // (2) Fetch via Puppeteer when necessary if (contentType !== 'application/pdf' && (!title || !content)) { + this.logger.debug(`Puppeteer fallback required for: ${url}`); const pageResult = await this.retrievePage({ url, locale, timezone }); url = pageResult.finalUrl; contentType = pageResult.contentType; if (pageResult.page) { const html = await this.retrieveHtml(pageResult.page); - title = html.title; - content = html.content; + // 사전 처리에서 이미 타이틀이 있다면 유지, 없다면 HTML에서 추출 + title = title || html.title; + content = content || html.content; } await pageResult.context?.close(); + } else if (content) { + this.logger.log(`Using pre-processed content, skipping Puppeteer for: ${url}`); } - this.logger.debug(`Scraping done in ${Date.now() - startedAt} ms`); return { finalUrl: url, title, content, contentType }; } @@ -88,15 +97,14 @@ export class PuppeteerParseService { // 일반 스크래핑 수행 const scrapedContent = await this.fetchContent(fetchInput); - // 데이터베이스 저장 옵션이 활성화되고 ArticleService가 사용 가능한 경우 - if (saveToDatabase && userId && this.articleService) { + // 데이터베이스 저장 옵션이 활성화된 경우 + if (saveToDatabase && userId) { try { await this.articleService.saveScrapedContent(userId, scrapedContent, { tags, isBookmarked, isArchived, }); - this.logger.debug(`Content saved to database for user ${userId}: ${scrapedContent.finalUrl}`); } catch (error) { this.logger.warn(`Failed to save content to 
database: ${(error as Error).message}`); // 저장 실패해도 스크래핑 결과는 반환 @@ -175,6 +183,9 @@ export class PuppeteerParseService { } private async setupNetworkInterception(page: Page): Promise { + // Request interception 활성화 (page-level) + await page.setRequestInterception(true); + const client = await page.createCDPSession(); // PDF / MIME type blocking From bc8d5a1b38081ea8ad3b9f5bc7bd19df3a4120d9 Mon Sep 17 00:00:00 2001 From: reach0908 Date: Fri, 4 Jul 2025 23:19:38 +0900 Subject: [PATCH 07/28] =?UTF-8?q?feat(readability):=20Readability=20?= =?UTF-8?q?=ED=95=B8=EB=93=A4=EB=9F=AC=20=EB=B0=8F=20PuppeteerParseService?= =?UTF-8?q?=20=EA=B0=9C=EC=84=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * ReadabilityHandler에서 JSDOM 설정을 추가하여 스크립트 실행 및 리소스 로딩을 허용했습니다. * PuppeteerParseService에 Readability를 적용하여 HTML 콘텐츠에서 본문만 추출하는 기능을 추가했습니다. * 콘텐츠 추출 성공 및 실패 시 로깅 기능을 강화하여 디버깅을 용이하게 했습니다. --- .../handlers/readability.handler.ts | 8 +++++ .../services/puppeteer-parse.service.ts | 34 ++++++++++++++++++- 2 files changed, 41 insertions(+), 1 deletion(-) diff --git a/src/modules/pre-handler/handlers/readability.handler.ts b/src/modules/pre-handler/handlers/readability.handler.ts index ee63d4a..0d1f655 100644 --- a/src/modules/pre-handler/handlers/readability.handler.ts +++ b/src/modules/pre-handler/handlers/readability.handler.ts @@ -32,15 +32,23 @@ export class ReadabilityHandler implements IContentHandler { try { const dom = await JSDOM.fromURL(url.href, { userAgent: this.USER_AGENT, + resources: 'usable', + runScripts: 'dangerously', + pretendToBeVisual: true, }); + await new Promise((resolve) => setTimeout(resolve, 2000)); + const reader = new Readability(dom.window.document); const article = reader.parse(); if (!article?.content) { + this.logger.debug(`No readable content found for ${url.href}`); return null; } + this.logger.log(`Successfully extracted readable content: ${article.content.length} chars`); + return { url: url.href, title: 
article.title ?? undefined, diff --git a/src/modules/scraper/services/puppeteer-parse.service.ts b/src/modules/scraper/services/puppeteer-parse.service.ts index da93332..9184274 100644 --- a/src/modules/scraper/services/puppeteer-parse.service.ts +++ b/src/modules/scraper/services/puppeteer-parse.service.ts @@ -1,5 +1,7 @@ import { Injectable, Logger } from '@nestjs/common'; import { BrowserContext, Page, Protocol } from 'puppeteer-core'; +import { JSDOM } from 'jsdom'; +import { Readability } from '@mozilla/readability'; import { BrowserService } from './browser.service'; import { FetchContentInput } from '../dto/fetch-content.input'; import { ScrapedContentOutput } from '../dto/scraped-content.output'; @@ -77,7 +79,11 @@ export class PuppeteerParseService { const html = await this.retrieveHtml(pageResult.page); // 사전 처리에서 이미 타이틀이 있다면 유지, 없다면 HTML에서 추출 title = title || html.title; - content = content || html.content; + + // 🔧 해결: HTML 콘텐츠에 Readability 적용하여 불필요한 정보 제거 + if (html.content) { + content = await this.applyReadabilityToHtml(html.content, url); + } } await pageResult.context?.close(); @@ -291,4 +297,30 @@ export class PuppeteerParseService { }); }); } + + /** + * HTML 콘텐츠에 Readability를 적용하여 본문만 추출합니다. 
+ * @param html - 전체 HTML 콘텐츠 + * @param url - 원본 URL (상대 링크 처리용) + * @returns 정제된 콘텐츠 또는 원본 HTML (실패 시) + */ + private async applyReadabilityToHtml(html: string, url: string): Promise { + try { + const dom = new JSDOM(html, { url }); + const reader = new Readability(dom.window.document); + const article = reader.parse(); + + if (article?.content) { + this.logger.log(`Successfully extracted readable content from HTML (${article.content.length} chars)`); + return article.content; + } else { + this.logger.warn(`Readability failed to extract content from HTML, using original`); + } + } catch (error) { + this.logger.warn(`Failed to apply Readability to HTML: ${(error as Error).message}`); + } + + // 실패 시 원본 반환 + return html; + } } From a65e36b0a7f68cc9af6ab1233f34cec11e2b42b7 Mon Sep 17 00:00:00 2001 From: reach0908 Date: Fri, 4 Jul 2025 23:33:24 +0900 Subject: [PATCH 08/28] =?UTF-8?q?feat(article):=20Article=20=EB=AA=A8?= =?UTF-8?q?=EB=8D=B8=EC=97=90=20=EB=B3=B5=ED=95=A9=20=EA=B3=A0=EC=9C=A0=20?= =?UTF-8?q?=EC=A0=9C=EC=95=BD=EC=A1=B0=EA=B1=B4=20=EC=B6=94=EA=B0=80=20?= =?UTF-8?q?=EB=B0=8F=20=EC=97=85=EB=8D=B0=EC=9D=B4=ED=8A=B8=20=EB=A1=9C?= =?UTF-8?q?=EC=A7=81=20=EA=B0=9C=EC=84=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Article 모델에 id와 userId를 조합한 복합 고유 제약조건을 추가하여 데이터 무결성을 강화했습니다. * Article 업데이트 로직에서 오류 발생 시 경고 로그를 추가하여 디버깅을 용이하게 했습니다. * Naver 블로그 콘텐츠 추출 기능을 개선하여 타이틀 및 콘텐츠를 보다 정확하게 처리하도록 했습니다. 
--- .../migration.sql | 8 + prisma/schema.prisma | 1 + .../repositories/article.repository.ts | 38 +-- .../handlers/domain-specific.handler.ts | 163 ++++++++++++- .../handlers/readability.handler.ts | 225 +++++++++++++++++- 5 files changed, 408 insertions(+), 27 deletions(-) create mode 100644 prisma/migrations/20250704142951_add_id_user_id_unique_constraint/migration.sql diff --git a/prisma/migrations/20250704142951_add_id_user_id_unique_constraint/migration.sql b/prisma/migrations/20250704142951_add_id_user_id_unique_constraint/migration.sql new file mode 100644 index 0000000..fd685c1 --- /dev/null +++ b/prisma/migrations/20250704142951_add_id_user_id_unique_constraint/migration.sql @@ -0,0 +1,8 @@ +/* + Warnings: + + - A unique constraint covering the columns `[id,userId]` on the table `Article` will be added. If there are existing duplicate values, this will fail. + +*/ +-- CreateIndex +CREATE UNIQUE INDEX "Article_id_userId_key" ON "Article"("id", "userId"); diff --git a/prisma/schema.prisma b/prisma/schema.prisma index 3efc506..6f4f5d4 100644 --- a/prisma/schema.prisma +++ b/prisma/schema.prisma @@ -66,6 +66,7 @@ model Article { user User @relation(fields: [userId], references: [id], onDelete: Cascade) @@unique([url, userId]) // 사용자별 URL 중복 방지 + @@unique([id, userId]) // id와 userId 복합 고유 제약조건 추가 @@index([userId]) @@index([userId, isBookmarked]) @@index([userId, isArchived]) diff --git a/src/modules/article/repositories/article.repository.ts b/src/modules/article/repositories/article.repository.ts index 0437b24..1881c2e 100644 --- a/src/modules/article/repositories/article.repository.ts +++ b/src/modules/article/repositories/article.repository.ts @@ -111,22 +111,28 @@ export class ArticleRepository { } }); - return this.prisma.article.update({ - where: { - id, - userId, - }, - data, - include: { - user: { - select: { - id: true, - email: true, - name: true, + try { + return await this.prisma.article.update({ + where: { + id, + userId, + }, + data, + include: { 
+ user: { + select: { + id: true, + email: true, + name: true, + }, }, }, - }, - }); + }); + } catch (error) { + // Article이 존재하지 않거나 사용자가 소유하지 않은 경우 + this.logger.warn(`Failed to update article ${id} for user ${userId}: ${(error as Error).message}`); + return null; + } } /** @@ -190,9 +196,9 @@ export class ArticleRepository { ...(isArchived !== undefined && { isArchived }), }; - // 정렬 조건 구성 + // 정렬 조건 구성 - 타입 안전성 보장 const orderBy: Prisma.ArticleOrderByWithRelationInput = { - [sortBy]: sortOrder, + [sortBy as keyof Prisma.ArticleOrderByWithRelationInput]: sortOrder as Prisma.SortOrder, }; const [articles, total] = await Promise.all([ diff --git a/src/modules/pre-handler/handlers/domain-specific.handler.ts b/src/modules/pre-handler/handlers/domain-specific.handler.ts index b9e6a20..ccddfa6 100644 --- a/src/modules/pre-handler/handlers/domain-specific.handler.ts +++ b/src/modules/pre-handler/handlers/domain-specific.handler.ts @@ -391,7 +391,10 @@ export class DomainSpecificHandler implements IContentHandler { content = mediumResult.content; contentType = 'text/html'; // Medium content comes as HTML } else if (domain === 'blog.naver.com') { - title = await this.extractNaverBlogTitle(url); + const naverBlogResult = await this.extractNaverBlogContent(url); + title = naverBlogResult.title; + content = naverBlogResult.content; + contentType = 'text/html'; } else if (domain === 'substack.com') { title = this.extractSubstackTitle(url); } else if (domain === 'github.com') { @@ -529,6 +532,164 @@ export class DomainSpecificHandler implements IContentHandler { }); } + /** + * 네이버 블로그 콘텐츠와 이미지를 추출합니다. 
+ * @param url - 네이버 블로그 URL + * @returns 추출된 타이틀과 콘텐츠 + */ + private async extractNaverBlogContent(url: URL): Promise<{ + title?: string; + content?: string; + }> { + try { + this.logger.debug(`Extracting Naver Blog content from: ${url.href}`); + + // 네이버 블로그 접근을 위한 특수 헤더 설정 + const response = await fetch(url.href, { + headers: { + 'User-Agent': + 'Mozilla/5.0 (iPhone; CPU iPhone OS 14_7_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Mobile/15E148 Safari/604.1', + Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'Accept-Language': 'ko-KR,ko;q=0.9,en;q=0.8', + 'Accept-Encoding': 'gzip, deflate', + Connection: 'keep-alive', + Referer: 'https://blog.naver.com/', + }, + redirect: 'follow', + }); + + if (!response.ok) { + throw new Error(`HTTP ${response.status}: ${response.statusText}`); + } + + const html = await response.text(); + this.logger.debug(`Successfully fetched Naver Blog HTML, length: ${html.length}`); + + const dom = new JSDOM(html); + const document = dom.window.document; + + // 타이틀 추출 + const title = this.extractNaverBlogTitleFromDocument(document); + + // 네이버 블로그 이미지 최적화 + this.optimizeNaverBlogImages(document); + + // 콘텐츠 추출 + const content = document.body?.outerHTML; + + this.logger.log(`Successfully extracted Naver Blog content: title="${title?.substring(0, 50)}"`); + + return { + title, + content, + }; + } catch (error) { + this.logger.warn(`Failed to extract Naver Blog content from ${url.href}: ${(error as Error).message}`); + + // 실패 시 기존 타이틀 추출 로직 사용 + const fallbackTitle = await this.extractNaverBlogTitle(url); + return { + title: fallbackTitle, + content: undefined, + }; + } + } + + /** + * 네이버 블로그 문서에서 타이틀을 추출합니다. 
+ * @param document - DOM 문서 + * @returns 추출된 타이틀 + */ + private extractNaverBlogTitleFromDocument(document: Document): string | undefined { + // 다양한 방법으로 타이틀 추출 시도 + const titleSelectors = [ + 'meta[property="og:title"]', + 'meta[name="title"]', + 'title', + '.se-title-text', + '.pcol1 .title', + '.blog-title', + ]; + + for (const selector of titleSelectors) { + const element = document.querySelector(selector); + if (element) { + const title = element.getAttribute('content') || element.textContent; + if (title?.trim()) { + // 네이버 블로그 타이틀 정리 + let cleanTitle = title.trim(); + cleanTitle = cleanTitle.replace(/\s*:\s*네이버 블로그$/, ''); + cleanTitle = cleanTitle.replace(/\s*\|\s*네이버 블로그$/, ''); + return cleanTitle.trim(); + } + } + } + + return undefined; + } + + /** + * 네이버 블로그의 이미지를 최적화합니다. + * @param document - DOM 문서 + */ + private optimizeNaverBlogImages(document: Document): void { + // 네이버 블로그 이미지 처리 + const images = document.querySelectorAll('img'); + + images.forEach((img) => { + // 네이버 블로그 썸네일 URL을 원본 이미지 URL로 변환 + const src = img.getAttribute('src') || img.getAttribute('data-src'); + if (src) { + // 네이버 블로그 이미지 URL 패턴 처리 + let optimizedSrc = src; + + // 썸네일 URL을 원본 URL로 변환 + if (src.includes('blogfiles.naver.net')) { + // 썸네일 파라미터 제거하여 원본 이미지 획득 + optimizedSrc = src.replace(/\?.*$/, ''); + } + + // 상대 경로를 절대 경로로 변환 + if (optimizedSrc.startsWith('//')) { + optimizedSrc = 'https:' + optimizedSrc; + } else if (optimizedSrc.startsWith('/')) { + optimizedSrc = 'https://blog.naver.com' + optimizedSrc; + } + + // 최적화된 src 설정 + img.setAttribute('src', optimizedSrc); + + // lazy loading 속성 제거 + img.removeAttribute('data-src'); + img.removeAttribute('loading'); + } + }); + + // 네이버 블로그 특수 이미지 태그 처리 + const specialImages = document.querySelectorAll('[data-ke-src]'); + specialImages.forEach((element) => { + const dataSrc = element.getAttribute('data-ke-src'); + if (dataSrc) { + let optimizedSrc = dataSrc; + + // 상대 경로를 절대 경로로 변환 + if (optimizedSrc.startsWith('//')) { + 
optimizedSrc = 'https:' + optimizedSrc; + } else if (optimizedSrc.startsWith('/')) { + optimizedSrc = 'https://blog.naver.com' + optimizedSrc; + } + + // img 태그로 변환 + const img = document.createElement('img'); + img.src = optimizedSrc; + img.alt = element.getAttribute('alt') || ''; + + // 기존 요소를 새로운 img 태그로 교체 + element.parentNode?.replaceChild(img, element); + } + }); + } + /** * Extracts title from Naver Blog URL by fetching and parsing meta tags. * @param url - The Naver Blog URL. diff --git a/src/modules/pre-handler/handlers/readability.handler.ts b/src/modules/pre-handler/handlers/readability.handler.ts index 0d1f655..740452b 100644 --- a/src/modules/pre-handler/handlers/readability.handler.ts +++ b/src/modules/pre-handler/handlers/readability.handler.ts @@ -4,6 +4,20 @@ import { Readability } from '@mozilla/readability'; import { IContentHandler } from '../interfaces/content-handler.interface'; import { PreHandleResult } from '../dto/pre-handle-result.dto'; +/** + * Readability 파싱 결과 타입 + */ +interface ReadabilityResult { + title: string | null; + content: string | null; + textContent: string | null; + length: number; + excerpt: string | null; + byline: string | null; + dir: string | null; + siteName: string | null; +} + /** * A content handler that uses Mozilla's Readability library to extract * the main readable content from a generic webpage. 
@@ -30,17 +44,16 @@ export class ReadabilityHandler implements IContentHandler { */ public async handle(url: URL): Promise { try { - const dom = await JSDOM.fromURL(url.href, { - userAgent: this.USER_AGENT, - resources: 'usable', - runScripts: 'dangerously', - pretendToBeVisual: true, - }); + // 첫 번째 시도: JavaScript 실행 활성화 + let dom = await this.createDOMWithScripts(url.href); + let article = await this.extractContentFromDOM(dom); - await new Promise((resolve) => setTimeout(resolve, 2000)); - - const reader = new Readability(dom.window.document); - const article = reader.parse(); + // JavaScript 실행 중 오류가 발생하면 두 번째 시도 + if (!article?.content) { + this.logger.debug(`First attempt failed, trying without scripts for ${url.href}`); + dom = await this.createDOMWithoutScripts(url.href); + article = await this.extractContentFromDOM(dom); + } if (!article?.content) { this.logger.debug(`No readable content found for ${url.href}`); @@ -60,4 +73,196 @@ export class ReadabilityHandler implements IContentHandler { return null; } } + + /** + * JavaScript 실행을 활성화한 JSDOM 생성 + * @param url - 처리할 URL + * @returns JSDOM 인스턴스 + */ + private async createDOMWithScripts(url: string): Promise { + const dom = await JSDOM.fromURL(url, { + userAgent: this.USER_AGENT, + resources: 'usable', + runScripts: 'dangerously', + pretendToBeVisual: true, + }); + + // 브라우저 API polyfill 추가 + this.addBrowserPolyfills(dom.window as unknown as Window & typeof globalThis); + + // 에러 핸들링 추가 + this.addErrorHandling(dom.window as unknown as Window & typeof globalThis); + + return dom; + } + + /** + * JavaScript 실행을 비활성화한 JSDOM 생성 + * @param url - 처리할 URL + * @returns JSDOM 인스턴스 + */ + private async createDOMWithoutScripts(url: string): Promise { + return JSDOM.fromURL(url, { + userAgent: this.USER_AGENT, + resources: 'usable', + runScripts: 'outside-only', + pretendToBeVisual: true, + }); + } + + /** + * DOM에서 콘텐츠 추출 + * @param dom - JSDOM 인스턴스 + * @returns 추출된 아티클 또는 null + */ + private async 
extractContentFromDOM(dom: JSDOM): Promise { + try { + // 페이지 로딩 대기 + await new Promise((resolve) => setTimeout(resolve, 2000)); + + const reader = new Readability(dom.window.document); + const article = reader.parse(); + + return article as ReadabilityResult | null; + } catch (error) { + this.logger.debug(`Content extraction failed: ${(error as Error).message}`); + return null; + } + } + + /** + * 브라우저 API polyfill 추가 + * @param window - JSDOM window 객체 + */ + private addBrowserPolyfills(window: Window & typeof globalThis): void { + // document.elementFromPoint polyfill + if (!window.document.elementFromPoint) { + window.document.elementFromPoint = function (_x: number, _y: number): Element | null { + // 간단한 fallback 구현 + return window.document.body || window.document.documentElement; + }; + } + + // MessageChannel polyfill + if (!(window as any).MessageChannel) { + (window as any).MessageChannel = class MessageChannel { + port1: any; + port2: any; + + constructor() { + this.port1 = { + postMessage: () => {}, + onmessage: null, + close: () => {}, + }; + this.port2 = { + postMessage: () => {}, + onmessage: null, + close: () => {}, + }; + } + }; + } + + // requestIdleCallback polyfill + if (!(window as any).requestIdleCallback) { + (window as any).requestIdleCallback = function ( + callback: (deadline: { didTimeout: boolean; timeRemaining: () => number }) => void, + _options?: any, + ) { + return setTimeout(() => { + callback({ + didTimeout: false, + timeRemaining: () => 50, + }); + }, 1); + }; + } + + // cancelIdleCallback polyfill + if (!(window as any).cancelIdleCallback) { + (window as any).cancelIdleCallback = function (id: number) { + clearTimeout(id); + }; + } + + // IntersectionObserver polyfill (최소 구현) + if (!(window as any).IntersectionObserver) { + (window as any).IntersectionObserver = class IntersectionObserver { + constructor(_callback: any, _options?: any) { + // 최소 구현 + } + observe() {} + unobserve() {} + disconnect() {} + }; + } + + // Performance 
API polyfill + if (!window.performance) { + (window as any).performance = { + now: () => Date.now(), + timeOrigin: Date.now(), + }; + } + + // crypto polyfill (기본 구현) + if (!window.crypto) { + (window as any).crypto = { + getRandomValues: (array: any) => { + for (let i = 0; i < array.length; i++) { + array[i] = Math.floor(Math.random() * 256); + } + return array; + }, + }; + } + } + + /** + * 에러 핸들링 추가 + * @param window - JSDOM window 객체 + */ + private addErrorHandling(window: Window & typeof globalThis): void { + // 글로벌 에러 핸들러 + window.addEventListener('error', (event: ErrorEvent) => { + // 특정 에러는 무시 + const ignoredErrors = [ + 'elementFromPoint is not a function', + 'MessageChannel is not defined', + 'requestIdleCallback is not defined', + 'IntersectionObserver is not defined', + 'clarity', + 'TypeError: document.elementFromPoint is not a function', + 'ReferenceError: MessageChannel is not defined', + ]; + + const errorMessage = event.error?.message || event.message || ''; + const shouldIgnore = ignoredErrors.some((ignoredError) => + errorMessage.toLowerCase().includes(ignoredError.toLowerCase()), + ); + + if (!shouldIgnore) { + this.logger.debug(`JavaScript error in JSDOM: ${errorMessage}`); + } + + // 에러 전파 방지 + event.preventDefault(); + }); + + // Promise rejection 핸들러 + window.addEventListener('unhandledrejection', (event: PromiseRejectionEvent) => { + const reason = event.reason?.message || event.reason || ''; + const ignoredReasons = ['elementFromPoint', 'MessageChannel', 'clarity']; + + const shouldIgnore = ignoredReasons.some((ignored) => reason.toLowerCase().includes(ignored.toLowerCase())); + + if (!shouldIgnore) { + this.logger.debug(`Unhandled promise rejection in JSDOM: ${reason}`); + } + + // 에러 전파 방지 + event.preventDefault(); + }); + } } From cf43619ad3834c9665e218c38b5cf1dd544624ee Mon Sep 17 00:00:00 2001 From: reach0908 Date: Sat, 5 Jul 2025 10:31:36 +0900 Subject: [PATCH 09/28] =?UTF-8?q?feat(pre-handler):=20StibeeHandler=20?= 
=?UTF-8?q?=EC=B6=94=EA=B0=80=20=EB=B0=8F=20=ED=95=B8=EB=93=A4=EB=9F=AC=20?= =?UTF-8?q?=EB=AA=A9=EB=A1=9D=20=EC=97=85=EB=8D=B0=EC=9D=B4=ED=8A=B8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * 스티비(Stibee) 뉴스레터 플랫폼을 위한 StibeeHandler를 추가하여 스티비 뉴스레터 콘텐츠를 최적화하여 추출하는 기능을 구현했습니다. * 핸들러 목록에 StibeeHandler를 추가하고, 주석을 업데이트하여 핸들러의 순서를 명확히 했습니다. --- .../pre-handler/handlers/stibee.handler.ts | 313 ++++++++++++++++++ src/modules/pre-handler/pre-handler.module.ts | 12 +- 2 files changed, 321 insertions(+), 4 deletions(-) create mode 100644 src/modules/pre-handler/handlers/stibee.handler.ts diff --git a/src/modules/pre-handler/handlers/stibee.handler.ts b/src/modules/pre-handler/handlers/stibee.handler.ts new file mode 100644 index 0000000..ee86233 --- /dev/null +++ b/src/modules/pre-handler/handlers/stibee.handler.ts @@ -0,0 +1,313 @@ +import { Injectable, Logger } from '@nestjs/common'; +import { JSDOM } from 'jsdom'; +import { IContentHandler } from '../interfaces/content-handler.interface'; +import { PreHandleResult } from '../dto/pre-handle-result.dto'; + +/** + * 스티비(Stibee) 뉴스레터 플랫폼을 위한 전용 콘텐츠 핸들러 + * 스티비 뉴스레터의 특별한 구조와 스타일을 고려하여 최적화된 콘텐츠 추출을 제공합니다. + */ +@Injectable() +export class StibeeHandler implements IContentHandler { + private readonly logger = new Logger(StibeeHandler.name); + + /** + * 스티비 뉴스레터 URL인지 확인합니다. + * @param url - 확인할 URL + * @returns 스티비 URL이면 true, 아니면 false + */ + public canHandle(url: URL): boolean { + return url.hostname.endsWith('stibee.com'); + } + + /** + * 스티비 뉴스레터 콘텐츠를 추출합니다. 
+ * @param url - 처리할 스티비 URL + * @returns 추출된 콘텐츠 또는 null + */ + public async handle(url: URL): Promise { + try { + this.logger.debug(`스티비 뉴스레터 콘텐츠 추출 시작: ${url.href}`); + + // 스티비 뉴스레터 페이지에서 콘텐츠를 가져옵니다 + const result = await this.extractStibeeContent(url); + + if (!result.content) { + this.logger.debug(`스티비 콘텐츠 추출 실패: ${url.href}`); + return null; + } + + this.logger.log(`스티비 콘텐츠 추출 성공: ${result.content.length} 글자`); + + return { + url: url.href, + title: result.title, + content: result.content, + contentType: 'text/html', + }; + } catch (error) { + this.logger.warn(`스티비 핸들러 처리 실패 ${url.href}: ${(error as Error).message}`); + return null; + } + } + + /** + * 스티비 뉴스레터에서 콘텐츠를 추출합니다. + * @param url - 스티비 뉴스레터 URL + * @returns 추출된 제목과 콘텐츠 + */ + private async extractStibeeContent(url: URL): Promise<{ + title?: string; + content?: string; + }> { + try { + // AbortController로 타임아웃 설정 + const controller = new AbortController(); + const timeoutId = setTimeout(() => controller.abort(), 20000); // 20초 타임아웃 + + // 스티비 뉴스레터 접근을 위한 최적화된 헤더 설정 + const response = await fetch(url.href, { + headers: { + 'User-Agent': + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', + Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', + 'Accept-Language': 'ko-KR,ko;q=0.9,en;q=0.8', + 'Accept-Encoding': 'gzip, deflate, br', + Connection: 'keep-alive', + 'Upgrade-Insecure-Requests': '1', + 'Sec-Fetch-Dest': 'document', + 'Sec-Fetch-Mode': 'navigate', + 'Sec-Fetch-Site': 'none', + 'Cache-Control': 'no-cache', + }, + redirect: 'follow', + signal: controller.signal, + }); + + clearTimeout(timeoutId); + + if (!response.ok) { + throw new Error(`HTTP ${response.status}: ${response.statusText}`); + } + + const html = await response.text(); + this.logger.debug(`스티비 HTML 가져오기 성공, 길이: ${html.length}`); + + // DOM 파싱 + const dom = new JSDOM(html); + const document = dom.window.document; + + // 제목 추출 + 
const title = this.extractStibeeTitle(document); + + // 콘텐츠 추출 및 최적화 + const content = this.extractAndOptimizeStibeeContent(document); + + return { + title, + content, + }; + } catch (error) { + this.logger.warn(`스티비 콘텐츠 추출 실패 ${url.href}: ${(error as Error).message}`); + return { + title: undefined, + content: undefined, + }; + } + } + + /** + * 스티비 뉴스레터에서 제목을 추출합니다. + * @param document - DOM 문서 + * @returns 추출된 제목 + */ + private extractStibeeTitle(document: Document): string | undefined { + // 스티비 뉴스레터 제목 추출을 위한 다양한 셀렉터 시도 + const titleSelectors = [ + 'meta[property="og:title"]', + 'meta[name="twitter:title"]', + 'title', + 'h1', + '.newsletter-title', + '.post-title', + '[class*="title"]', + '[class*="headline"]', + ]; + + for (const selector of titleSelectors) { + const element = document.querySelector(selector); + if (element) { + let title = element.getAttribute('content') || element.textContent; + if (title?.trim()) { + // 스티비 관련 불필요한 텍스트 제거 + title = title.trim(); + title = title.replace(/\s*-\s*스티비$/, ''); + title = title.replace(/\s*\|\s*Stibee$/, ''); + title = title.replace(/\s*::.*$/, ''); + return title.trim(); + } + } + } + + return undefined; + } + + /** + * 스티비 뉴스레터 콘텐츠를 추출하고 최적화합니다. 
+ * @param document - DOM 문서 + * @returns 최적화된 HTML 콘텐츠 + */ + private extractAndOptimizeStibeeContent(document: Document): string | undefined { + // 스티비 뉴스레터 콘텐츠 추출을 위한 셀렉터들 + const contentSelectors = [ + 'article', + '[class*="content"]', + '[class*="newsletter"]', + '[class*="post"]', + '[class*="body"]', + 'main', + '.container', + '#content', + ]; + + let contentElement: Element | null = null; + + // 가장 적절한 콘텐츠 컨테이너 찾기 + for (const selector of contentSelectors) { + const element = document.querySelector(selector); + if (element && element.textContent && element.textContent.trim().length > 100) { + contentElement = element; + break; + } + } + + // 콘텐츠 컨테이너를 찾지 못한 경우 body 사용 + if (!contentElement) { + contentElement = document.body; + } + + if (!contentElement) { + return undefined; + } + + // 콘텐츠 정리 및 최적화 + this.optimizeStibeeContent(contentElement as HTMLElement); + + // 불필요한 요소들 제거 + this.removeUnwantedElements(contentElement as HTMLElement); + + return contentElement.outerHTML; + } + + /** + * 스티비 뉴스레터 콘텐츠를 최적화합니다. + * @param element - 최적화할 HTML 요소 + */ + private optimizeStibeeContent(element: HTMLElement): void { + // 이미지 최적화 + this.optimizeStibeeImages(element); + + // 링크 최적화 + this.optimizeStibeeLinks(element); + + // 스타일 정리 + this.cleanupStibeeStyles(element); + } + + /** + * 스티비 뉴스레터의 이미지를 최적화합니다. + * @param element - 최적화할 요소 + */ + private optimizeStibeeImages(element: HTMLElement): void { + const images = element.querySelectorAll('img'); + + images.forEach((img) => { + // data-src 속성을 src로 변환 (lazy loading) + const dataSrc = img.getAttribute('data-src'); + if (dataSrc && !img.src) { + img.src = dataSrc; + } + + // 상대 경로를 절대 경로로 변환 + if (img.src && img.src.startsWith('//')) { + img.src = 'https:' + img.src; + } + + // 불필요한 속성 제거 + img.removeAttribute('data-src'); + img.removeAttribute('loading'); + img.removeAttribute('srcset'); // 단순화를 위해 srcset 제거 + }); + } + + /** + * 스티비 뉴스레터의 링크를 최적화합니다. 
+ * @param element - 최적화할 요소 + */ + private optimizeStibeeLinks(element: HTMLElement): void { + const links = element.querySelectorAll('a'); + + links.forEach((link) => { + // 상대 경로를 절대 경로로 변환 + if (link.href && link.href.startsWith('/')) { + link.href = 'https://stibee.com' + link.href; + } + + // 새 탭에서 열기 설정 + link.target = '_blank'; + link.rel = 'noopener noreferrer'; + }); + } + + /** + * 스티비 뉴스레터의 스타일을 정리합니다. + * @param element - 정리할 요소 + */ + private cleanupStibeeStyles(element: HTMLElement): void { + // 인라인 스타일 중 불필요한 것들 제거 + const elementsWithStyle = element.querySelectorAll('[style]'); + + elementsWithStyle.forEach((el) => { + const style = el.getAttribute('style'); + if (style) { + // 폰트 크기와 색상만 유지하고 나머지는 제거 + const keepStyles = style.match(/(font-size|color|background-color):[^;]+;?/g); + if (keepStyles) { + el.setAttribute('style', keepStyles.join(' ')); + } else { + el.removeAttribute('style'); + } + } + }); + } + + /** + * 불필요한 요소들을 제거합니다. + * @param element - 정리할 요소 + */ + private removeUnwantedElements(element: HTMLElement): void { + // 제거할 요소들의 셀렉터 + const unwantedSelectors = [ + 'script', + 'style', + 'noscript', + 'iframe[src*="tracking"]', + 'iframe[src*="analytics"]', + '[class*="ad"]', + '[class*="advertisement"]', + '[class*="tracking"]', + '[class*="analytics"]', + '[id*="tracking"]', + '[id*="analytics"]', + '.footer', + '.header', + '.navigation', + '.sidebar', + ]; + + unwantedSelectors.forEach((selector) => { + const elements = element.querySelectorAll(selector); + elements.forEach((el) => el.remove()); + }); + } +} diff --git a/src/modules/pre-handler/pre-handler.module.ts b/src/modules/pre-handler/pre-handler.module.ts index 59ecfcd..5a80d5a 100644 --- a/src/modules/pre-handler/pre-handler.module.ts +++ b/src/modules/pre-handler/pre-handler.module.ts @@ -8,6 +8,7 @@ import { RssHandler } from './handlers/rss.handler'; import { YoutubeHandler } from './handlers/youtube.handler'; import { SocialMediaHandler } from 
'./handlers/social-media.handler'; import { NewsSiteHandler } from './handlers/news-site.handler'; +import { StibeeHandler } from './handlers/stibee.handler'; // --- Register all handlers here --- // The order is important: more specific handlers should come first. @@ -15,14 +16,16 @@ import { NewsSiteHandler } from './handlers/news-site.handler'; // 2. Platform-specific handlers (YouTube) - very specific // 3. Social media handlers - moderately specific // 4. News site handlers - moderately specific -// 5. Domain transformation handlers - general transformations -// 6. General readability handler - fallback for everything else +// 5. Newsletter platform handlers (Stibee) - moderately specific +// 6. Domain transformation handlers - general transformations +// 7. General readability handler - fallback for everything else const handlers = [ PdfHandler, RssHandler, YoutubeHandler, SocialMediaHandler, NewsSiteHandler, + StibeeHandler, DomainSpecificHandler, ReadabilityHandler, ]; @@ -38,8 +41,9 @@ const handlers = [ * 3. YoutubeHandler - Extracts YouTube video information * 4. SocialMediaHandler - Transforms social media URLs * 5. NewsSiteHandler - Transforms news site URLs - * 6. DomainSpecificHandler - Transforms URLs for other specific domains - * 7. ReadabilityHandler - Fallback for general web content + * 6. StibeeHandler - Extracts Stibee newsletter content + * 7. DomainSpecificHandler - Transforms URLs for other specific domains + * 8. 
ReadabilityHandler - Fallback for general web content */ @Module({ providers: [ From c79ed657f0ad5f72cf8798cd07bc928c1fa95a84 Mon Sep 17 00:00:00 2001 From: reach0908 Date: Sat, 5 Jul 2025 10:34:51 +0900 Subject: [PATCH 10/28] =?UTF-8?q?feat(article):=20ArticleOutput=20DTO=20?= =?UTF-8?q?=EB=B3=80=ED=99=98=20=EB=A1=9C=EC=A7=81=20=EA=B0=9C=EC=84=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Article 엔티티를 ArticleOutput DTO로 변환하는 로직에서 각 필드에 대해 null 체크를 추가하여, undefined 값을 명시적으로 처리하도록 개선했습니다. 이를 통해 데이터의 일관성을 높이고, 클라이언트에서의 오류 발생 가능성을 줄였습니다. --- .../article/services/article.service.ts | 19 ++++++++++--------- 1 file changed, 10 insertions(+), 9 deletions(-) diff --git a/src/modules/article/services/article.service.ts b/src/modules/article/services/article.service.ts index 386e306..8e6ace6 100644 --- a/src/modules/article/services/article.service.ts +++ b/src/modules/article/services/article.service.ts @@ -1,4 +1,5 @@ import { Injectable, Logger, NotFoundException, BadRequestException } from '@nestjs/common'; +import { Article } from '@prisma/client'; import { ArticleRepository } from '../repositories/article.repository'; import { CreateArticleInput } from '../dto/create-article.input'; import { UpdateArticleInput } from '../dto/update-article.input'; @@ -188,19 +189,19 @@ export class ArticleService { /** * Article 엔티티를 ArticleOutput DTO로 변환합니다. */ - private mapToOutput(article: any): ArticleOutput { + private mapToOutput(article: Article & { user?: { id: string; email: string; name: string } }): ArticleOutput { return { id: article.id, url: article.url, finalUrl: article.finalUrl, - title: article.title, - content: article.content, - contentType: article.contentType, - summary: article.summary, - author: article.author, - publishedAt: article.publishedAt, - wordCount: article.wordCount, - readingTime: article.readingTime, + title: article.title ?? undefined, + content: article.content ?? 
undefined, + contentType: article.contentType ?? undefined, + summary: article.summary ?? undefined, + author: article.author ?? undefined, + publishedAt: article.publishedAt ?? undefined, + wordCount: article.wordCount ?? undefined, + readingTime: article.readingTime ?? undefined, tags: article.tags, isBookmarked: article.isBookmarked, isArchived: article.isArchived, From c8063c24e78813b75a0b96a515177b5a7e1a99c4 Mon Sep 17 00:00:00 2001 From: reach0908 Date: Sat, 5 Jul 2025 11:47:26 +0900 Subject: [PATCH 11/28] =?UTF-8?q?feat(pre-handler):=20MailyHandler=20?= =?UTF-8?q?=EC=B6=94=EA=B0=80=20=EB=B0=8F=20=ED=95=B8=EB=93=A4=EB=9F=AC=20?= =?UTF-8?q?=EB=AA=A9=EB=A1=9D=20=EC=97=85=EB=8D=B0=EC=9D=B4=ED=8A=B8?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * 메일리(Maily) 뉴스레터 플랫폼을 위한 MailyHandler를 추가하여 메일리 뉴스레터 콘텐츠를 최적화하여 추출하는 기능을 구현했습니다. * 핸들러 목록에 MailyHandler를 추가하고, 주석을 업데이트하여 핸들러의 순서를 명확히 했습니다. --- .../pre-handler/handlers/maily.handler.ts | 356 ++++++++++++++++++ src/modules/pre-handler/pre-handler.module.ts | 9 +- 2 files changed, 362 insertions(+), 3 deletions(-) create mode 100644 src/modules/pre-handler/handlers/maily.handler.ts diff --git a/src/modules/pre-handler/handlers/maily.handler.ts b/src/modules/pre-handler/handlers/maily.handler.ts new file mode 100644 index 0000000..577cc85 --- /dev/null +++ b/src/modules/pre-handler/handlers/maily.handler.ts @@ -0,0 +1,356 @@ +import { Injectable, Logger } from '@nestjs/common'; +import { JSDOM } from 'jsdom'; +import { IContentHandler } from '../interfaces/content-handler.interface'; +import { PreHandleResult } from '../dto/pre-handle-result.dto'; + +/** + * 메일리(Maily) 뉴스레터 플랫폼을 위한 전용 콘텐츠 핸들러 + * 메일리 뉴스레터의 특별한 구조와 인증 시스템을 고려하여 최적화된 콘텐츠 추출을 제공합니다. + */ +@Injectable() +export class MailyHandler implements IContentHandler { + private readonly logger = new Logger(MailyHandler.name); + + /** + * 메일리 뉴스레터 URL인지 확인합니다. 
+ * @param url - 확인할 URL + * @returns 메일리 URL이면 true, 아니면 false + */ + public canHandle(url: URL): boolean { + return url.hostname.endsWith('maily.so'); + } + + /** + * 메일리 뉴스레터 콘텐츠를 추출합니다. + * @param url - 처리할 메일리 URL + * @returns 추출된 콘텐츠 또는 null + */ + public async handle(url: URL): Promise { + try { + this.logger.debug(`메일리 뉴스레터 콘텐츠 추출 시작: ${url.href}`); + + // 메일리 뉴스레터 페이지에서 콘텐츠를 가져옵니다 + const result = await this.extractMailyContent(url); + + if (!result.content) { + this.logger.debug(`메일리 콘텐츠 추출 실패: ${url.href}`); + return null; + } + + this.logger.log(`메일리 콘텐츠 추출 성공: ${result.content.length} 글자`); + + return { + url: url.href, + title: result.title, + content: result.content, + contentType: 'text/html', + }; + } catch (error) { + this.logger.warn(`메일리 핸들러 처리 실패 ${url.href}: ${(error as Error).message}`); + return null; + } + } + + /** + * 메일리 뉴스레터에서 콘텐츠를 추출합니다. + * @param url - 메일리 뉴스레터 URL + * @returns 추출된 제목과 콘텐츠 + */ + private async extractMailyContent(url: URL): Promise<{ + title?: string; + content?: string; + }> { + try { + // AbortController로 타임아웃 설정 + const controller = new AbortController(); + const timeoutId = setTimeout(() => controller.abort(), 20000); // 20초 타임아웃 + + // 메일리 뉴스레터 접근을 위한 최적화된 헤더 설정 + const response = await fetch(url.href, { + headers: { + 'User-Agent': + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', + Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', + 'Accept-Language': 'ko-KR,ko;q=0.9,en;q=0.8', + 'Accept-Encoding': 'gzip, deflate, br', + Connection: 'keep-alive', + 'Upgrade-Insecure-Requests': '1', + 'Sec-Fetch-Dest': 'document', + 'Sec-Fetch-Mode': 'navigate', + 'Sec-Fetch-Site': 'none', + 'Cache-Control': 'no-cache', + // 메일리 특화 헤더 추가 + 'X-Requested-With': 'XMLHttpRequest', + Origin: 'https://maily.so', + Referer: 'https://maily.so/', + }, + redirect: 'follow', + signal: controller.signal, + }); + + 
clearTimeout(timeoutId); + + if (!response.ok) { + throw new Error(`HTTP ${response.status}: ${response.statusText}`); + } + + const html = await response.text(); + this.logger.debug(`메일리 HTML 가져오기 성공, 길이: ${html.length}`); + + // DOM 파싱 + const dom = new JSDOM(html); + const document = dom.window.document; + + // 제목 추출 + const title = this.extractMailyTitle(document); + + // 콘텐츠 추출 및 최적화 + const content = this.extractAndOptimizeMailyContent(document); + + return { + title, + content, + }; + } catch (error) { + this.logger.warn(`메일리 콘텐츠 추출 실패 ${url.href}: ${(error as Error).message}`); + return { + title: undefined, + content: undefined, + }; + } + } + + /** + * 메일리 뉴스레터에서 제목을 추출합니다. + * @param document - DOM 문서 + * @returns 추출된 제목 + */ + private extractMailyTitle(document: Document): string | undefined { + // 메일리 뉴스레터 제목 추출을 위한 다양한 셀렉터 시도 + const titleSelectors = [ + 'meta[property="og:title"]', + 'meta[name="twitter:title"]', + 'title', + 'h1', + '.newsletter-title', + '.post-title', + '.article-title', + '[class*="title"]', + '[class*="headline"]', + '[data-testid="post-title"]', + '[data-testid="article-title"]', + // 메일리 특화 셀렉터 + '[class*="maily-title"]', + '[class*="letter-title"]', + 'article h1', + 'main h1', + ]; + + for (const selector of titleSelectors) { + const element = document.querySelector(selector); + if (element) { + let title = element.getAttribute('content') || element.textContent; + if (title?.trim()) { + // 메일리 관련 불필요한 텍스트 제거 + title = title.trim(); + title = title.replace(/\s*-\s*메일리$/, ''); + title = title.replace(/\s*\|\s*Maily$/, ''); + title = title.replace(/\s*::.*$/, ''); + title = title.replace(/\s*·\s*메일리$/, ''); + title = title.replace(/\s*뉴스레터를 쉽게, 메일리로 시작하세요$/, ''); + return title.trim(); + } + } + } + + return undefined; + } + + /** + * 메일리 뉴스레터 콘텐츠를 추출하고 최적화합니다. 
+ * @param document - DOM 문서 + * @returns 최적화된 HTML 콘텐츠 + */ + private extractAndOptimizeMailyContent(document: Document): string | undefined { + // 메일리 뉴스레터 콘텐츠 추출을 위한 셀렉터들 + const contentSelectors = [ + 'article', + '[class*="content"]', + '[class*="newsletter"]', + '[class*="post"]', + '[class*="body"]', + '[class*="letter"]', + '[data-testid="post-content"]', + '[data-testid="article-content"]', + 'main', + '.container', + '#content', + // 메일리 특화 셀렉터 + '[class*="maily-content"]', + '[class*="letter-content"]', + '[class*="newsletter-content"]', + '.post-content', + '.article-content', + ]; + + let contentElement: Element | null = null; + + // 가장 적절한 콘텐츠 컨테이너 찾기 + for (const selector of contentSelectors) { + const element = document.querySelector(selector); + if (element && element.textContent && element.textContent.trim().length > 100) { + contentElement = element; + break; + } + } + + // 콘텐츠 컨테이너를 찾지 못한 경우 body 사용 + if (!contentElement) { + contentElement = document.body; + } + + if (!contentElement) { + return undefined; + } + + // 콘텐츠 정리 및 최적화 + this.optimizeMailyContent(contentElement as HTMLElement); + + // 불필요한 요소들 제거 + this.removeUnwantedElements(contentElement as HTMLElement); + + return contentElement.outerHTML; + } + + /** + * 메일리 뉴스레터 콘텐츠를 최적화합니다. + * @param element - 최적화할 HTML 요소 + */ + private optimizeMailyContent(element: HTMLElement): void { + // 이미지 최적화 + this.optimizeMailyImages(element); + + // 링크 최적화 + this.optimizeMailyLinks(element); + + // 스타일 정리 + this.cleanupMailyStyles(element); + } + + /** + * 메일리 뉴스레터의 이미지를 최적화합니다. 
+ * @param element - 최적화할 요소 + */ + private optimizeMailyImages(element: HTMLElement): void { + const images = element.querySelectorAll('img'); + + images.forEach((img) => { + // data-src 속성을 src로 변환 (lazy loading) + const dataSrc = img.getAttribute('data-src'); + if (dataSrc && !img.src) { + img.src = dataSrc; + } + + // 상대 경로를 절대 경로로 변환 + if (img.src && img.src.startsWith('//')) { + img.src = 'https:' + img.src; + } + + // 메일리 CDN 경로 처리 + if (img.src && img.src.startsWith('/')) { + img.src = 'https://maily.so' + img.src; + } + + // 불필요한 속성 제거 + img.removeAttribute('data-src'); + img.removeAttribute('loading'); + img.removeAttribute('srcset'); // 단순화를 위해 srcset 제거 + }); + } + + /** + * 메일리 뉴스레터의 링크를 최적화합니다. + * @param element - 최적화할 요소 + */ + private optimizeMailyLinks(element: HTMLElement): void { + const links = element.querySelectorAll('a'); + + links.forEach((link) => { + // 상대 경로를 절대 경로로 변환 + if (link.href && link.href.startsWith('/')) { + link.href = 'https://maily.so' + link.href; + } + + // 새 탭에서 열기 설정 + link.target = '_blank'; + link.rel = 'noopener noreferrer'; + }); + } + + /** + * 메일리 뉴스레터의 스타일을 정리합니다. + * @param element - 정리할 요소 + */ + private cleanupMailyStyles(element: HTMLElement): void { + // 인라인 스타일 중 불필요한 것들 제거 + const elementsWithStyle = element.querySelectorAll('[style]'); + + elementsWithStyle.forEach((el) => { + const style = el.getAttribute('style'); + if (style) { + // 폰트 크기와 색상만 유지하고 나머지는 제거 + const keepStyles = style.match(/(font-size|color|background-color|text-align):[^;]+;?/g); + if (keepStyles) { + el.setAttribute('style', keepStyles.join(' ')); + } else { + el.removeAttribute('style'); + } + } + }); + } + + /** + * 불필요한 요소들을 제거합니다. 
+ * @param element - 정리할 요소 + */ + private removeUnwantedElements(element: HTMLElement): void { + // 제거할 요소들의 셀렉터 + const unwantedSelectors = [ + 'script', + 'style', + 'noscript', + 'iframe[src*="tracking"]', + 'iframe[src*="analytics"]', + 'iframe[src*="google-analytics"]', + '[class*="ad"]', + '[class*="advertisement"]', + '[class*="tracking"]', + '[class*="analytics"]', + '[id*="tracking"]', + '[id*="analytics"]', + '.footer', + '.header', + '.navigation', + '.nav', + '.sidebar', + '.login', + '.auth', + '.subscription', + // 메일리 특화 제거 셀렉터 + '[class*="login"]', + '[class*="auth"]', + '[class*="signup"]', + '[class*="subscribe"]', + '[class*="maily-nav"]', + '[class*="maily-header"]', + '[class*="maily-footer"]', + '[data-testid="login-form"]', + '[data-testid="auth-form"]', + ]; + + unwantedSelectors.forEach((selector) => { + const elements = element.querySelectorAll(selector); + elements.forEach((el) => el.remove()); + }); + } +} diff --git a/src/modules/pre-handler/pre-handler.module.ts b/src/modules/pre-handler/pre-handler.module.ts index 5a80d5a..966beb7 100644 --- a/src/modules/pre-handler/pre-handler.module.ts +++ b/src/modules/pre-handler/pre-handler.module.ts @@ -9,6 +9,7 @@ import { YoutubeHandler } from './handlers/youtube.handler'; import { SocialMediaHandler } from './handlers/social-media.handler'; import { NewsSiteHandler } from './handlers/news-site.handler'; import { StibeeHandler } from './handlers/stibee.handler'; +import { MailyHandler } from './handlers/maily.handler'; // --- Register all handlers here --- // The order is important: more specific handlers should come first. @@ -16,7 +17,7 @@ import { StibeeHandler } from './handlers/stibee.handler'; // 2. Platform-specific handlers (YouTube) - very specific // 3. Social media handlers - moderately specific // 4. News site handlers - moderately specific -// 5. Newsletter platform handlers (Stibee) - moderately specific +// 5. 
Newsletter platform handlers (Stibee, Maily) - moderately specific // 6. Domain transformation handlers - general transformations // 7. General readability handler - fallback for everything else const handlers = [ @@ -26,6 +27,7 @@ const handlers = [ SocialMediaHandler, NewsSiteHandler, StibeeHandler, + MailyHandler, DomainSpecificHandler, ReadabilityHandler, ]; @@ -42,8 +44,9 @@ const handlers = [ * 4. SocialMediaHandler - Transforms social media URLs * 5. NewsSiteHandler - Transforms news site URLs * 6. StibeeHandler - Extracts Stibee newsletter content - * 7. DomainSpecificHandler - Transforms URLs for other specific domains - * 8. ReadabilityHandler - Fallback for general web content + * 7. MailyHandler - Extracts Maily newsletter content + * 8. DomainSpecificHandler - Transforms URLs for other specific domains + * 9. ReadabilityHandler - Fallback for general web content */ @Module({ providers: [ From f77e643a0018668c9901c01acb3134804a427a97 Mon Sep 17 00:00:00 2001 From: reach0908 Date: Sat, 5 Jul 2025 12:27:41 +0900 Subject: [PATCH 12/28] =?UTF-8?q?feat(pre-handler):=20PreHandlerService=20?= =?UTF-8?q?=EB=A6=AC=ED=8C=A9=ED=86=A0=EB=A7=81=20=EB=B0=8F=20=ED=95=B8?= =?UTF-8?q?=EB=93=A4=EB=9F=AC=20=EA=B5=AC=EC=A1=B0=20=EA=B0=9C=EC=84=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * PreHandlerService를 RefactoredPreHandlerService로 리팩토링하여 핸들러 팩토리 패턴을 적용했습니다. * 핸들러 목록을 HandlerFactory를 통해 관리하도록 변경하여 코드의 가독성과 유지보수성을 향상시켰습니다. * 기존 핸들러를 AbstractContentHandler를 상속받아 리팩토링하여 SOLID 원칙을 준수하도록 개선했습니다. * 각 핸들러의 HTTP 요청 및 DOM 생성 설정을 통일하여 일관성을 높였습니다. * 불필요한 주석을 제거하고, 코드의 명확성을 높였습니다. 
--- .../base/abstract-content-handler.ts | 145 ++++++ .../pre-handler/factories/handler-factory.ts | 67 +++ .../pre-handler/handlers/maily.handler.ts | 411 +++++------------- .../pre-handler/handlers/news-site.handler.ts | 145 +++--- .../pre-handler/handlers/pdf.handler.ts | 104 ++++- .../handlers/readability.handler.ts | 149 ++++--- .../pre-handler/handlers/rss.handler.ts | 105 ++++- .../pre-handler/handlers/stibee.handler.ts | 360 ++++----------- .../pre-handler/handlers/youtube.handler.ts | 226 +++++++--- src/modules/pre-handler/pre-handler.module.ts | 48 +- .../pre-handler/pre-handler.service.ts | 68 --- .../refactored-pre-handler.service.ts | 89 ++++ .../types/content-extraction.types.ts | 104 +++++ .../utils/content-cleaning-pipeline.ts | 215 +++++++++ .../pre-handler/utils/functional-utils.ts | 254 +++++++++++ src/modules/scraper/scraper.controller.ts | 4 +- .../services/puppeteer-parse.service.ts | 10 +- 17 files changed, 1588 insertions(+), 916 deletions(-) create mode 100644 src/modules/pre-handler/base/abstract-content-handler.ts create mode 100644 src/modules/pre-handler/factories/handler-factory.ts delete mode 100644 src/modules/pre-handler/pre-handler.service.ts create mode 100644 src/modules/pre-handler/refactored-pre-handler.service.ts create mode 100644 src/modules/pre-handler/types/content-extraction.types.ts create mode 100644 src/modules/pre-handler/utils/content-cleaning-pipeline.ts create mode 100644 src/modules/pre-handler/utils/functional-utils.ts diff --git a/src/modules/pre-handler/base/abstract-content-handler.ts b/src/modules/pre-handler/base/abstract-content-handler.ts new file mode 100644 index 0000000..6c412ae --- /dev/null +++ b/src/modules/pre-handler/base/abstract-content-handler.ts @@ -0,0 +1,145 @@ +/** + * 추상 핸들러 베이스 클래스 (템플릿 메서드 패턴) + * - SOLID 원칙 및 함수형 프로그래밍 기반 + */ +import { Logger } from '@nestjs/common'; +import { JSDOM } from 'jsdom'; +import { IContentHandler } from '../interfaces/content-handler.interface'; +import { 
PreHandleResult } from '../dto/pre-handle-result.dto'; +import { + HttpRequestConfig, + DomConfig, + ContentCleaningConfig, + TitleExtractionConfig, + ContentExtractionResult, +} from '../types/content-extraction.types'; +import { fetchHtml, createDom, extractTitle, findContentElement, Result, Option } from '../utils/functional-utils'; +import { createContentCleaningPipeline } from '../utils/content-cleaning-pipeline'; + +/** + * 콘텐츠 핸들러의 공통 추상 클래스 + * - 템플릿 메서드 패턴 기반 + * - 공통 로직을 추상화하고, 구체 구현은 하위 클래스에 위임 + */ +export abstract class AbstractContentHandler implements IContentHandler { + protected readonly logger = new Logger(this.constructor.name); + + /** + * 핸들러가 처리할 수 있는 URL인지 확인 + * @param url 검사할 URL + */ + public abstract canHandle(url: URL): boolean; + + /** + * 핸들러 이름 (로깅용) + */ + protected abstract get handlerName(): string; + + /** + * HTTP 요청 설정 + */ + protected abstract get httpConfig(): HttpRequestConfig; + + /** + * DOM 생성 설정 + */ + protected abstract get domConfig(): DomConfig; + + /** + * 콘텐츠 정제 설정 + */ + protected abstract get cleaningConfig(): ContentCleaningConfig; + + /** + * 제목 추출 설정 + */ + protected abstract get titleConfig(): TitleExtractionConfig; + + /** + * 콘텐츠 선택자들 + */ + protected abstract get contentSelectors(): readonly string[]; + + /** + * 템플릿 메서드: 핸들링 프로세스 + * @param url 처리할 URL + * @returns 추출 결과 또는 null + */ + public async handle(url: URL): Promise { + try { + this.logger.debug(`${this.handlerName} 콘텐츠 추출 시작: ${url.href}`); + const result = await this.extractContent(url); + if (!result.success) { + this.logger.debug(`${this.handlerName} 콘텐츠 추출 실패: ${result.error.message}`); + return null; + } + const { title, content } = result.data; + if (!content) { + this.logger.debug(`${this.handlerName} 콘텐츠 없음: ${url.href}`); + return null; + } + this.logger.log(`${this.handlerName} 콘텐츠 추출 성공: ${content.length} 글자`); + return { + url: url.href, + title, + content, + contentType: 'text/html', + }; + } catch (error) { + 
this.logger.warn(`${this.handlerName} 핸들러 처리 실패 ${url.href}: ${(error as Error).message}`); + return null; + } + } + + /** + * 콘텐츠 추출 (함수형 프로그래밍) + * @param url 처리할 URL + * @returns 추출 결과 Result + */ + private async extractContent(url: URL): Promise> { + const htmlResult = await fetchHtml(url.href, this.httpConfig); + if (!htmlResult.success) { + return { success: false, error: htmlResult.error }; + } + const domResult = createDom(htmlResult.data, this.domConfig); + if (!domResult.success) { + return { success: false, error: domResult.error }; + } + return { success: true, data: this.processDom(domResult.data, url.href) }; + } + + /** + * DOM 처리 및 콘텐츠 정제 + * @param dom JSDOM 인스턴스 + * @param url 기준 URL + * @returns 추출 결과 + */ + private processDom(dom: JSDOM, url: string): ContentExtractionResult { + const document = dom.window.document; + // 제목 추출 + const titleOption: Option = extractTitle( + document, + this.titleConfig.selectors, + this.titleConfig.patterns, + ); + const title: string | undefined = titleOption == null ? 
undefined : titleOption; + // 콘텐츠 요소 찾기 + const contentElement = findContentElement(document, this.contentSelectors); + if (!contentElement) { + return { title, contentType: 'text/html', url }; + } + // 콘텐츠 정제 + const cleaningPipeline = createContentCleaningPipeline(this.cleaningConfig); + const cleanedElement = cleaningPipeline(contentElement, { + baseUrl: url, + config: this.cleaningConfig, + logger: this.logger, + }); + return { + title, + content: cleanedElement.outerHTML, + contentType: 'text/html', + url, + }; + } +} diff --git a/src/modules/pre-handler/factories/handler-factory.ts b/src/modules/pre-handler/factories/handler-factory.ts new file mode 100644 index 0000000..524d537 --- /dev/null +++ b/src/modules/pre-handler/factories/handler-factory.ts @@ -0,0 +1,67 @@ +/** + * 핸들러 팩토리 + * - 도메인별 핸들러를 DI 받아 URL에 따라 적절한 핸들러를 반환 + * - getAllHandlers()로 전체 핸들러 배열 반환 + */ +import { Injectable } from '@nestjs/common'; +import { IContentHandler } from '../interfaces/content-handler.interface'; +import { MailyHandler } from '../handlers/maily.handler'; +import { StibeeHandler } from '../handlers/stibee.handler'; +import { PdfHandler } from '../handlers/pdf.handler'; +import { RssHandler } from '../handlers/rss.handler'; +import { YoutubeHandler } from '../handlers/youtube.handler'; +import { NewsSiteHandler } from '../handlers/news-site.handler'; +import { ReadabilityHandler } from '../handlers/readability.handler'; +// 필요시 다른 핸들러 import + +/** + * 핸들러 팩토리 클래스 + */ +@Injectable() +export class HandlerFactory { + private readonly handlerChain: IContentHandler[]; + + constructor( + private readonly mailyHandler: MailyHandler, + private readonly stibeeHandler: StibeeHandler, + private readonly pdfHandler: PdfHandler, + private readonly rssHandler: RssHandler, + private readonly youtubeHandler: YoutubeHandler, + private readonly newsSiteHandler: NewsSiteHandler, + private readonly readabilityHandler: ReadabilityHandler, + // 필요시 다른 핸들러 DI + ) { + // 우선순위: 도메인 특화 → 일반 → 
fallback + this.handlerChain = [ + this.mailyHandler, + this.stibeeHandler, + this.pdfHandler, + this.rssHandler, + this.youtubeHandler, + this.newsSiteHandler, + this.readabilityHandler, // 항상 마지막 fallback + ]; + } + + /** + * URL에 적합한 핸들러 반환 (우선순위 순회) + * @param url URL 객체 + * @returns IContentHandler + */ + public createHandler(url: URL): IContentHandler { + for (const handler of this.handlerChain) { + if (handler.canHandle(url)) { + return handler; + } + } + // 이론상 도달 불가 (readability가 항상 true) + throw new Error('No suitable handler found for this URL'); + } + + /** + * 전체 핸들러 배열 반환 + */ + public getAllHandlers(): IContentHandler[] { + return this.handlerChain; + } +} diff --git a/src/modules/pre-handler/handlers/maily.handler.ts b/src/modules/pre-handler/handlers/maily.handler.ts index 577cc85..ab1d3fd 100644 --- a/src/modules/pre-handler/handlers/maily.handler.ts +++ b/src/modules/pre-handler/handlers/maily.handler.ts @@ -1,179 +1,129 @@ -import { Injectable, Logger } from '@nestjs/common'; -import { JSDOM } from 'jsdom'; -import { IContentHandler } from '../interfaces/content-handler.interface'; -import { PreHandleResult } from '../dto/pre-handle-result.dto'; +/** + * 메일리(Maily) 뉴스레터 플랫폼을 위한 리팩토링된 콘텐츠 핸들러 + * - AbstractContentHandler 기반 + * - SOLID 원칙 및 함수형 프로그래밍 적용 + */ +import { Injectable } from '@nestjs/common'; +import { AbstractContentHandler } from '../base/abstract-content-handler'; +import { + HttpRequestConfig, + DomConfig, + ContentCleaningConfig, + TitleExtractionConfig, +} from '../types/content-extraction.types'; /** - * 메일리(Maily) 뉴스레터 플랫폼을 위한 전용 콘텐츠 핸들러 - * 메일리 뉴스레터의 특별한 구조와 인증 시스템을 고려하여 최적화된 콘텐츠 추출을 제공합니다. + * 메일리 뉴스레터 핸들러 */ @Injectable() -export class MailyHandler implements IContentHandler { - private readonly logger = new Logger(MailyHandler.name); - +export class MailyHandler extends AbstractContentHandler { /** - * 메일리 뉴스레터 URL인지 확인합니다. 
- * @param url - 확인할 URL - * @returns 메일리 URL이면 true, 아니면 false + * 메일리 도메인 처리 여부 + * @param url 검사할 URL */ public canHandle(url: URL): boolean { return url.hostname.endsWith('maily.so'); } /** - * 메일리 뉴스레터 콘텐츠를 추출합니다. - * @param url - 처리할 메일리 URL - * @returns 추출된 콘텐츠 또는 null + * 핸들러 이름 */ - public async handle(url: URL): Promise { - try { - this.logger.debug(`메일리 뉴스레터 콘텐츠 추출 시작: ${url.href}`); - - // 메일리 뉴스레터 페이지에서 콘텐츠를 가져옵니다 - const result = await this.extractMailyContent(url); - - if (!result.content) { - this.logger.debug(`메일리 콘텐츠 추출 실패: ${url.href}`); - return null; - } - - this.logger.log(`메일리 콘텐츠 추출 성공: ${result.content.length} 글자`); - - return { - url: url.href, - title: result.title, - content: result.content, - contentType: 'text/html', - }; - } catch (error) { - this.logger.warn(`메일리 핸들러 처리 실패 ${url.href}: ${(error as Error).message}`); - return null; - } + protected get handlerName(): string { + return '메일리 핸들러'; } /** - * 메일리 뉴스레터에서 콘텐츠를 추출합니다. - * @param url - 메일리 뉴스레터 URL - * @returns 추출된 제목과 콘텐츠 + * HTTP 요청 설정 */ - private async extractMailyContent(url: URL): Promise<{ - title?: string; - content?: string; - }> { - try { - // AbortController로 타임아웃 설정 - const controller = new AbortController(); - const timeoutId = setTimeout(() => controller.abort(), 20000); // 20초 타임아웃 - - // 메일리 뉴스레터 접근을 위한 최적화된 헤더 설정 - const response = await fetch(url.href, { - headers: { - 'User-Agent': - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', - Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', - 'Accept-Language': 'ko-KR,ko;q=0.9,en;q=0.8', - 'Accept-Encoding': 'gzip, deflate, br', - Connection: 'keep-alive', - 'Upgrade-Insecure-Requests': '1', - 'Sec-Fetch-Dest': 'document', - 'Sec-Fetch-Mode': 'navigate', - 'Sec-Fetch-Site': 'none', - 'Cache-Control': 'no-cache', - // 메일리 특화 헤더 추가 - 'X-Requested-With': 'XMLHttpRequest', - Origin: 'https://maily.so', - Referer: 
'https://maily.so/', - }, - redirect: 'follow', - signal: controller.signal, - }); - - clearTimeout(timeoutId); - - if (!response.ok) { - throw new Error(`HTTP ${response.status}: ${response.statusText}`); - } - - const html = await response.text(); - this.logger.debug(`메일리 HTML 가져오기 성공, 길이: ${html.length}`); - - // DOM 파싱 - const dom = new JSDOM(html); - const document = dom.window.document; - - // 제목 추출 - const title = this.extractMailyTitle(document); - - // 콘텐츠 추출 및 최적화 - const content = this.extractAndOptimizeMailyContent(document); - - return { - title, - content, - }; - } catch (error) { - this.logger.warn(`메일리 콘텐츠 추출 실패 ${url.href}: ${(error as Error).message}`); - return { - title: undefined, - content: undefined, - }; - } + protected get httpConfig(): HttpRequestConfig { + return { + userAgent: + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', + timeout: 20000, + headers: { + Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', + 'Accept-Language': 'ko-KR,ko;q=0.9,en;q=0.8', + 'Accept-Encoding': 'gzip, deflate, br', + Connection: 'keep-alive', + 'Upgrade-Insecure-Requests': '1', + 'Sec-Fetch-Dest': 'document', + 'Sec-Fetch-Mode': 'navigate', + 'Sec-Fetch-Site': 'none', + 'Cache-Control': 'no-cache', + 'X-Requested-With': 'XMLHttpRequest', + Origin: 'https://maily.so', + Referer: 'https://maily.so/', + }, + redirect: 'follow', + }; } /** - * 메일리 뉴스레터에서 제목을 추출합니다. 
- * @param document - DOM 문서 - * @returns 추출된 제목 + * DOM 생성 설정 */ - private extractMailyTitle(document: Document): string | undefined { - // 메일리 뉴스레터 제목 추출을 위한 다양한 셀렉터 시도 - const titleSelectors = [ - 'meta[property="og:title"]', - 'meta[name="twitter:title"]', - 'title', - 'h1', - '.newsletter-title', - '.post-title', - '.article-title', - '[class*="title"]', - '[class*="headline"]', - '[data-testid="post-title"]', - '[data-testid="article-title"]', - // 메일리 특화 셀렉터 - '[class*="maily-title"]', - '[class*="letter-title"]', - 'article h1', - 'main h1', - ]; + protected get domConfig(): DomConfig { + return { + userAgent: this.httpConfig.userAgent, + resources: 'usable', + runScripts: 'outside-only', + pretendToBeVisual: true, + }; + } - for (const selector of titleSelectors) { - const element = document.querySelector(selector); - if (element) { - let title = element.getAttribute('content') || element.textContent; - if (title?.trim()) { - // 메일리 관련 불필요한 텍스트 제거 - title = title.trim(); - title = title.replace(/\s*-\s*메일리$/, ''); - title = title.replace(/\s*\|\s*Maily$/, ''); - title = title.replace(/\s*::.*$/, ''); - title = title.replace(/\s*·\s*메일리$/, ''); - title = title.replace(/\s*뉴스레터를 쉽게, 메일리로 시작하세요$/, ''); - return title.trim(); - } - } - } + /** + * 콘텐츠 정제 설정 + */ + protected get cleaningConfig(): ContentCleaningConfig { + return { + removeUnwantedElements: true, + cleanupStyles: true, + cleanupLinks: true, + cleanupImages: true, + cleanupText: true, + refineTitle: true, + }; + } - return undefined; + /** + * 제목 추출 설정 + */ + protected get titleConfig(): TitleExtractionConfig { + return { + selectors: [ + 'meta[property="og:title"]', + 'meta[name="twitter:title"]', + 'title', + 'h1', + '.newsletter-title', + '.post-title', + '.article-title', + '[class*="title"]', + '[class*="headline"]', + '[data-testid="post-title"]', + '[data-testid="article-title"]', + '[class*="maily-title"]', + '[class*="letter-title"]', + 'article h1', + 'main h1', + ], + patterns: [ + 
/\s*-\s*메일리$/, + /\s*\|\s*Maily$/, + /\s*::.*$/, + /\s*·\s*메일리$/, + /\s*뉴스레터를 쉽게, 메일리로 시작하세요$/, + ], + siteSpecificPatterns: { + 'maily.so': [/\s*-\s*메일리$/, /\s*\|\s*Maily$/, /\s*뉴스레터를 쉽게, 메일리로 시작하세요$/], + }, + }; } /** - * 메일리 뉴스레터 콘텐츠를 추출하고 최적화합니다. - * @param document - DOM 문서 - * @returns 최적화된 HTML 콘텐츠 + * 본문 콘텐츠 추출용 셀렉터 */ - private extractAndOptimizeMailyContent(document: Document): string | undefined { - // 메일리 뉴스레터 콘텐츠 추출을 위한 셀렉터들 - const contentSelectors = [ + protected get contentSelectors(): readonly string[] { + return [ 'article', '[class*="content"]', '[class*="newsletter"]', @@ -185,172 +135,11 @@ export class MailyHandler implements IContentHandler { 'main', '.container', '#content', - // 메일리 특화 셀렉터 '[class*="maily-content"]', '[class*="letter-content"]', '[class*="newsletter-content"]', '.post-content', '.article-content', ]; - - let contentElement: Element | null = null; - - // 가장 적절한 콘텐츠 컨테이너 찾기 - for (const selector of contentSelectors) { - const element = document.querySelector(selector); - if (element && element.textContent && element.textContent.trim().length > 100) { - contentElement = element; - break; - } - } - - // 콘텐츠 컨테이너를 찾지 못한 경우 body 사용 - if (!contentElement) { - contentElement = document.body; - } - - if (!contentElement) { - return undefined; - } - - // 콘텐츠 정리 및 최적화 - this.optimizeMailyContent(contentElement as HTMLElement); - - // 불필요한 요소들 제거 - this.removeUnwantedElements(contentElement as HTMLElement); - - return contentElement.outerHTML; - } - - /** - * 메일리 뉴스레터 콘텐츠를 최적화합니다. - * @param element - 최적화할 HTML 요소 - */ - private optimizeMailyContent(element: HTMLElement): void { - // 이미지 최적화 - this.optimizeMailyImages(element); - - // 링크 최적화 - this.optimizeMailyLinks(element); - - // 스타일 정리 - this.cleanupMailyStyles(element); - } - - /** - * 메일리 뉴스레터의 이미지를 최적화합니다. 
- * @param element - 최적화할 요소 - */ - private optimizeMailyImages(element: HTMLElement): void { - const images = element.querySelectorAll('img'); - - images.forEach((img) => { - // data-src 속성을 src로 변환 (lazy loading) - const dataSrc = img.getAttribute('data-src'); - if (dataSrc && !img.src) { - img.src = dataSrc; - } - - // 상대 경로를 절대 경로로 변환 - if (img.src && img.src.startsWith('//')) { - img.src = 'https:' + img.src; - } - - // 메일리 CDN 경로 처리 - if (img.src && img.src.startsWith('/')) { - img.src = 'https://maily.so' + img.src; - } - - // 불필요한 속성 제거 - img.removeAttribute('data-src'); - img.removeAttribute('loading'); - img.removeAttribute('srcset'); // 단순화를 위해 srcset 제거 - }); - } - - /** - * 메일리 뉴스레터의 링크를 최적화합니다. - * @param element - 최적화할 요소 - */ - private optimizeMailyLinks(element: HTMLElement): void { - const links = element.querySelectorAll('a'); - - links.forEach((link) => { - // 상대 경로를 절대 경로로 변환 - if (link.href && link.href.startsWith('/')) { - link.href = 'https://maily.so' + link.href; - } - - // 새 탭에서 열기 설정 - link.target = '_blank'; - link.rel = 'noopener noreferrer'; - }); - } - - /** - * 메일리 뉴스레터의 스타일을 정리합니다. - * @param element - 정리할 요소 - */ - private cleanupMailyStyles(element: HTMLElement): void { - // 인라인 스타일 중 불필요한 것들 제거 - const elementsWithStyle = element.querySelectorAll('[style]'); - - elementsWithStyle.forEach((el) => { - const style = el.getAttribute('style'); - if (style) { - // 폰트 크기와 색상만 유지하고 나머지는 제거 - const keepStyles = style.match(/(font-size|color|background-color|text-align):[^;]+;?/g); - if (keepStyles) { - el.setAttribute('style', keepStyles.join(' ')); - } else { - el.removeAttribute('style'); - } - } - }); - } - - /** - * 불필요한 요소들을 제거합니다. 
- * @param element - 정리할 요소 - */ - private removeUnwantedElements(element: HTMLElement): void { - // 제거할 요소들의 셀렉터 - const unwantedSelectors = [ - 'script', - 'style', - 'noscript', - 'iframe[src*="tracking"]', - 'iframe[src*="analytics"]', - 'iframe[src*="google-analytics"]', - '[class*="ad"]', - '[class*="advertisement"]', - '[class*="tracking"]', - '[class*="analytics"]', - '[id*="tracking"]', - '[id*="analytics"]', - '.footer', - '.header', - '.navigation', - '.nav', - '.sidebar', - '.login', - '.auth', - '.subscription', - // 메일리 특화 제거 셀렉터 - '[class*="login"]', - '[class*="auth"]', - '[class*="signup"]', - '[class*="subscribe"]', - '[class*="maily-nav"]', - '[class*="maily-header"]', - '[class*="maily-footer"]', - '[data-testid="login-form"]', - '[data-testid="auth-form"]', - ]; - - unwantedSelectors.forEach((selector) => { - const elements = element.querySelectorAll(selector); - elements.forEach((el) => el.remove()); - }); } } diff --git a/src/modules/pre-handler/handlers/news-site.handler.ts b/src/modules/pre-handler/handlers/news-site.handler.ts index 0d07313..9139235 100644 --- a/src/modules/pre-handler/handlers/news-site.handler.ts +++ b/src/modules/pre-handler/handlers/news-site.handler.ts @@ -1,5 +1,16 @@ +/** + * 뉴스 사이트용 리팩토링된 콘텐츠 핸들러 + * - AbstractContentHandler 기반 + * - SOLID 원칙 및 함수형 프로그래밍 적용 + */ import { Injectable, Logger } from '@nestjs/common'; -import { IContentHandler } from '../interfaces/content-handler.interface'; +import { AbstractContentHandler } from '../base/abstract-content-handler'; +import { + HttpRequestConfig, + DomConfig, + ContentCleaningConfig, + TitleExtractionConfig, +} from '../types/content-extraction.types'; import { PreHandleResult } from '../dto/pre-handle-result.dto'; /** @@ -192,80 +203,108 @@ const NEWS_SITE_TRANSFORMATIONS: Record URL> = { }; /** - * A content handler specifically for news websites. 
- * This handler detects news site URLs and transforms them to more - * accessible versions, often bypassing paywalls or using cleaner layouts. + * 뉴스 사이트 핸들러 */ @Injectable() -export class NewsSiteHandler implements IContentHandler { - private readonly logger = new Logger(NewsSiteHandler.name); +export class NewsSiteHandler extends AbstractContentHandler { + protected readonly logger = new Logger(NewsSiteHandler.name); /** - * Checks if the URL is from a supported news website. - * @param url - The URL to check. - * @returns `true` if the URL is from a supported news website. + * 뉴스 사이트 처리 여부 + * @param url 검사할 URL */ public canHandle(url: URL): boolean { return Object.keys(NEWS_SITE_TRANSFORMATIONS).some((domain) => url.hostname.endsWith(domain)); } /** - * Processes news site URLs by transforming them to more accessible versions. - * @param url - The news site URL to handle. - * @returns A `PreHandleResult` with the transformed URL, or `null` on failure. + * 핸들러 이름 */ - public handle(url: URL): Promise { - const domain = Object.keys(NEWS_SITE_TRANSFORMATIONS).find((d) => url.hostname.endsWith(d)); - - if (!domain) { - return Promise.resolve(null); - } - - try { - const transform = NEWS_SITE_TRANSFORMATIONS[domain]; - const newUrl = transform(url); + protected get handlerName(): string { + return '뉴스사이트 핸들러'; + } - // Extract potential title from URL - let title: string | undefined; - const siteName = this.getSiteName(domain); + /** + * HTTP 요청 설정 (뉴스사이트는 표준 설정) + */ + protected get httpConfig(): HttpRequestConfig { + return { + userAgent: '', + timeout: 10000, + headers: {}, + redirect: 'follow', + }; + } - // Try to extract article title from URL path - const pathParts = url.pathname.split('/').filter((part) => part.length > 0); - if (pathParts.length > 0) { - // Look for article identifiers in the path - const lastPart = pathParts[pathParts.length - 1]; - if (lastPart.includes('-') || lastPart.includes('_')) { - // Convert URL slug to title - title = lastPart 
- .replace(/[-_]/g, ' ') - .replace(/\.(html|htm|php|asp|aspx)$/i, '') - .replace(/\b\w/g, (l) => l.toUpperCase()) - .trim(); + /** + * DOM 생성 설정 (표준) + */ + protected get domConfig(): DomConfig { + return { + userAgent: '', + resources: 'usable', + runScripts: 'outside-only', + pretendToBeVisual: true, + }; + } - if (title.length > 60) { - title = title.substring(0, 60) + '...'; - } + /** + * 콘텐츠 정제 설정 (뉴스사이트용) + */ + protected get cleaningConfig(): ContentCleaningConfig { + return { + removeUnwantedElements: true, + cleanupStyles: true, + cleanupLinks: true, + cleanupImages: true, + cleanupText: true, + refineTitle: true, + }; + } - title = `${siteName}: ${title}`; - } - } + /** + * 제목 추출 설정 (뉴스사이트용) + */ + protected get titleConfig(): TitleExtractionConfig { + return { + selectors: ['meta[property="og:title"]', 'title', 'h1'], + patterns: [/[-_][^\s]+/g], + siteSpecificPatterns: {}, + }; + } - // Fallback title - if (!title) { - title = `${siteName} Article`; - } + /** + * 본문 콘텐츠 추출용 셀렉터 (뉴스사이트용) + */ + protected get contentSelectors(): readonly string[] { + return ['article', 'main', '.article-body', '.content', '#article-body']; + } - return Promise.resolve({ - url: newUrl.href, - title, - contentType: 'text/html', - }); + /** + * 뉴스사이트는 URL 변환 후 표준 추출 프로세스 사용 + */ + public async handle(url: URL): Promise { + try { + const transformedUrl = this.transformUrl(url); + return await super.handle(transformedUrl); } catch (error) { this.logger.warn(`NewsSiteHandler failed for ${url.href}: ${(error as Error).message}`); - return Promise.resolve(null); + return null; } } + /** + * 도메인별 URL 변환 + * @param url 원본 URL + */ + private transformUrl(url: URL): URL { + const domain = Object.keys(NEWS_SITE_TRANSFORMATIONS).find((d) => url.hostname.endsWith(d)); + if (domain) { + return NEWS_SITE_TRANSFORMATIONS[domain](url); + } + return url; + } + /** * Gets a human-readable site name from domain. * @param domain - The domain name. 
diff --git a/src/modules/pre-handler/handlers/pdf.handler.ts b/src/modules/pre-handler/handlers/pdf.handler.ts index b0868b8..5f79add 100644 --- a/src/modules/pre-handler/handlers/pdf.handler.ts +++ b/src/modules/pre-handler/handlers/pdf.handler.ts @@ -1,57 +1,115 @@ +/** + * PDF 파일을 위한 리팩토링된 콘텐츠 핸들러 + * - AbstractContentHandler 기반 + * - SOLID 원칙 및 함수형 프로그래밍 적용 + */ import { Injectable, Logger } from '@nestjs/common'; -import { IContentHandler } from '../interfaces/content-handler.interface'; +import { AbstractContentHandler } from '../base/abstract-content-handler'; +import { + HttpRequestConfig, + DomConfig, + ContentCleaningConfig, + TitleExtractionConfig, +} from '../types/content-extraction.types'; import { PreHandleResult } from '../dto/pre-handle-result.dto'; /** - * A content handler specifically for PDF files. - * This handler detects PDF URLs and marks them appropriately - * so that the main scraping service can handle them differently. + * PDF 파일 핸들러 */ @Injectable() -export class PdfHandler implements IContentHandler { - private readonly logger = new Logger(PdfHandler.name); +export class PdfHandler extends AbstractContentHandler { + protected readonly logger = new Logger(PdfHandler.name); /** - * Checks if the URL points to a PDF file. - * @param url - The URL to check. - * @returns `true` if the URL appears to be a PDF file. + * PDF 파일 처리 여부 + * @param url 검사할 URL */ public canHandle(url: URL): boolean { - // Check file extension if (url.pathname.toLowerCase().endsWith('.pdf')) { return true; } - - // Check for common PDF hosting patterns const pdfPatterns = [/\/pdf\//i, /\.pdf$/i, /\/download.*\.pdf/i, /\/files.*\.pdf/i, /\/documents.*\.pdf/i]; - return pdfPatterns.some((pattern) => pattern.test(url.pathname)); } /** - * Processes PDF URLs by marking them with the correct content type. - * @param url - The URL of the PDF to handle. - * @returns A `PreHandleResult` with PDF content type, or `null` on failure. 
+ * 핸들러 이름 + */ + protected get handlerName(): string { + return 'PDF 핸들러'; + } + + /** + * HTTP 요청 설정 (PDF는 별도 요청 불필요) + */ + protected get httpConfig(): HttpRequestConfig { + return { + userAgent: '', + timeout: 0, + headers: {}, + redirect: 'follow', + }; + } + + /** + * DOM 생성 설정 (PDF는 사용하지 않음) + */ + protected get domConfig(): DomConfig { + return { + userAgent: '', + resources: 'usable', + runScripts: 'outside-only', + pretendToBeVisual: false, + }; + } + + /** + * 콘텐츠 정제 설정 (PDF는 정제 불필요) + */ + protected get cleaningConfig(): ContentCleaningConfig { + return { + removeUnwantedElements: false, + cleanupStyles: false, + cleanupLinks: false, + cleanupImages: false, + cleanupText: false, + refineTitle: true, + }; + } + + /** + * 제목 추출 설정 (파일명 기반) + */ + protected get titleConfig(): TitleExtractionConfig { + return { + selectors: [], + patterns: [/\.pdf$/i, /[-_]/g], + siteSpecificPatterns: {}, + }; + } + + /** + * 본문 콘텐츠 추출용 셀렉터 (사용하지 않음) + */ + protected get contentSelectors(): readonly string[] { + return []; + } + + /** + * PDF는 별도 본문 추출 없이 타입 마킹만 수행 */ public handle(url: URL): Promise { try { - // For PDF files, we don't extract content here but mark the content type - // The main service will handle PDF extraction using appropriate tools - - // Try to extract title from URL path let title: string | undefined; const pathParts = url.pathname.split('/'); const filename = pathParts[pathParts.length - 1]; - if (filename && filename.includes('.pdf')) { - // Remove .pdf extension and clean up the filename for title title = filename .replace(/\.pdf$/i, '') .replace(/[-_]/g, ' ') .replace(/\b\w/g, (l) => l.toUpperCase()) .trim(); } - return Promise.resolve({ url: url.href, title, diff --git a/src/modules/pre-handler/handlers/readability.handler.ts b/src/modules/pre-handler/handlers/readability.handler.ts index 740452b..8d80f11 100644 --- a/src/modules/pre-handler/handlers/readability.handler.ts +++ b/src/modules/pre-handler/handlers/readability.handler.ts @@ -1,42 
+1,96 @@ import { Injectable, Logger } from '@nestjs/common'; import { JSDOM } from 'jsdom'; import { Readability } from '@mozilla/readability'; -import { IContentHandler } from '../interfaces/content-handler.interface'; +import { AbstractContentHandler } from '../base/abstract-content-handler'; +import { + HttpRequestConfig, + DomConfig, + ContentCleaningConfig, + TitleExtractionConfig, +} from '../types/content-extraction.types'; import { PreHandleResult } from '../dto/pre-handle-result.dto'; /** - * Readability 파싱 결과 타입 - */ -interface ReadabilityResult { - title: string | null; - content: string | null; - textContent: string | null; - length: number; - excerpt: string | null; - byline: string | null; - dir: string | null; - siteName: string | null; -} - -/** - * A content handler that uses Mozilla's Readability library to extract - * the main readable content from a generic webpage. + * Readability 기반 리팩토링된 콘텐츠 핸들러 + * - AbstractContentHandler 기반 + * - SOLID 원칙 및 함수형 프로그래밍 적용 */ @Injectable() -export class ReadabilityHandler implements IContentHandler { - private readonly logger = new Logger(ReadabilityHandler.name); +export class ReadabilityHandler extends AbstractContentHandler { + protected readonly logger = new Logger(ReadabilityHandler.name); private readonly USER_AGENT = 'Mozilla/5.0 (compatible; Googlebot/2.1; +http://www.google.com/bot.html)'; /** - * This handler can attempt to process any HTTP/HTTPS URL. - * It should typically be placed last in the handler chain as a fallback. - * @param url - The URL to check. - * @returns `true` if the protocol is http or https. 
+ * Readability는 모든 http/https URL을 처리할 수 있음 + * @param url 검사할 URL */ public canHandle(url: URL): boolean { return ['http:', 'https:'].includes(url.protocol); } + /** + * 핸들러 이름 + */ + protected get handlerName(): string { + return 'Readability 핸들러'; + } + + /** + * HTTP 요청 설정 (표준) + */ + protected get httpConfig(): HttpRequestConfig { + return { + userAgent: this.USER_AGENT, + timeout: 10000, + headers: {}, + redirect: 'follow', + }; + } + + /** + * DOM 생성 설정 (스크립트 활성화) + */ + protected get domConfig(): DomConfig { + return { + userAgent: this.httpConfig.userAgent, + resources: 'usable', + runScripts: 'dangerously', + pretendToBeVisual: true, + }; + } + + /** + * 콘텐츠 정제 설정 (Readability는 자체 정제) + */ + protected get cleaningConfig(): ContentCleaningConfig { + return { + removeUnwantedElements: false, + cleanupStyles: false, + cleanupLinks: false, + cleanupImages: false, + cleanupText: false, + refineTitle: true, + }; + } + + /** + * 제목 추출 설정 (Readability 결과 기반) + */ + protected get titleConfig(): TitleExtractionConfig { + return { + selectors: [], + patterns: [], + siteSpecificPatterns: {}, + }; + } + + /** + * 본문 콘텐츠 추출용 셀렉터 (Readability는 사용하지 않음) + */ + protected get contentSelectors(): readonly string[] { + return []; + } + /** * Fetches the webpage, parses it with JSDOM, and extracts the article content. * @param url - The URL to handle. 
@@ -44,28 +98,24 @@ export class ReadabilityHandler implements IContentHandler { */ public async handle(url: URL): Promise { try { - // 첫 번째 시도: JavaScript 실행 활성화 + // 1차 시도: 스크립트 활성화 DOM let dom = await this.createDOMWithScripts(url.href); - let article = await this.extractContentFromDOM(dom); - - // JavaScript 실행 중 오류가 발생하면 두 번째 시도 + let article = this.extractContentFromDOM(dom); + // 실패 시: 스크립트 비활성화 DOM 재시도 if (!article?.content) { this.logger.debug(`First attempt failed, trying without scripts for ${url.href}`); dom = await this.createDOMWithoutScripts(url.href); - article = await this.extractContentFromDOM(dom); + article = this.extractContentFromDOM(dom); } - if (!article?.content) { this.logger.debug(`No readable content found for ${url.href}`); return null; } - this.logger.log(`Successfully extracted readable content: ${article.content.length} chars`); - return { url: url.href, title: article.title ?? undefined, - content: article.content, + content: article.content ?? undefined, contentType: 'text/html', }; } catch (error) { @@ -75,35 +125,23 @@ export class ReadabilityHandler implements IContentHandler { } /** - * JavaScript 실행을 활성화한 JSDOM 생성 - * @param url - 처리할 URL - * @returns JSDOM 인스턴스 + * 스크립트 활성화 JSDOM 생성 */ private async createDOMWithScripts(url: string): Promise { - const dom = await JSDOM.fromURL(url, { - userAgent: this.USER_AGENT, + return JSDOM.fromURL(url, { + userAgent: this.httpConfig.userAgent, resources: 'usable', runScripts: 'dangerously', pretendToBeVisual: true, }); - - // 브라우저 API polyfill 추가 - this.addBrowserPolyfills(dom.window as unknown as Window & typeof globalThis); - - // 에러 핸들링 추가 - this.addErrorHandling(dom.window as unknown as Window & typeof globalThis); - - return dom; } /** - * JavaScript 실행을 비활성화한 JSDOM 생성 - * @param url - 처리할 URL - * @returns JSDOM 인스턴스 + * 스크립트 비활성화 JSDOM 생성 */ private async createDOMWithoutScripts(url: string): Promise { return JSDOM.fromURL(url, { - userAgent: this.USER_AGENT, + userAgent: 
this.httpConfig.userAgent, resources: 'usable', runScripts: 'outside-only', pretendToBeVisual: true, @@ -111,19 +149,14 @@ export class ReadabilityHandler implements IContentHandler { } /** - * DOM에서 콘텐츠 추출 - * @param dom - JSDOM 인스턴스 - * @returns 추출된 아티클 또는 null + * Readability로 콘텐츠 추출 */ - private async extractContentFromDOM(dom: JSDOM): Promise { + private extractContentFromDOM(dom: JSDOM): { title?: string; content?: string } | null { try { - // 페이지 로딩 대기 - await new Promise((resolve) => setTimeout(resolve, 2000)); - const reader = new Readability(dom.window.document); const article = reader.parse(); - - return article as ReadabilityResult | null; + if (!article) return null; + return { title: article.title ?? undefined, content: article.content ?? undefined }; } catch (error) { this.logger.debug(`Content extraction failed: ${(error as Error).message}`); return null; diff --git a/src/modules/pre-handler/handlers/rss.handler.ts b/src/modules/pre-handler/handlers/rss.handler.ts index 7c3966f..17b54a6 100644 --- a/src/modules/pre-handler/handlers/rss.handler.ts +++ b/src/modules/pre-handler/handlers/rss.handler.ts @@ -1,29 +1,34 @@ +/** + * RSS/Atom 피드용 리팩토링된 콘텐츠 핸들러 + * - AbstractContentHandler 기반 + * - SOLID 원칙 및 함수형 프로그래밍 적용 + */ import { Injectable, Logger } from '@nestjs/common'; -import { IContentHandler } from '../interfaces/content-handler.interface'; +import { AbstractContentHandler } from '../base/abstract-content-handler'; +import { + HttpRequestConfig, + DomConfig, + ContentCleaningConfig, + TitleExtractionConfig, +} from '../types/content-extraction.types'; import { PreHandleResult } from '../dto/pre-handle-result.dto'; /** - * A content handler for RSS/Atom feeds. - * This handler detects feed URLs and marks them appropriately - * for specialized feed processing. 
+ * RSS/Atom 피드 핸들러 */ @Injectable() -export class RssHandler implements IContentHandler { - private readonly logger = new Logger(RssHandler.name); +export class RssHandler extends AbstractContentHandler { + protected readonly logger = new Logger(RssHandler.name); /** - * Checks if the URL points to an RSS or Atom feed. - * @param url - The URL to check. - * @returns `true` if the URL appears to be a feed. + * RSS/Atom 피드 처리 여부 + * @param url 검사할 URL */ public canHandle(url: URL): boolean { - // Check file extension const feedExtensions = ['.rss', '.xml', '.atom']; if (feedExtensions.some((ext) => url.pathname.toLowerCase().endsWith(ext))) { return true; } - - // Check for common feed URL patterns const feedPatterns = [ /\/feed\/?$/i, /\/feeds?\//i, @@ -35,21 +40,78 @@ export class RssHandler implements IContentHandler { /\/atom\.xml$/i, /\/feed\.xml$/i, ]; - return feedPatterns.some((pattern) => pattern.test(url.pathname)); } /** - * Processes feed URLs by marking them with the correct content type. - * @param url - The URL of the feed to handle. - * @returns A `PreHandleResult` with feed content type, or `null` on failure. 
+ * 핸들러 이름 + */ + protected get handlerName(): string { + return 'RSS 핸들러'; + } + + /** + * HTTP 요청 설정 (피드는 별도 요청 불필요) + */ + protected get httpConfig(): HttpRequestConfig { + return { + userAgent: '', + timeout: 0, + headers: {}, + redirect: 'follow', + }; + } + + /** + * DOM 생성 설정 (피드는 사용하지 않음) + */ + protected get domConfig(): DomConfig { + return { + userAgent: '', + resources: 'usable', + runScripts: 'outside-only', + pretendToBeVisual: false, + }; + } + + /** + * 콘텐츠 정제 설정 (피드는 정제 불필요) + */ + protected get cleaningConfig(): ContentCleaningConfig { + return { + removeUnwantedElements: false, + cleanupStyles: false, + cleanupLinks: false, + cleanupImages: false, + cleanupText: false, + refineTitle: true, + }; + } + + /** + * 제목 추출 설정 (URL/도메인 기반) + */ + protected get titleConfig(): TitleExtractionConfig { + return { + selectors: [], + patterns: [/[-_]/g], + siteSpecificPatterns: {}, + }; + } + + /** + * 본문 콘텐츠 추출용 셀렉터 (사용하지 않음) + */ + protected get contentSelectors(): readonly string[] { + return []; + } + + /** + * RSS/Atom 피드는 본문 추출 없이 타입 마킹만 수행 */ public handle(url: URL): Promise { try { - // Try to extract title from URL or domain let title: string | undefined; - - // Extract from path const pathParts = url.pathname.split('/').filter((part) => part.length > 0); if (pathParts.length > 0) { const lastPart = pathParts[pathParts.length - 1]; @@ -64,18 +126,13 @@ export class RssHandler implements IContentHandler { .trim(); } } - - // Fallback to domain name if (!title) { title = `${url.hostname} Feed`; } - - // Determine content type based on URL patterns let contentType = 'application/rss+xml'; if (url.pathname.toLowerCase().includes('atom')) { contentType = 'application/atom+xml'; } - return Promise.resolve({ url: url.href, title, diff --git a/src/modules/pre-handler/handlers/stibee.handler.ts b/src/modules/pre-handler/handlers/stibee.handler.ts index ee86233..373f9f9 100644 --- a/src/modules/pre-handler/handlers/stibee.handler.ts +++ 
b/src/modules/pre-handler/handlers/stibee.handler.ts @@ -1,165 +1,113 @@ -import { Injectable, Logger } from '@nestjs/common'; -import { JSDOM } from 'jsdom'; -import { IContentHandler } from '../interfaces/content-handler.interface'; -import { PreHandleResult } from '../dto/pre-handle-result.dto'; +/** + * 스티비(Stibee) 뉴스레터 플랫폼을 위한 리팩토링된 콘텐츠 핸들러 + * - AbstractContentHandler 기반 + * - SOLID 원칙 및 함수형 프로그래밍 적용 + */ +import { Injectable } from '@nestjs/common'; +import { AbstractContentHandler } from '../base/abstract-content-handler'; +import { + HttpRequestConfig, + DomConfig, + ContentCleaningConfig, + TitleExtractionConfig, +} from '../types/content-extraction.types'; /** - * 스티비(Stibee) 뉴스레터 플랫폼을 위한 전용 콘텐츠 핸들러 - * 스티비 뉴스레터의 특별한 구조와 스타일을 고려하여 최적화된 콘텐츠 추출을 제공합니다. + * 스티비 뉴스레터 핸들러 */ @Injectable() -export class StibeeHandler implements IContentHandler { - private readonly logger = new Logger(StibeeHandler.name); - +export class StibeeHandler extends AbstractContentHandler { /** - * 스티비 뉴스레터 URL인지 확인합니다. - * @param url - 확인할 URL - * @returns 스티비 URL이면 true, 아니면 false + * 스티비 도메인 처리 여부 + * @param url 검사할 URL */ public canHandle(url: URL): boolean { return url.hostname.endsWith('stibee.com'); } /** - * 스티비 뉴스레터 콘텐츠를 추출합니다. - * @param url - 처리할 스티비 URL - * @returns 추출된 콘텐츠 또는 null + * 핸들러 이름 */ - public async handle(url: URL): Promise { - try { - this.logger.debug(`스티비 뉴스레터 콘텐츠 추출 시작: ${url.href}`); - - // 스티비 뉴스레터 페이지에서 콘텐츠를 가져옵니다 - const result = await this.extractStibeeContent(url); - - if (!result.content) { - this.logger.debug(`스티비 콘텐츠 추출 실패: ${url.href}`); - return null; - } - - this.logger.log(`스티비 콘텐츠 추출 성공: ${result.content.length} 글자`); - - return { - url: url.href, - title: result.title, - content: result.content, - contentType: 'text/html', - }; - } catch (error) { - this.logger.warn(`스티비 핸들러 처리 실패 ${url.href}: ${(error as Error).message}`); - return null; - } + protected get handlerName(): string { + return '스티비 핸들러'; } /** - * 스티비 뉴스레터에서 콘텐츠를 추출합니다. 
- * @param url - 스티비 뉴스레터 URL - * @returns 추출된 제목과 콘텐츠 + * HTTP 요청 설정 */ - private async extractStibeeContent(url: URL): Promise<{ - title?: string; - content?: string; - }> { - try { - // AbortController로 타임아웃 설정 - const controller = new AbortController(); - const timeoutId = setTimeout(() => controller.abort(), 20000); // 20초 타임아웃 - - // 스티비 뉴스레터 접근을 위한 최적화된 헤더 설정 - const response = await fetch(url.href, { - headers: { - 'User-Agent': - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', - Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', - 'Accept-Language': 'ko-KR,ko;q=0.9,en;q=0.8', - 'Accept-Encoding': 'gzip, deflate, br', - Connection: 'keep-alive', - 'Upgrade-Insecure-Requests': '1', - 'Sec-Fetch-Dest': 'document', - 'Sec-Fetch-Mode': 'navigate', - 'Sec-Fetch-Site': 'none', - 'Cache-Control': 'no-cache', - }, - redirect: 'follow', - signal: controller.signal, - }); - - clearTimeout(timeoutId); - - if (!response.ok) { - throw new Error(`HTTP ${response.status}: ${response.statusText}`); - } - - const html = await response.text(); - this.logger.debug(`스티비 HTML 가져오기 성공, 길이: ${html.length}`); - - // DOM 파싱 - const dom = new JSDOM(html); - const document = dom.window.document; - - // 제목 추출 - const title = this.extractStibeeTitle(document); - - // 콘텐츠 추출 및 최적화 - const content = this.extractAndOptimizeStibeeContent(document); - - return { - title, - content, - }; - } catch (error) { - this.logger.warn(`스티비 콘텐츠 추출 실패 ${url.href}: ${(error as Error).message}`); - return { - title: undefined, - content: undefined, - }; - } + protected get httpConfig(): HttpRequestConfig { + return { + userAgent: + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', + timeout: 20000, + headers: { + Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', + 'Accept-Language': 
'ko-KR,ko;q=0.9,en;q=0.8', + 'Accept-Encoding': 'gzip, deflate, br', + Connection: 'keep-alive', + 'Upgrade-Insecure-Requests': '1', + 'Sec-Fetch-Dest': 'document', + 'Sec-Fetch-Mode': 'navigate', + 'Sec-Fetch-Site': 'none', + 'Cache-Control': 'no-cache', + }, + redirect: 'follow', + }; } /** - * 스티비 뉴스레터에서 제목을 추출합니다. - * @param document - DOM 문서 - * @returns 추출된 제목 + * DOM 생성 설정 */ - private extractStibeeTitle(document: Document): string | undefined { - // 스티비 뉴스레터 제목 추출을 위한 다양한 셀렉터 시도 - const titleSelectors = [ - 'meta[property="og:title"]', - 'meta[name="twitter:title"]', - 'title', - 'h1', - '.newsletter-title', - '.post-title', - '[class*="title"]', - '[class*="headline"]', - ]; + protected get domConfig(): DomConfig { + return { + userAgent: this.httpConfig.userAgent, + resources: 'usable', + runScripts: 'outside-only', + pretendToBeVisual: true, + }; + } - for (const selector of titleSelectors) { - const element = document.querySelector(selector); - if (element) { - let title = element.getAttribute('content') || element.textContent; - if (title?.trim()) { - // 스티비 관련 불필요한 텍스트 제거 - title = title.trim(); - title = title.replace(/\s*-\s*스티비$/, ''); - title = title.replace(/\s*\|\s*Stibee$/, ''); - title = title.replace(/\s*::.*$/, ''); - return title.trim(); - } - } - } + /** + * 콘텐츠 정제 설정 + */ + protected get cleaningConfig(): ContentCleaningConfig { + return { + removeUnwantedElements: true, + cleanupStyles: true, + cleanupLinks: true, + cleanupImages: true, + cleanupText: true, + refineTitle: true, + }; + } - return undefined; + /** + * 제목 추출 설정 + */ + protected get titleConfig(): TitleExtractionConfig { + return { + selectors: [ + 'meta[property="og:title"]', + 'meta[name="twitter:title"]', + 'title', + 'h1', + '.newsletter-title', + '.post-title', + '[class*="title"]', + '[class*="headline"]', + ], + patterns: [/\s*-\s*스티비$/, /\s*\|\s*Stibee$/, /\s*::.*$/], + siteSpecificPatterns: { + 'stibee.com': [/\s*-\s*스티비$/, /\s*\|\s*Stibee$/], + }, + }; } /** - * 
스티비 뉴스레터 콘텐츠를 추출하고 최적화합니다. - * @param document - DOM 문서 - * @returns 최적화된 HTML 콘텐츠 + * 본문 콘텐츠 추출용 셀렉터 */ - private extractAndOptimizeStibeeContent(document: Document): string | undefined { - // 스티비 뉴스레터 콘텐츠 추출을 위한 셀렉터들 - const contentSelectors = [ + protected get contentSelectors(): readonly string[] { + return [ 'article', '[class*="content"]', '[class*="newsletter"]', @@ -169,145 +117,5 @@ export class StibeeHandler implements IContentHandler { '.container', '#content', ]; - - let contentElement: Element | null = null; - - // 가장 적절한 콘텐츠 컨테이너 찾기 - for (const selector of contentSelectors) { - const element = document.querySelector(selector); - if (element && element.textContent && element.textContent.trim().length > 100) { - contentElement = element; - break; - } - } - - // 콘텐츠 컨테이너를 찾지 못한 경우 body 사용 - if (!contentElement) { - contentElement = document.body; - } - - if (!contentElement) { - return undefined; - } - - // 콘텐츠 정리 및 최적화 - this.optimizeStibeeContent(contentElement as HTMLElement); - - // 불필요한 요소들 제거 - this.removeUnwantedElements(contentElement as HTMLElement); - - return contentElement.outerHTML; - } - - /** - * 스티비 뉴스레터 콘텐츠를 최적화합니다. - * @param element - 최적화할 HTML 요소 - */ - private optimizeStibeeContent(element: HTMLElement): void { - // 이미지 최적화 - this.optimizeStibeeImages(element); - - // 링크 최적화 - this.optimizeStibeeLinks(element); - - // 스타일 정리 - this.cleanupStibeeStyles(element); - } - - /** - * 스티비 뉴스레터의 이미지를 최적화합니다. 
- * @param element - 최적화할 요소 - */ - private optimizeStibeeImages(element: HTMLElement): void { - const images = element.querySelectorAll('img'); - - images.forEach((img) => { - // data-src 속성을 src로 변환 (lazy loading) - const dataSrc = img.getAttribute('data-src'); - if (dataSrc && !img.src) { - img.src = dataSrc; - } - - // 상대 경로를 절대 경로로 변환 - if (img.src && img.src.startsWith('//')) { - img.src = 'https:' + img.src; - } - - // 불필요한 속성 제거 - img.removeAttribute('data-src'); - img.removeAttribute('loading'); - img.removeAttribute('srcset'); // 단순화를 위해 srcset 제거 - }); - } - - /** - * 스티비 뉴스레터의 링크를 최적화합니다. - * @param element - 최적화할 요소 - */ - private optimizeStibeeLinks(element: HTMLElement): void { - const links = element.querySelectorAll('a'); - - links.forEach((link) => { - // 상대 경로를 절대 경로로 변환 - if (link.href && link.href.startsWith('/')) { - link.href = 'https://stibee.com' + link.href; - } - - // 새 탭에서 열기 설정 - link.target = '_blank'; - link.rel = 'noopener noreferrer'; - }); - } - - /** - * 스티비 뉴스레터의 스타일을 정리합니다. - * @param element - 정리할 요소 - */ - private cleanupStibeeStyles(element: HTMLElement): void { - // 인라인 스타일 중 불필요한 것들 제거 - const elementsWithStyle = element.querySelectorAll('[style]'); - - elementsWithStyle.forEach((el) => { - const style = el.getAttribute('style'); - if (style) { - // 폰트 크기와 색상만 유지하고 나머지는 제거 - const keepStyles = style.match(/(font-size|color|background-color):[^;]+;?/g); - if (keepStyles) { - el.setAttribute('style', keepStyles.join(' ')); - } else { - el.removeAttribute('style'); - } - } - }); - } - - /** - * 불필요한 요소들을 제거합니다. 
- * @param element - 정리할 요소 - */ - private removeUnwantedElements(element: HTMLElement): void { - // 제거할 요소들의 셀렉터 - const unwantedSelectors = [ - 'script', - 'style', - 'noscript', - 'iframe[src*="tracking"]', - 'iframe[src*="analytics"]', - '[class*="ad"]', - '[class*="advertisement"]', - '[class*="tracking"]', - '[class*="analytics"]', - '[id*="tracking"]', - '[id*="analytics"]', - '.footer', - '.header', - '.navigation', - '.sidebar', - ]; - - unwantedSelectors.forEach((selector) => { - const elements = element.querySelectorAll(selector); - elements.forEach((el) => el.remove()); - }); } } diff --git a/src/modules/pre-handler/handlers/youtube.handler.ts b/src/modules/pre-handler/handlers/youtube.handler.ts index 95d3d9e..5f02e31 100644 --- a/src/modules/pre-handler/handlers/youtube.handler.ts +++ b/src/modules/pre-handler/handlers/youtube.handler.ts @@ -1,133 +1,235 @@ +/** + * YouTube 동영상용 리팩토링된 콘텐츠 핸들러 + * - AbstractContentHandler 기반 + * - SOLID 원칙 및 함수형 프로그래밍 적용 + */ import { Injectable, Logger } from '@nestjs/common'; -import { IContentHandler } from '../interfaces/content-handler.interface'; +import { AbstractContentHandler } from '../base/abstract-content-handler'; +import { + HttpRequestConfig, + DomConfig, + ContentCleaningConfig, + TitleExtractionConfig, +} from '../types/content-extraction.types'; import { PreHandleResult } from '../dto/pre-handle-result.dto'; +import { fetchHtml, createDom } from '../utils/functional-utils'; /** - * A content handler specifically for YouTube videos. - * This handler detects YouTube URLs and attempts to extract - * video metadata and transcripts when available. + * YouTube 동영상 핸들러 */ @Injectable() -export class YoutubeHandler implements IContentHandler { - private readonly logger = new Logger(YoutubeHandler.name); +export class YoutubeHandler extends AbstractContentHandler { + protected readonly logger = new Logger(YoutubeHandler.name); /** - * Checks if the URL is a YouTube video. - * @param url - The URL to check. 
- * @returns `true` if the URL is a YouTube video. + * YouTube 동영상 처리 여부 + * @param url 검사할 URL */ public canHandle(url: URL): boolean { const youtubeHosts = ['youtube.com', 'www.youtube.com', 'youtu.be', 'm.youtube.com']; - if (!youtubeHosts.includes(url.hostname)) { return false; } - - // Check for video patterns if (url.hostname === 'youtu.be') { - return url.pathname.length > 1; // Has video ID + return url.pathname.length > 1; } - - // For youtube.com domains return url.pathname.includes('/watch') || url.pathname.includes('/embed/') || url.pathname.includes('/v/'); } /** - * Processes YouTube URLs to extract video information. - * @param url - The YouTube URL to handle. - * @returns A `PreHandleResult` with video information, or `null` on failure. + * 핸들러 이름 + */ + protected get handlerName(): string { + return 'YouTube 핸들러'; + } + + /** + * HTTP 요청 설정 (YouTube는 별도 요청 불필요) + */ + protected get httpConfig(): HttpRequestConfig { + return { + userAgent: '', + timeout: 0, + headers: {}, + redirect: 'follow', + }; + } + + /** + * DOM 생성 설정 (YouTube는 사용하지 않음) + */ + protected get domConfig(): DomConfig { + return { + userAgent: '', + resources: 'usable', + runScripts: 'outside-only', + pretendToBeVisual: false, + }; + } + + /** + * 콘텐츠 정제 설정 (YouTube는 정제 불필요) + */ + protected get cleaningConfig(): ContentCleaningConfig { + return { + removeUnwantedElements: false, + cleanupStyles: false, + cleanupLinks: false, + cleanupImages: false, + cleanupText: false, + refineTitle: true, + }; + } + + /** + * 제목 추출 설정 (videoId 기반) + */ + protected get titleConfig(): TitleExtractionConfig { + return { + selectors: [], + patterns: [], + siteSpecificPatterns: {}, + }; + } + + /** + * 본문 콘텐츠 추출용 셀렉터 (사용하지 않음) + */ + protected get contentSelectors(): readonly string[] { + return []; + } + + /** + * YouTube 동영상은 HTML을 직접 fetch해서 타이틀, 설명, 자막(가능하면)을 추출 */ - public handle(url: URL): Promise { + public async handle(url: URL): Promise { try { const videoId = this.extractVideoId(url); if 
(!videoId) { this.logger.warn(`Could not extract video ID from: ${url.href}`); - return Promise.resolve(null); + return null; } - - // For now, we'll return basic information - // In a full implementation, you might want to: - // 1. Fetch video metadata from YouTube API - // 2. Extract auto-generated captions/transcripts - // 3. Convert video description to readable format - - const title = `YouTube Video: ${videoId}`; - const content = this.generateVideoContent(videoId, url); - - return Promise.resolve({ + const htmlResult = await fetchHtml(url.href, this.httpConfig); + if (!htmlResult.success) { + this.logger.warn(`YouTube HTML fetch 실패: ${url.href}`); + return { + url: url.href, + title: `YouTube Video: ${videoId}`, + content: this.generateVideoContent(videoId, url), + contentType: 'text/html', + }; + } + const domResult = createDom(htmlResult.data, this.domConfig); + if (!domResult.success) { + this.logger.warn(`YouTube DOM 파싱 실패: ${url.href}`); + return { + url: url.href, + title: `YouTube Video: ${videoId}`, + content: this.generateVideoContent(videoId, url), + contentType: 'text/html', + }; + } + const document = domResult.data.window.document; + const title = this.extractTitleFromDom(document) ?? `YouTube Video: ${videoId}`; + const description = this.extractDescriptionFromDom(document); + const captions = this.extractCaptionsFromDom(); + const content = this.generateVideoContent(videoId, url, title, description, captions); + return { url: url.href, title, content, contentType: 'text/html', - }); + }; } catch (error) { this.logger.warn(`YoutubeHandler failed for ${url.href}: ${(error as Error).message}`); - return Promise.resolve(null); + return null; } } /** - * Extracts the video ID from various YouTube URL formats. - * @param url - The YouTube URL. - * @returns The video ID or null if not found. 
+ * 다양한 YouTube URL에서 videoId 추출 + * @param url YouTube URL */ private extractVideoId(url: URL): string | null { - // For youtu.be format if (url.hostname === 'youtu.be') { return url.pathname.slice(1); } - - // For youtube.com formats if (url.searchParams.has('v')) { return url.searchParams.get('v'); } - - // For embed URLs const embedMatch = url.pathname.match(/\/embed\/([^/?]+)/); if (embedMatch) { return embedMatch[1]; } - - // For /v/ URLs const vMatch = url.pathname.match(/\/v\/([^/?]+)/); if (vMatch) { return vMatch[1]; } - return null; } /** - * Generates readable content for a YouTube video. - * @param videoId - The YouTube video ID. - * @param originalUrl - The original URL. - * @returns HTML content representing the video. + * YouTube HTML에서 타이틀 추출 + */ + private extractTitleFromDom(document: Document): string | undefined { + const ogTitle = document.querySelector('meta[property="og:title"]')?.getAttribute('content'); + if (ogTitle) return ogTitle; + const titleTag = document.querySelector('title')?.textContent; + if (titleTag) return titleTag.replace(/ - YouTube$/, '').trim(); + return undefined; + } + + /** + * YouTube HTML에서 설명 추출 + */ + private extractDescriptionFromDom(document: Document): string | undefined { + const ogDesc = document.querySelector('meta[property="og:description"]')?.getAttribute('content'); + if (ogDesc) return ogDesc; + const descTag = document.querySelector('meta[name="description"]')?.getAttribute('content'); + if (descTag) return descTag; + return undefined; + } + + /** + * YouTube HTML에서 자막(캡션) 추출 (가능한 경우, 기본은 미지원) + * 실제 자막은 클라이언트 JS로 동적으로 로드되므로, 일반적으로는 추출 불가. (향후 개선 가능) */ - private generateVideoContent(videoId: string, originalUrl: URL): string { + private extractCaptionsFromDom(): string | undefined { + // HTML 내에서 자막 텍스트가 노출되는 경우는 거의 없음. 
(향후 개선 필요) + return undefined; + } + + /** + * videoId 기반 HTML 콘텐츠 생성 (타이틀, 설명, 자막 포함) + * @param videoId YouTube video ID + * @param originalUrl 원본 URL + * @param title 동영상 제목 + * @param description 동영상 설명 + * @param captions 자막(XML) + */ + private generateVideoContent( + videoId: string, + originalUrl: URL, + title?: string, + description?: string, + captions?: string, + ): string { const watchUrl = `https://www.youtube.com/watch?v=${videoId}`; const embedUrl = `https://www.youtube.com/embed/${videoId}`; - - // Extract timestamp if present const timestamp = originalUrl.searchParams.get('t'); const timestampText = timestamp ? ` (starting at ${timestamp})` : ''; - return `
-

YouTube Video

+

${title ?? 'YouTube Video'}

Video ID: ${videoId}

Watch URL: ${watchUrl}${timestampText}

Embed URL: ${embedUrl}

- + ${description ? `
설명:
${description.replace(/\n/g, '
')}
` : ''}
-

Note: This is a YouTube video. To get the full content including transcripts, - additional processing would be required using YouTube's API or transcript extraction tools.

+

Note: This is a YouTube video. To get the full content including transcripts, additional processing would be required using YouTube's API or transcript extraction tools.

- - + + ${captions ? `
자막 보기
${captions}
` : ''}
`.trim(); } diff --git a/src/modules/pre-handler/pre-handler.module.ts b/src/modules/pre-handler/pre-handler.module.ts index 966beb7..f6be292 100644 --- a/src/modules/pre-handler/pre-handler.module.ts +++ b/src/modules/pre-handler/pre-handler.module.ts @@ -1,7 +1,7 @@ import { Module } from '@nestjs/common'; -import { PreHandlerService } from './pre-handler.service'; +import { RefactoredPreHandlerService } from './refactored-pre-handler.service'; import { ReadabilityHandler } from './handlers/readability.handler'; -import { CONTENT_HANDLER_TOKEN, IContentHandler } from './interfaces/content-handler.interface'; +import { HandlerFactory } from './factories/handler-factory'; import { DomainSpecificHandler } from './handlers/domain-specific.handler'; import { PdfHandler } from './handlers/pdf.handler'; import { RssHandler } from './handlers/rss.handler'; @@ -11,27 +11,6 @@ import { NewsSiteHandler } from './handlers/news-site.handler'; import { StibeeHandler } from './handlers/stibee.handler'; import { MailyHandler } from './handlers/maily.handler'; -// --- Register all handlers here --- -// The order is important: more specific handlers should come first. -// 1. File type handlers (PDF, RSS) - most specific -// 2. Platform-specific handlers (YouTube) - very specific -// 3. Social media handlers - moderately specific -// 4. News site handlers - moderately specific -// 5. Newsletter platform handlers (Stibee, Maily) - moderately specific -// 6. Domain transformation handlers - general transformations -// 7. General readability handler - fallback for everything else -const handlers = [ - PdfHandler, - RssHandler, - YoutubeHandler, - SocialMediaHandler, - NewsSiteHandler, - StibeeHandler, - MailyHandler, - DomainSpecificHandler, - ReadabilityHandler, -]; - /** * Encapsulates all content pre-handling logic. * It provides the PreHandlerService and registers all available content handlers. 
@@ -50,17 +29,18 @@ const handlers = [ */ @Module({ providers: [ - PreHandlerService, - ...handlers, - { - provide: CONTENT_HANDLER_TOKEN, - // The useFactory provider collects all registered handlers and makes them - // available for injection as an array. To add a new handler, - // simply add it to the `handlers` array above and the `inject` array below. - useFactory: (...injectedHandlers: IContentHandler[]): IContentHandler[] => injectedHandlers, - inject: handlers, - }, + RefactoredPreHandlerService, + HandlerFactory, + PdfHandler, + RssHandler, + YoutubeHandler, + SocialMediaHandler, + NewsSiteHandler, + StibeeHandler, + MailyHandler, + DomainSpecificHandler, + ReadabilityHandler, ], - exports: [PreHandlerService], + exports: [RefactoredPreHandlerService], }) export class PreHandlerModule {} diff --git a/src/modules/pre-handler/pre-handler.service.ts b/src/modules/pre-handler/pre-handler.service.ts deleted file mode 100644 index 7659c42..0000000 --- a/src/modules/pre-handler/pre-handler.service.ts +++ /dev/null @@ -1,68 +0,0 @@ -import { Inject, Injectable, Logger } from '@nestjs/common'; -import { CONTENT_HANDLER_TOKEN, IContentHandler } from './interfaces/content-handler.interface'; -import { PreHandleResult } from './dto/pre-handle-result.dto'; - -/** - * Orchestrates the content pre-handling process by iterating through a chain of registered handlers. - */ -@Injectable() -export class PreHandlerService { - private readonly logger = new Logger(PreHandlerService.name); - - /** - * Injects all services that are provided with the `CONTENT_HANDLER_TOKEN`. - * @param handlers - An array of `IContentHandler` implementations. - */ - constructor( - @Inject(CONTENT_HANDLER_TOKEN) - private readonly handlers: IContentHandler[], - ) {} - - /** - * Executes the handler chain for a given URL. - * It tries handlers one by one until one successfully returns a result with content. - * @param urlString - The URL to process. - * @returns A `PreHandleResult`. 
If all handlers fail, it returns the original URL. - */ - public async execute(urlString: string): Promise { - let currentUrl = new URL(urlString); - const finalResult: PreHandleResult = { url: urlString }; - - this.logger.debug(`Starting pre-handler execution for: ${urlString}`); - - for (const handler of this.handlers) { - if (handler.canHandle(currentUrl)) { - this.logger.debug(`Handler ${handler.constructor.name} can handle ${currentUrl.href}`); - const result = await handler.handle(currentUrl); - - if (result) { - // URL이 핸들러에 의해 변경되었는지 확인하고 업데이트합니다. - if (result.url && result.url !== currentUrl.href) { - currentUrl = new URL(result.url); - finalResult.url = result.url; - this.logger.log(`URL transformed by ${handler.constructor.name}: ${urlString} → ${result.url}`); - } - - // 타이틀이 있으면 설정합니다. - if (result.title) { - finalResult.title = result.title; - this.logger.log(`Title extracted by ${handler.constructor.name}: ${result.title}`); - } - - // 콘텐츠가 성공적으로 추출되면 즉시 반환합니다. - if (result.content) { - this.logger.log(`Content extracted by ${handler.constructor.name}`); - finalResult.content = result.content; - finalResult.contentType = result.contentType; - return finalResult; - } - } - } - } - - this.logger.debug( - `Pre-handler execution completed. Final result: url=${finalResult.url}, title=${finalResult.title}`, - ); - return finalResult; // 콘텐츠가 없더라도, 변환된 URL이 포함될 수 있는 최종 결과를 반환합니다. 
- } -} diff --git a/src/modules/pre-handler/refactored-pre-handler.service.ts b/src/modules/pre-handler/refactored-pre-handler.service.ts new file mode 100644 index 0000000..f3ec000 --- /dev/null +++ b/src/modules/pre-handler/refactored-pre-handler.service.ts @@ -0,0 +1,89 @@ +/** + * 리팩토링된 PreHandlerService + * - HandlerFactory 기반 함수형/전략 패턴 적용 + */ +import { Injectable, Logger } from '@nestjs/common'; +import { PreHandleResult } from './dto/pre-handle-result.dto'; +import { HandlerFactory } from './factories/handler-factory'; +import { IContentHandler } from './interfaces/content-handler.interface'; + +/** + * 리팩토링된 PreHandlerService + */ +@Injectable() +export class RefactoredPreHandlerService { + private readonly logger = new Logger(RefactoredPreHandlerService.name); + + constructor(private readonly handlerFactory: HandlerFactory) {} + + /** + * 핸들러 체인을 순차적으로 실행하여 결과를 반환합니다. + * @param urlString 처리할 URL 문자열 + * @returns PreHandleResult + */ + public async execute(urlString: string): Promise { + const handlers = this.handlerFactory.getAllHandlers(); + const currentUrl = new URL(urlString); + this.logger.debug(`리팩토링된 pre-handler 실행 시작: ${urlString}`); + const result = await this.executeHandlerChain(handlers, currentUrl, { + url: urlString, + title: undefined, + content: undefined, + contentType: undefined, + }); + this.logger.debug(`리팩토링된 pre-handler 실행 완료. 
최종 결과: url=${result.url}, title=${result.title}`); + return result; + } + + /** + * 핸들러 체인을 순차적으로 실행 (재귀적 함수형 접근) + * @param handlers 핸들러 배열 + * @param currentUrl 현재 URL + * @param accumulatedResult 누적 결과 + * @returns PreHandleResult + */ + private async executeHandlerChain( + handlers: IContentHandler[], + currentUrl: URL, + accumulatedResult: PreHandleResult, + ): Promise { + if (handlers.length === 0) { + return accumulatedResult; + } + const [currentHandler, ...remainingHandlers] = handlers; + if (!currentHandler.canHandle(currentUrl)) { + return this.executeHandlerChain(remainingHandlers, currentUrl, accumulatedResult); + } + this.logger.debug(`핸들러 ${currentHandler.constructor.name}가 ${currentUrl.href} 처리`); + try { + const result = await currentHandler.handle(currentUrl); + if (!result) { + return this.executeHandlerChain(remainingHandlers, currentUrl, accumulatedResult); + } + const updatedResult = this.updateAccumulatedResult(accumulatedResult, result); + if (result.content) { + this.logger.log(`핸들러 ${currentHandler.constructor.name}가 콘텐츠 추출 성공`); + return updatedResult; + } + return this.executeHandlerChain(remainingHandlers, currentUrl, updatedResult); + } catch (error) { + this.logger.warn(`핸들러 ${currentHandler.constructor.name} 처리 실패: ${(error as Error).message}`); + return this.executeHandlerChain(remainingHandlers, currentUrl, accumulatedResult); + } + } + + /** + * 누적 결과를 업데이트합니다. 
+ * @param accumulated 기존 결과 + * @param newResult 새 결과 + * @returns 병합된 결과 + */ + private updateAccumulatedResult(accumulated: PreHandleResult, newResult: PreHandleResult): PreHandleResult { + return { + url: newResult.url || accumulated.url, + title: newResult.title || accumulated.title, + content: newResult.content || accumulated.content, + contentType: newResult.contentType || accumulated.contentType, + }; + } +} diff --git a/src/modules/pre-handler/types/content-extraction.types.ts b/src/modules/pre-handler/types/content-extraction.types.ts new file mode 100644 index 0000000..91acf34 --- /dev/null +++ b/src/modules/pre-handler/types/content-extraction.types.ts @@ -0,0 +1,104 @@ +/** + * 콘텐츠 추출 및 정제 관련 타입 정의 + * 함수형 프로그래밍 및 SOLID 원칙 기반 + */ + +/** + * HTTP 요청 설정 + */ +export interface HttpRequestConfig { + /** User-Agent 문자열 */ + readonly userAgent: string; + /** 요청 타임아웃(ms) */ + readonly timeout: number; + /** 추가 헤더 */ + readonly headers: Record; + /** 리다이렉트 정책 */ + readonly redirect: RequestRedirect; +} + +/** + * DOM 생성 설정 + */ +export interface DomConfig { + /** User-Agent 문자열 */ + readonly userAgent: string; + /** 리소스 사용 여부 */ + readonly resources: 'usable' | 'unusable'; + /** 스크립트 실행 정책 */ + readonly runScripts: 'dangerously' | 'outside-only'; + /** 시각적 환경 시뮬레이션 */ + readonly pretendToBeVisual: boolean; +} + +/** + * 콘텐츠 정제 설정 + */ +export interface ContentCleaningConfig { + /** 불필요한 요소 제거 여부 */ + readonly removeUnwantedElements: boolean; + /** 스타일 정리 여부 */ + readonly cleanupStyles: boolean; + /** 링크 정리 여부 */ + readonly cleanupLinks: boolean; + /** 이미지 정리 여부 */ + readonly cleanupImages: boolean; + /** 텍스트 정리 여부 */ + readonly cleanupText: boolean; + /** 제목 정제 여부 */ + readonly refineTitle: boolean; +} + +/** + * 제목 추출 설정 + */ +export interface TitleExtractionConfig { + /** 제목 추출용 셀렉터 목록 */ + readonly selectors: readonly string[]; + /** 제목 정제용 정규표현식 목록 */ + readonly patterns: readonly RegExp[]; + /** 사이트별 추가 정제 패턴 */ + readonly siteSpecificPatterns: Record; 
+} + +/** + * 콘텐츠 추출 결과 + */ +export interface ContentExtractionResult { + /** 추출된 제목 */ + readonly title?: string; + /** 추출된 본문(HTML) */ + readonly content?: string; + /** 콘텐츠 타입 */ + readonly contentType: string; + /** 최종 URL */ + readonly url: string; +} + +/** + * 정제 파이프라인 단계 + */ +export type CleaningStage = + | 'remove-unwanted-elements' + | 'cleanup-styles' + | 'cleanup-links' + | 'cleanup-images' + | 'cleanup-text' + | 'refine-title'; + +/** + * 정제 함수 타입 + */ +export type CleaningFunction = (element: T, context: CleaningContext) => T; + +/** + * 정제 컨텍스트 + */ +export interface CleaningContext { + /** 기준 URL */ + readonly baseUrl: string; + /** 정제 설정 */ + readonly config: ContentCleaningConfig; + /** 로거 */ + readonly logger: any; +} diff --git a/src/modules/pre-handler/utils/content-cleaning-pipeline.ts b/src/modules/pre-handler/utils/content-cleaning-pipeline.ts new file mode 100644 index 0000000..cd37444 --- /dev/null +++ b/src/modules/pre-handler/utils/content-cleaning-pipeline.ts @@ -0,0 +1,215 @@ +/** + * 콘텐츠 정제 파이프라인 함수 모음 + * - removeUnwantedElements, cleanupStyles, cleanupLinks, cleanupImages, cleanupText, refineTitle + * - createContentCleaningPipeline + */ +import { CleaningFunction, ContentCleaningConfig, CleaningContext } from '../types/content-extraction.types'; +import { compose } from './functional-utils'; + +/** + * 불필요한 요소를 제거합니다. 
+ * @param element 정제할 요소 + * @param context 정제 컨텍스트 (logger만 사용) + * @returns 정제된 요소 + */ +export const removeUnwantedElements: CleaningFunction = (element, context: CleaningContext) => { + const unwantedSelectors = [ + 'nav', + '.nav', + '.navigation', + '.menu', + '.navbar', + 'header', + '.header', + '.site-header', + 'footer', + '.footer', + '.site-footer', + 'aside', + '.sidebar', + '.side-bar', + '.widget', + '.ad', + '.advertisement', + '.ads', + '[class*="ad-"]', + '[id*="ad-"]', + '[class*="banner"]', + '[id*="banner"]', + '.social', + '.share', + '.social-share', + '[class*="social"]', + '[id*="social"]', + '.comment', + '.comments', + '#comments', + '[class*="comment"]', + '[id*="comment"]', + '.breadcrumb', + '.breadcrumbs', + '.pagination', + '.pager', + '.author-bio', + '.author-info', + '.newsletter-signup', + '.subscribe', + '.cookie-notice', + '.privacy-notice', + '.back-to-top', + '.scroll-top', + '.login', + '.auth', + '.signup', + 'script', + 'style', + 'noscript', + '[style*="display: none"]', + '[style*="display:none"]', + '.hidden', + '.invisible', + '[aria-hidden="true"]', + 'iframe[src*="tracking"]', + 'iframe[src*="analytics"]', + 'iframe[src*="google-analytics"]', + 'iframe[src*="facebook"]', + '[class*="tracking"]', + '[id*="tracking"]', + '[class*="analytics"]', + '[id*="analytics"]', + ]; + unwantedSelectors.forEach((selector) => { + try { + const elements = element.querySelectorAll(selector); + elements.forEach((el) => el.remove()); + } catch { + (context.logger as { debug?: (msg: string) => void })?.debug?.(`Invalid selector: ${selector}`); + } + }); + return element; +}; + +/** + * 인라인 스타일을 정리합니다. 
+ */ +export const cleanupStyles: CleaningFunction = (element) => { + const elementsWithStyle = element.querySelectorAll('[style]'); + elementsWithStyle.forEach((el) => { + const style = el.getAttribute('style'); + if (style) { + const keepStyles = style.match( + /(font-size|color|background-color|text-align|line-height|margin|padding|border):[^;]+;?/g, + ); + if (keepStyles) { + el.setAttribute('style', keepStyles.join(' ')); + } else { + el.removeAttribute('style'); + } + } + }); + return element; +}; + +/** + * 링크를 정리합니다. + */ +export const cleanupLinks: CleaningFunction = (element, context) => { + const links = element.querySelectorAll('a[href]'); + links.forEach((link) => { + const href = link.getAttribute('href'); + if (href) { + try { + const absoluteUrl = new URL(href, context.baseUrl).href; + link.setAttribute('href', absoluteUrl); + link.setAttribute('target', '_blank'); + link.setAttribute('rel', 'noopener noreferrer'); + } catch { + link.removeAttribute('href'); + } + } + }); + return element; +}; + +/** + * 이미지를 정리합니다. 
+ */ +export const cleanupImages: CleaningFunction = (element, context) => { + const images = element.querySelectorAll('img'); + images.forEach((img) => { + const width = img.getAttribute('width'); + const height = img.getAttribute('height'); + if (width && height) { + const w = parseInt(width); + const h = parseInt(height); + if (w < 50 || h < 50) { + img.remove(); + return; + } + } + const dataSrc = img.getAttribute('data-src'); + if (dataSrc && !img.src) { + img.src = dataSrc; + } + if (context.baseUrl && img.src && img.src.startsWith('/')) { + try { + img.src = new URL(img.src, context.baseUrl).href; + } catch { + img.remove(); + return; + } + } + if (img.src && img.src.startsWith('//')) { + img.src = 'https:' + img.src; + } + (img as HTMLElement).style.maxWidth = '100%'; + (img as HTMLElement).style.height = 'auto'; + img.removeAttribute('data-src'); + img.removeAttribute('loading'); + img.removeAttribute('srcset'); + }); + return element; +}; + +/** + * 텍스트를 정리합니다. + */ +export const cleanupText: CleaningFunction = (element) => { + const emptyElements = element.querySelectorAll('p, div, span'); + emptyElements.forEach((el) => { + if (!el.textContent?.trim() && !el.querySelector('img')) { + el.remove(); + } + }); + const textNodes = element.ownerDocument?.evaluate( + './/text()', + element, + null, + XPathResult.UNORDERED_NODE_SNAPSHOT_TYPE, + null, + ); + if (textNodes) { + for (let i = 0; i < textNodes.snapshotLength; i++) { + const node = textNodes.snapshotItem(i); + if (node && node.textContent) { + node.textContent = node.textContent.replace(/\s+/g, ' '); + } + } + } + return element; +}; + +/** + * 콘텐츠 정제 파이프라인을 생성합니다. 
+ * @param config 정제 설정 + * @returns 합성된 정제 함수 + */ +export const createContentCleaningPipeline = (config: ContentCleaningConfig): CleaningFunction => { + const stages: CleaningFunction[] = []; + if (config.removeUnwantedElements) stages.push(removeUnwantedElements); + if (config.cleanupStyles) stages.push(cleanupStyles); + if (config.cleanupLinks) stages.push(cleanupLinks); + if (config.cleanupImages) stages.push(cleanupImages); + if (config.cleanupText) stages.push(cleanupText); + return compose(...stages); +}; diff --git a/src/modules/pre-handler/utils/functional-utils.ts b/src/modules/pre-handler/utils/functional-utils.ts new file mode 100644 index 0000000..33b78be --- /dev/null +++ b/src/modules/pre-handler/utils/functional-utils.ts @@ -0,0 +1,254 @@ +/** + * 함수형 프로그래밍 기반 공통 유틸리티 함수 모음 + * - Result/Option 타입 + * - fetchHtml, createDom, createDomFromUrl 등 + * - map/flatMap/compose 등 고차 함수 + * - extractTitle, findContentElement 등 + */ +import { JSDOM } from 'jsdom'; +import { HttpRequestConfig, DomConfig } from '../types/content-extraction.types'; + +/** + * Result 타입 - 성공/실패를 명시적으로 표현 + */ +export type Result = + | { readonly success: true; readonly data: T } + | { readonly success: false; readonly error: E }; + +/** + * Option 타입 - 값이 있을 수도 없을 수도 있음을 명시적으로 표현 + */ +export type Option = T | null | undefined; + +/** + * HTTP 요청을 통해 HTML 문자열을 가져옵니다. 
+ * @param url 요청할 URL + * @param config HTTP 요청 설정 + * @returns HTML 문자열 또는 에러 + */ +export const fetchHtml = async (url: string, config: HttpRequestConfig): Promise> => { + try { + const controller = new AbortController(); + const timeoutId = setTimeout(() => controller.abort(), config.timeout); + + const response = await fetch(url, { + headers: config.headers, + redirect: config.redirect, + signal: controller.signal, + }); + + clearTimeout(timeoutId); + + if (!response.ok) { + return { + success: false, + error: new Error(`HTTP ${response.status}: ${response.statusText}`), + }; + } + + const html = await response.text(); + return { success: true, data: html }; + } catch (error) { + return { + success: false, + error: error instanceof Error ? error : new Error(String(error)), + }; + } +}; + +/** + * HTML 문자열로부터 JSDOM 인스턴스를 생성합니다. + * @param html HTML 문자열 + * @param config DOM 생성 설정 + * @returns JSDOM 인스턴스 또는 에러 + */ +export const createDom = (html: string, config: DomConfig): Result => { + try { + const dom = new JSDOM(html, { + userAgent: config.userAgent, + resources: config.resources === 'usable' ? 'usable' : undefined, + runScripts: config.runScripts, + pretendToBeVisual: config.pretendToBeVisual, + }); + const window = dom.window; + + // matchMedia 모킹 + if (!window.matchMedia) { + window.matchMedia = () => ({ + matches: false, + media: '', + onchange: null, + addListener: () => {}, + removeListener: () => {}, + addEventListener: () => {}, + removeEventListener: () => {}, + dispatchEvent: () => false, + }); + } + + // fetch 모킹 (이미 Node.js에 있지만 안전을 위해) + if (!window.fetch) { + window.fetch = global.fetch; + } + + // elementFromPoint 모킹 (JSDOM 미구현 방지) + if (typeof window.document.elementFromPoint !== 'function') { + window.document.elementFromPoint = () => null; + } + + return { success: true, data: dom }; + } catch (error) { + return { + success: false, + error: error instanceof Error ? 
error : new Error(String(error)), + }; + } +}; + +/** + * URL로부터 JSDOM 인스턴스를 생성합니다. + * @param url HTML을 가져올 URL + * @param config DOM 생성 설정 + * @returns JSDOM 인스턴스 또는 에러 + */ +export const createDomFromUrl = async (url: string, config: DomConfig): Promise> => { + try { + const dom = await JSDOM.fromURL(url, { + userAgent: config.userAgent, + resources: config.resources === 'usable' ? 'usable' : undefined, + runScripts: config.runScripts, + pretendToBeVisual: config.pretendToBeVisual, + }); + return { success: true, data: dom }; + } catch (error) { + return { + success: false, + error: error instanceof Error ? error : new Error(String(error)), + }; + } +}; + +/** + * Result 타입을 변환하는 고차 함수 + * @param result 원본 Result + * @param fn 변환 함수 + * @returns 변환된 Result + */ +export const mapResult = (result: Result, fn: (data: T) => U): Result => { + if (result.success) { + return { success: true, data: fn(result.data) }; + } + return result; +}; + +/** + * Result 타입의 에러를 변환하는 고차 함수 + * @param result 원본 Result + * @param fn 에러 변환 함수 + * @returns 변환된 Result + */ +export const mapError = (result: Result, fn: (error: E) => F): Result => { + if (!result.success) { + return { success: false, error: fn(result.error) }; + } + return result; +}; + +/** + * Result 타입을 flatMap하는 고차 함수 + * @param result 원본 Result + * @param fn 변환 함수 + * @returns 변환된 Result + */ +export const flatMapResult = (result: Result, fn: (data: T) => Result): Result => { + if (result.success) { + return fn(result.data); + } + return result; +}; + +/** + * Option 타입을 변환하는 고차 함수 + * @param option 원본 Option + * @param fn 변환 함수 + * @returns 변환된 Option + */ +export const mapOption = (option: Option, fn: (value: T) => U): Option => { + return option != null ? fn(option) : null; +}; + +/** + * Option 타입을 flatMap하는 고차 함수 + * @param option 원본 Option + * @param fn 변환 함수 + * @returns 변환된 Option + */ +export const flatMapOption = (option: Option, fn: (value: T) => Option): Option => { + return option != null ? 
fn(option) : null; +}; + +/** + * 여러 셀렉터와 정규표현식을 이용해 제목을 추출합니다. + * @param document DOM 문서 + * @param selectors 셀렉터 목록 + * @param patterns 정제용 정규표현식 목록 + * @returns 추출된 제목 또는 null + */ +export const extractTitle = ( + document: Document, + selectors: readonly string[], + patterns: readonly RegExp[], +): Option => { + for (const selector of selectors) { + const element = document.querySelector(selector); + if (element) { + const title = element.getAttribute('content') || element.textContent; + if (title?.trim()) { + return patterns.reduce((cleanedTitle, pattern) => cleanedTitle.replace(pattern, ''), title.trim()); + } + } + } + return null; +}; + +/** + * 여러 셀렉터를 이용해 본문 콘텐츠 요소를 찾습니다. + * @param document DOM 문서 + * @param selectors 셀렉터 목록 + * @param minTextLength 최소 텍스트 길이 + * @returns 콘텐츠 요소 또는 null + */ +export const findContentElement = ( + document: Document, + selectors: readonly string[], + minTextLength: number = 100, +): Option => { + for (const selector of selectors) { + const element = document.querySelector(selector); + if (element?.textContent && element.textContent.trim().length > minTextLength) { + return element; + } + } + return document.body || null; +}; + +/** + * 여러 정제 함수를 순차적으로 적용하는 함수 합성 + * @param fns 정제 함수 목록 + * @returns 합성된 정제 함수 + */ +export const compose = (...fns: ((arg: T, ctx: any) => T)[]): ((arg: T, ctx: any) => T) => { + return (element: T, context: any): T => { + return fns.reduce((acc, fn) => fn(acc, context), element); + }; +}; + +/** + * 부분 적용 함수 + * @param fn 원본 함수 + * @param first 첫 번째 인자 + * @returns 두 번째 인자만 받는 함수 + */ +export const partial = (fn: (a: T, b: U) => V, first: T): ((second: U) => V) => { + return (second: U) => fn(first, second); +}; diff --git a/src/modules/scraper/scraper.controller.ts b/src/modules/scraper/scraper.controller.ts index 898f873..b414f9b 100644 --- a/src/modules/scraper/scraper.controller.ts +++ b/src/modules/scraper/scraper.controller.ts @@ -17,7 +17,7 @@ import { AuthRequest } from 'src/types'; 
import { PuppeteerParseService, FetchContentWithSaveInput } from './services/puppeteer-parse.service'; import { FetchContentInput } from './dto/fetch-content.input'; import { ScrapedContentOutput } from './dto/scraped-content.output'; -import { PreHandlerService } from '../pre-handler/pre-handler.service'; +import { RefactoredPreHandlerService } from '../pre-handler/refactored-pre-handler.service'; import { PreHandleResult } from '../pre-handler/dto/pre-handle-result.dto'; @ApiTags('scraper') @@ -25,7 +25,7 @@ import { PreHandleResult } from '../pre-handler/dto/pre-handle-result.dto'; export class ScraperController { constructor( private readonly puppeteerParseService: PuppeteerParseService, - private readonly preHandlerService: PreHandlerService, + private readonly preHandlerService: RefactoredPreHandlerService, ) {} /** diff --git a/src/modules/scraper/services/puppeteer-parse.service.ts b/src/modules/scraper/services/puppeteer-parse.service.ts index 9184274..7fa84cf 100644 --- a/src/modules/scraper/services/puppeteer-parse.service.ts +++ b/src/modules/scraper/services/puppeteer-parse.service.ts @@ -6,7 +6,7 @@ import { BrowserService } from './browser.service'; import { FetchContentInput } from '../dto/fetch-content.input'; import { ScrapedContentOutput } from '../dto/scraped-content.output'; import { InvalidUrlException } from '../exceptions/invalid-url.exception'; -import { PreHandlerService } from '../../pre-handler/pre-handler.service'; +import { RefactoredPreHandlerService } from '../../pre-handler/refactored-pre-handler.service'; import { ArticleService } from '../../article/services/article.service'; // ---------------------- CONSTANTS ---------------------- @@ -44,7 +44,7 @@ export class PuppeteerParseService { constructor( private readonly browserService: BrowserService, - private readonly preHandlerService: PreHandlerService, + private readonly preHandlerService: RefactoredPreHandlerService, private readonly articleService: ArticleService, ) {} @@ 
-304,7 +304,7 @@ export class PuppeteerParseService { * @param url - 원본 URL (상대 링크 처리용) * @returns 정제된 콘텐츠 또는 원본 HTML (실패 시) */ - private async applyReadabilityToHtml(html: string, url: string): Promise { + private applyReadabilityToHtml(html: string, url: string): Promise { try { const dom = new JSDOM(html, { url }); const reader = new Readability(dom.window.document); @@ -312,7 +312,7 @@ export class PuppeteerParseService { if (article?.content) { this.logger.log(`Successfully extracted readable content from HTML (${article.content.length} chars)`); - return article.content; + return Promise.resolve(article.content); } else { this.logger.warn(`Readability failed to extract content from HTML, using original`); } @@ -321,6 +321,6 @@ export class PuppeteerParseService { } // 실패 시 원본 반환 - return html; + return Promise.resolve(html); } } From 3bd374d51d1b4c9132679a0f721c586d5787f0c0 Mon Sep 17 00:00:00 2001 From: reach0908 Date: Sat, 5 Jul 2025 13:59:28 +0900 Subject: [PATCH 13/28] =?UTF-8?q?feat(pre-handler):=20=EC=83=88=EB=A1=9C?= =?UTF-8?q?=EC=9A=B4=20=ED=95=B8=EB=93=A4=EB=9F=AC=20=EC=B6=94=EA=B0=80=20?= =?UTF-8?q?=EB=B0=8F=20=EA=B8=B0=EC=A1=B4=20=ED=95=B8=EB=93=A4=EB=9F=AC=20?= =?UTF-8?q?=EA=B0=9C=EC=84=A0?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * TistoryHandler, MediumHandler, NaverBlogHandler, DisquietHandler를 추가하여 다양한 블로그 플랫폼의 콘텐츠를 처리할 수 있도록 했습니다. * 기존 DomainSpecificHandler를 개선하여 특정 도메인에 대한 URL 변환 로직을 최적화했습니다. * 핸들러의 HTTP 요청 및 DOM 생성 설정을 통일하여 일관성을 높였습니다. * 콘텐츠 추출 과정에서 발생할 수 있는 에러를 명확히 구분하기 위해 커스텀 에러 클래스를 추가했습니다. * 콘텐츠 품질 평가 기능을 추가하여 스크래핑 시 품질 기준을 설정하고, Puppeteer를 통한 대체 로직을 개선했습니다. 
--- package-lock.json | 181 +++++++- package.json | 2 + .../base/abstract-content-handler.ts | 16 +- .../pre-handler/factories/handler-factory.ts | 20 +- .../pre-handler/handlers/disquiet.handler.ts | 361 +++++++++++++++ .../handlers/domain-specific.handler.ts | 436 +----------------- .../pre-handler/handlers/medium.handler.ts | 91 ++++ .../handlers/naver-blog.handler.ts | 108 +++++ .../pre-handler/handlers/pdf.handler.ts | 12 +- .../handlers/readability.handler.ts | 127 ++--- .../pre-handler/handlers/rss.handler.ts | 8 +- .../pre-handler/handlers/tistory.handler.ts | 134 ++++++ src/modules/pre-handler/pre-handler.module.ts | 15 +- ...dler.service.ts => pre-handler.service.ts} | 6 + .../utils/content-cleaning-pipeline.ts | 64 +++ .../pre-handler/utils/functional-utils.ts | 154 +++---- src/modules/scraper/scraper.controller.ts | 2 +- src/modules/scraper/scraper.module.ts | 3 +- .../services/content-quality-evaluator.ts | 62 +++ .../services/puppeteer-parse.service.ts | 80 +++- 20 files changed, 1284 insertions(+), 598 deletions(-) create mode 100644 src/modules/pre-handler/handlers/disquiet.handler.ts create mode 100644 src/modules/pre-handler/handlers/medium.handler.ts create mode 100644 src/modules/pre-handler/handlers/naver-blog.handler.ts create mode 100644 src/modules/pre-handler/handlers/tistory.handler.ts rename src/modules/pre-handler/{refactored-pre-handler.service.ts => pre-handler.service.ts} (92%) create mode 100644 src/modules/scraper/services/content-quality-evaluator.ts diff --git a/package-lock.json b/package-lock.json index 850613b..626fd5c 100644 --- a/package-lock.json +++ b/package-lock.json @@ -22,6 +22,7 @@ "class-transformer": "^0.5.1", "class-validator": "^0.14.2", "cookie-parser": "^1.4.7", + "es-toolkit": "^1.39.6", "helmet": "^8.1.0", "jsdom": "^26.1.0", "passport": "^0.7.0", @@ -32,6 +33,7 @@ "puppeteer-extra-plugin-stealth": "^2.11.2", "reflect-metadata": "^0.2.2", "rxjs": "^7.8.1", + "sanitize-html": "^2.17.0", "swagger-ui-express": 
"^5.0.1" }, "devDependencies": { @@ -6732,6 +6734,68 @@ "node": "^14.15.0 || ^16.10.0 || >=18.0.0" } }, + "node_modules/dom-serializer": { + "version": "2.0.0", + "resolved": "https://registry.npmjs.org/dom-serializer/-/dom-serializer-2.0.0.tgz", + "integrity": "sha512-wIkAryiqt/nV5EQKqQpo3SToSOV9J0DnbJqwK7Wv/Trc92zIAYZ4FlMu+JPFW1DfGFt81ZTCGgDEabffXeLyJg==", + "dependencies": { + "domelementtype": "^2.3.0", + "domhandler": "^5.0.2", + "entities": "^4.2.0" + }, + "funding": { + "url": "https://github.com/cheeriojs/dom-serializer?sponsor=1" + } + }, + "node_modules/dom-serializer/node_modules/entities": { + "version": "4.5.0", + "resolved": "https://registry.npmjs.org/entities/-/entities-4.5.0.tgz", + "integrity": "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==", + "engines": { + "node": ">=0.12" + }, + "funding": { + "url": "https://github.com/fb55/entities?sponsor=1" + } + }, + "node_modules/domelementtype": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/domelementtype/-/domelementtype-2.3.0.tgz", + "integrity": "sha512-OLETBj6w0OsagBwdXnPdN0cnMfF9opN69co+7ZrbfPGrdpPVNBUj02spi6B1N7wChLQiPn4CSH/zJvXw56gmHw==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/fb55" + } + ] + }, + "node_modules/domhandler": { + "version": "5.0.3", + "resolved": "https://registry.npmjs.org/domhandler/-/domhandler-5.0.3.tgz", + "integrity": "sha512-cgwlv/1iFQiFnU96XXgROh8xTeetsnJiDsTc7TYCLFd9+/WNkIqPTxiM/8pSd8VIrhXGTf1Ny1q1hquVqDJB5w==", + "dependencies": { + "domelementtype": "^2.3.0" + }, + "engines": { + "node": ">= 4" + }, + "funding": { + "url": "https://github.com/fb55/domhandler?sponsor=1" + } + }, + "node_modules/domutils": { + "version": "3.2.2", + "resolved": "https://registry.npmjs.org/domutils/-/domutils-3.2.2.tgz", + "integrity": "sha512-6kZKyUajlDuqlHKVX1w7gyslj9MPIXzIFiz/rGu35uC1wMi+kMhQwGhl4lt9unC9Vb9INnY9Z3/ZA3+FhASLaw==", + "dependencies": { + "dom-serializer": "^2.0.0", + 
"domelementtype": "^2.3.0", + "domhandler": "^5.0.3" + }, + "funding": { + "url": "https://github.com/fb55/domutils?sponsor=1" + } + }, "node_modules/dotenv": { "version": "16.4.7", "resolved": "https://registry.npmjs.org/dotenv/-/dotenv-16.4.7.tgz", @@ -6973,6 +7037,15 @@ "node": ">= 0.4" } }, + "node_modules/es-toolkit": { + "version": "1.39.6", + "resolved": "https://registry.npmjs.org/es-toolkit/-/es-toolkit-1.39.6.tgz", + "integrity": "sha512-uiVjnLem6kkfXumlwUEWEKnwUN5QbSEB0DHy2rNJt0nkYcob5K0TXJ7oJRzhAcvx+SRmz4TahKyN5V9cly/IPA==", + "workspaces": [ + "docs", + "benchmarks" + ] + }, "node_modules/escalade": { "version": "3.2.0", "resolved": "https://registry.npmjs.org/escalade/-/escalade-3.2.0.tgz", @@ -6992,7 +7065,6 @@ "version": "4.0.0", "resolved": "https://registry.npmjs.org/escape-string-regexp/-/escape-string-regexp-4.0.0.tgz", "integrity": "sha512-TtpcNJ3XAzx3Gq8sWRzJaVajRs0uVxA2YAkdb1jm2YkPz4G6egUFAyA3n5vtEIZefPk5Wa4UXbKuS5fKkJWdgA==", - "dev": true, "license": "MIT", "engines": { "node": ">=10" @@ -8271,6 +8343,35 @@ "dev": true, "license": "MIT" }, + "node_modules/htmlparser2": { + "version": "8.0.2", + "resolved": "https://registry.npmjs.org/htmlparser2/-/htmlparser2-8.0.2.tgz", + "integrity": "sha512-GYdjWKDkbRLkZ5geuHs5NY1puJ+PXwP7+fHPRz06Eirsb9ugf6d8kkXav6ADhcODhFFPMIXyxkxSuMf3D6NCFA==", + "funding": [ + "https://github.com/fb55/htmlparser2?sponsor=1", + { + "type": "github", + "url": "https://github.com/sponsors/fb55" + } + ], + "dependencies": { + "domelementtype": "^2.3.0", + "domhandler": "^5.0.3", + "domutils": "^3.0.1", + "entities": "^4.4.0" + } + }, + "node_modules/htmlparser2/node_modules/entities": { + "version": "4.5.0", + "resolved": "https://registry.npmjs.org/entities/-/entities-4.5.0.tgz", + "integrity": "sha512-V0hjH4dGPh9Ao5p0MoRY6BVqtwCjhz6vI5LT8AJ55H+4g9/4vbHx1I54fS0XuclLhDHArPQCiMjDxjaL8fPxhw==", + "engines": { + "node": ">=0.12" + }, + "funding": { + "url": "https://github.com/fb55/entities?sponsor=1" + } + }, 
"node_modules/http-cache-semantics": { "version": "4.2.0", "resolved": "https://registry.npmjs.org/http-cache-semantics/-/http-cache-semantics-4.2.0.tgz", @@ -10241,6 +10342,23 @@ "node": "^18.17.0 || >=20.5.0" } }, + "node_modules/nanoid": { + "version": "3.3.11", + "resolved": "https://registry.npmjs.org/nanoid/-/nanoid-3.3.11.tgz", + "integrity": "sha512-N8SpfPUnUp1bK+PMYW8qSWdl9U+wwNWI4QKxOYDy9JAro3WMX7p2OeVRF9v+347pnakNevPmiHhNmZ2HbFA76w==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "bin": { + "nanoid": "bin/nanoid.cjs" + }, + "engines": { + "node": "^10 || ^12 || ^13.7 || ^14 || >=15.0.1" + } + }, "node_modules/natural-compare": { "version": "1.4.0", "resolved": "https://registry.npmjs.org/natural-compare/-/natural-compare-1.4.0.tgz", @@ -10621,6 +10739,11 @@ "url": "https://github.com/sponsors/sindresorhus" } }, + "node_modules/parse-srcset": { + "version": "1.0.2", + "resolved": "https://registry.npmjs.org/parse-srcset/-/parse-srcset-1.0.2.tgz", + "integrity": "sha512-/2qh0lav6CmI15FzA3i/2Bzk2zCgQhGMkvhOhKNcBVQ1ldgpbfiNTVslmooUmWJcADi1f1kIeynbDRVzNlfR6Q==" + }, "node_modules/parse5": { "version": "7.3.0", "resolved": "https://registry.npmjs.org/parse5/-/parse5-7.3.0.tgz", @@ -10924,6 +11047,33 @@ "node": ">=4" } }, + "node_modules/postcss": { + "version": "8.5.6", + "resolved": "https://registry.npmjs.org/postcss/-/postcss-8.5.6.tgz", + "integrity": "sha512-3Ybi1tAuwAP9s0r1UQ2J4n5Y0G05bJkpUIO0/bI9MhwmD70S5aTWbXGBwxHrelT+XM1k6dM0pk+SwNkpTRN7Pg==", + "funding": [ + { + "type": "opencollective", + "url": "https://opencollective.com/postcss/" + }, + { + "type": "tidelift", + "url": "https://tidelift.com/funding/github/npm/postcss" + }, + { + "type": "github", + "url": "https://github.com/sponsors/ai" + } + ], + "dependencies": { + "nanoid": "^3.3.11", + "picocolors": "^1.1.1", + "source-map-js": "^1.2.1" + }, + "engines": { + "node": "^10 || ^12 || >=14" + } + }, "node_modules/prelude-ls": { "version": "1.2.1", 
"resolved": "https://registry.npmjs.org/prelude-ls/-/prelude-ls-1.2.1.tgz", @@ -11728,6 +11878,27 @@ "integrity": "sha512-YZo3K82SD7Riyi0E1EQPojLz7kpepnSQI9IyPbHHg1XXXevb5dJI7tpyN2ADxGcQbHG7vcyRHk0cbwqcQriUtg==", "license": "MIT" }, + "node_modules/sanitize-html": { + "version": "2.17.0", + "resolved": "https://registry.npmjs.org/sanitize-html/-/sanitize-html-2.17.0.tgz", + "integrity": "sha512-dLAADUSS8rBwhaevT12yCezvioCA+bmUTPH/u57xKPT8d++voeYE6HeluA/bPbQ15TwDBG2ii+QZIEmYx8VdxA==", + "dependencies": { + "deepmerge": "^4.2.2", + "escape-string-regexp": "^4.0.0", + "htmlparser2": "^8.0.0", + "is-plain-object": "^5.0.0", + "parse-srcset": "^1.0.2", + "postcss": "^8.3.11" + } + }, + "node_modules/sanitize-html/node_modules/is-plain-object": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/is-plain-object/-/is-plain-object-5.0.0.tgz", + "integrity": "sha512-VRSzKkbMm5jMDoKLbltAkFQ5Qr7VDiTFGXxYFXXowVj387GeGNOCsOH6Msy00SGZ3Fp84b1Naa1psqgcCIEP5Q==", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/saxes": { "version": "6.0.0", "resolved": "https://registry.npmjs.org/saxes/-/saxes-6.0.0.tgz", @@ -12105,6 +12276,14 @@ "node": ">= 8" } }, + "node_modules/source-map-js": { + "version": "1.2.1", + "resolved": "https://registry.npmjs.org/source-map-js/-/source-map-js-1.2.1.tgz", + "integrity": "sha512-UXWMKhLOwVKb728IUtQPXxfYU+usdybtUrK/8uGE8CQMvrhOpwvzDBwj0QhSL7MQc7vIsISBG8VQ8+IDQxpfQA==", + "engines": { + "node": ">=0.10.0" + } + }, "node_modules/source-map-support": { "version": "0.5.21", "resolved": "https://registry.npmjs.org/source-map-support/-/source-map-support-0.5.21.tgz", diff --git a/package.json b/package.json index 10c5df6..9ddf1f1 100644 --- a/package.json +++ b/package.json @@ -40,6 +40,7 @@ "class-transformer": "^0.5.1", "class-validator": "^0.14.2", "cookie-parser": "^1.4.7", + "es-toolkit": "^1.39.6", "helmet": "^8.1.0", "jsdom": "^26.1.0", "passport": "^0.7.0", @@ -50,6 +51,7 @@ "puppeteer-extra-plugin-stealth": "^2.11.2", 
"reflect-metadata": "^0.2.2", "rxjs": "^7.8.1", + "sanitize-html": "^2.17.0", "swagger-ui-express": "^5.0.1" }, "devDependencies": { diff --git a/src/modules/pre-handler/base/abstract-content-handler.ts b/src/modules/pre-handler/base/abstract-content-handler.ts index 6c412ae..0b83f8a 100644 --- a/src/modules/pre-handler/base/abstract-content-handler.ts +++ b/src/modules/pre-handler/base/abstract-content-handler.ts @@ -96,7 +96,7 @@ export abstract class AbstractContentHandler implements IContentHandler { * @param url 처리할 URL * @returns 추출 결과 Result */ - private async extractContent(url: URL): Promise> { + protected async extractContent(url: URL): Promise> { const htmlResult = await fetchHtml(url.href, this.httpConfig); if (!htmlResult.success) { return { success: false, error: htmlResult.error }; @@ -105,7 +105,7 @@ export abstract class AbstractContentHandler implements IContentHandler { if (!domResult.success) { return { success: false, error: domResult.error }; } - return { success: true, data: this.processDom(domResult.data, url.href) }; + return { success: true, data: await this.processDom(domResult.data, url.href) }; } /** @@ -114,8 +114,13 @@ export abstract class AbstractContentHandler implements IContentHandler { * @param url 기준 URL * @returns 추출 결과 */ - private processDom(dom: JSDOM, url: string): ContentExtractionResult { + private async processDom(dom: JSDOM, url: string): Promise { const document = dom.window.document; + // 동적 콘텐츠 대기 (waitForDynamicContent가 있으면 안전하게 호출) + const maybeWithWait = this as unknown as { waitForDynamicContent?: (doc: Document) => Promise }; + if (typeof maybeWithWait.waitForDynamicContent === 'function') { + await maybeWithWait.waitForDynamicContent(document); + } // 제목 추출 const titleOption: Option = extractTitle( document, @@ -123,9 +128,10 @@ export abstract class AbstractContentHandler implements IContentHandler { this.titleConfig.patterns, ); const title: string | undefined = titleOption == null ? 
undefined : titleOption; - // 콘텐츠 요소 찾기 - const contentElement = findContentElement(document, this.contentSelectors); + // 콘텐츠 요소 찾기 (minTextLength 30, logger 전달) + const contentElement = findContentElement(document, this.contentSelectors, 30, this.logger); if (!contentElement) { + this.logger.debug(`${this.handlerName} 본문 요소를 찾지 못해 body로 fallback`); return { title, contentType: 'text/html', url }; } // 콘텐츠 정제 diff --git a/src/modules/pre-handler/factories/handler-factory.ts b/src/modules/pre-handler/factories/handler-factory.ts index 524d537..5a168b5 100644 --- a/src/modules/pre-handler/factories/handler-factory.ts +++ b/src/modules/pre-handler/factories/handler-factory.ts @@ -11,7 +11,13 @@ import { PdfHandler } from '../handlers/pdf.handler'; import { RssHandler } from '../handlers/rss.handler'; import { YoutubeHandler } from '../handlers/youtube.handler'; import { NewsSiteHandler } from '../handlers/news-site.handler'; +import { TistoryHandler } from '../handlers/tistory.handler'; +import { MediumHandler } from '../handlers/medium.handler'; +import { NaverBlogHandler } from '../handlers/naver-blog.handler'; +import { DomainSpecificHandler } from '../handlers/domain-specific.handler'; +import { SocialMediaHandler } from '../handlers/social-media.handler'; import { ReadabilityHandler } from '../handlers/readability.handler'; +import { DisquietHandler } from '../handlers/disquiet.handler'; // 필요시 다른 핸들러 import /** @@ -28,16 +34,28 @@ export class HandlerFactory { private readonly rssHandler: RssHandler, private readonly youtubeHandler: YoutubeHandler, private readonly newsSiteHandler: NewsSiteHandler, + private readonly tistoryHandler: TistoryHandler, + private readonly mediumHandler: MediumHandler, + private readonly disquietHandler: DisquietHandler, + private readonly naverBlogHandler: NaverBlogHandler, + private readonly domainSpecificHandler: DomainSpecificHandler, + private readonly socialMediaHandler: SocialMediaHandler, private readonly readabilityHandler: 
ReadabilityHandler, // 필요시 다른 핸들러 DI ) { - // 우선순위: 도메인 특화 → 일반 → fallback + // 우선순위: 도메인 특화 → 소셜/뉴스 → 일반 → fallback this.handlerChain = [ this.mailyHandler, this.stibeeHandler, this.pdfHandler, this.rssHandler, this.youtubeHandler, + this.tistoryHandler, + this.naverBlogHandler, + this.mediumHandler, + this.disquietHandler, + this.domainSpecificHandler, + this.socialMediaHandler, this.newsSiteHandler, this.readabilityHandler, // 항상 마지막 fallback ]; diff --git a/src/modules/pre-handler/handlers/disquiet.handler.ts b/src/modules/pre-handler/handlers/disquiet.handler.ts new file mode 100644 index 0000000..a3c5f62 --- /dev/null +++ b/src/modules/pre-handler/handlers/disquiet.handler.ts @@ -0,0 +1,361 @@ +import { Injectable, Logger } from '@nestjs/common'; +import { AbstractContentHandler } from '../base/abstract-content-handler'; +import { + DomConfig, + HttpRequestConfig, + TitleExtractionConfig, + ContentCleaningConfig, + ContentExtractionResult, +} from '../types/content-extraction.types'; +import { JSDOM } from 'jsdom'; +import { fetchHtml, createDom, extractTitle } from '../utils/functional-utils'; +import { createContentCleaningPipeline } from '../utils/content-cleaning-pipeline'; +import { Result } from '../utils/functional-utils'; + +/** + * Disquiet.io 사이트 전용 핸들러 + * + * Disquiet.io는 로그인 유도 팝업이 나타나는 문제가 있어서, + * 특별한 처리가 필요합니다. 
+ * + * 주요 특징: + * - 로그인 팝업 제거 + * - 동적 콘텐츠 로딩 대기 + * - 특정 CSS 선택자로 콘텐츠 추출 + */ +@Injectable() +export class DisquietHandler extends AbstractContentHandler { + protected readonly logger = new Logger(DisquietHandler.name); + + /** + * 핸들러가 처리할 수 있는 URL인지 확인 + * @param url 검사할 URL + * @returns true if can handle + */ + public canHandle(url: URL): boolean { + const result = url.hostname.endsWith('disquiet.io'); + this.logger.debug(`DisquietHandler canHandle: ${url.hostname} -> ${result}`); + return result; + } + + /** + * 핸들러 이름 + */ + protected get handlerName(): string { + return 'DisquietHandler'; + } + + /** + * HTTP 요청 설정 + */ + protected get httpConfig(): HttpRequestConfig { + return { + userAgent: + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', + timeout: 30000, + headers: { + Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8', + 'Accept-Language': 'ko-KR,ko;q=0.9,en;q=0.8', + 'Accept-Encoding': 'gzip, deflate, br', + DNT: '1', + Connection: 'keep-alive', + 'Upgrade-Insecure-Requests': '1', + }, + redirect: 'follow', + }; + } + + /** + * DOM 생성 설정 (스크립트 활성화로 동적 콘텐츠 처리) + */ + protected get domConfig(): DomConfig { + return { + userAgent: this.httpConfig.userAgent, + resources: 'usable', + runScripts: 'dangerously', // 동적 콘텐츠를 위해 스크립트 실행 + pretendToBeVisual: true, + }; + } + + /** + * 콘텐츠 정제 설정 + */ + protected get cleaningConfig(): ContentCleaningConfig { + return { + removeUnwantedElements: true, + cleanupStyles: false, + cleanupLinks: false, + cleanupImages: false, + cleanupText: false, + refineTitle: false, + }; + } + + /** + * 제목 추출 설정 + */ + protected get titleConfig(): TitleExtractionConfig { + return { + selectors: [ + 'h1', + '.post-title', + '.article-title', + '[data-testid="post-title"]', + 'header h1', + 'header .title', + 'meta[property="og:title"]', + '.title.detail-page', + '.title-wrapper .title', + // Disquiet.io 전용 선택자 + 
'[data-testid="makerlog-title"]', + '.makerlog-title', + '.post-header h1', + '.post-header .title', + ], + patterns: [], + siteSpecificPatterns: {}, + }; + } + + /** + * 콘텐츠 선택자 (Disquiet.io 전용으로 최적화) + */ + protected get contentSelectors(): readonly string[] { + return [ + // Disquiet.io 전용 선택자 (우선순위 높음) + '[data-testid="makerlog-content"]', + '[data-testid="post-content"]', + '.makerlog-content', + '.maker-log-detail', + + // 이하 기존 선택자 + '.post-content', + '.article-content', + '.content-wrapper', + '.content-area', + '.post-body', + '.article-body', + '.makerlog-body', + '.post-detail', + '.article-detail', + '.detail-content', + '.main-content', + '.content', + 'article', + 'main', + '.container', + '#content', + '.body', + '.markdown-body', + '.reader-content', + '.entry-content', + '.blog-post', + '.post', + '.detail', + '.detail-page', + // Disquiet.io 특정 클래스 + '.sc-keuYuY', + '.sc-keuYuY.detail-page', + '.title.detail-page', + // 추가 Disquiet.io 선택자 + '[class*="makerlog"]', + '[class*="post-"]', + '[class*="article-"]', + '[class*="content"]', + '[class*="body"]', + ]; + } + + /** + * 로그인 팝업 등 불필요한 요소 제거 (후처리) + * @param dom JSDOM 인스턴스 + */ + protected postProcessDom(dom: Document): void { + // Disquiet.io 전용 제거 요소 (더 구체적으로) + const removeSelectors = [ + // 로그인/인증 관련 + '[data-testid="login-modal"]', + '[data-testid="auth-modal"]', + '.login-modal', + '.auth-modal', + '.modal', + '.popup', + '.Dialog', + '.modal-backdrop', + '.overlay', + '.Dialog-overlay', + '.backdrop', + // 네비게이션/헤더/푸터 + '.header', + '.footer', + '.nav', + '.navigation', + '.menu', + '.sidebar', + // 광고/프로모션 + '.ad', + '.advertisement', + '.promo', + '.sponsor', + '.banner', + // 소셜/공유 + '.share', + '.social', + '.comment', + '.comments', + // 기타 UI 요소 + '.button', + '.actions', + '.toolbar', + '.widget', + '.tool', + '.search', + '.breadcrumb', + '.pagination', + '.page-nav', + // 메타 정보 (제목은 유지) + '.meta', + '.info', + '.date', + '.time', + '.author', + '.tag', + '.category', + '.label', 
+ '.count', + '.view', + '.like', + '.dislike', + '.vote', + '.star', + '.rating', + // 미디어 (텍스트 콘텐츠에 집중) + '.icon', + '.svg', + '.img', + '.figure', + '.caption', + '.gallery', + '.media', + '.video', + '.audio', + // 관련 콘텐츠 + '.related', + '.related-posts', + '.related-articles', + '.recommend', + '.suggest', + '.popular', + '.trending', + '.recent', + // 기술적 요소 + 'script', + 'style', + 'noscript', + 'iframe', + // 기타 불필요 요소 + '.subscribe', + '.newsletter', + '.cookie', + '.consent', + '.file', + '.download', + '.attachment', + '.external', + '.internal', + '.link', + ]; + + // 요소 제거 (콘텐츠 보존) + removeSelectors.forEach((selector) => { + dom.querySelectorAll(selector).forEach((el) => { + // 콘텐츠가 포함된 요소는 제거하지 않음 + const textContent = el.textContent?.trim(); + if (textContent && textContent.length > 50) { + this.logger.debug(`콘텐츠 보존: ${selector} (${textContent.length}글자)`); + return; + } + el.remove(); + }); + }); + + this.logger.log('Disquiet.io 전용 요소 제거 완료 (콘텐츠 보존)'); + } + + /** + * 동적 콘텐츠 로딩을 위한 대기 시간 추가 + * @param dom JSDOM 인스턴스 + */ + protected async waitForDynamicContent(dom: Document): Promise { + // Disquiet.io는 동적 콘텐츠 로딩이 필요할 수 있음 + await new Promise((resolve) => setTimeout(resolve, 2000)); + // 콘텐츠가 로드되었는지 확인 + const contentSelectors = this.contentSelectors; + for (const selector of contentSelectors) { + const elements = dom.querySelectorAll(selector); + for (const element of elements) { + const textContent = element.textContent?.trim(); + if (textContent && textContent.length > 100) { + this.logger.debug(`동적 콘텐츠 확인: ${selector} (${textContent.length}글자)`); + return; + } + } + } + } + + /** + * 여러 div(본문 파편)를 모두 합쳐서 반환하는 extractContent 오버라이드 + */ + public async extractContent(url: URL): Promise> { + try { + const htmlResult = await fetchHtml(url.href, this.httpConfig); + if (!htmlResult.success) { + return { success: false, error: htmlResult.error }; + } + const domResult = createDom(htmlResult.data, this.domConfig); + if (!domResult.success) { + return { 
success: false, error: domResult.error }; + } + const dom: JSDOM = domResult.data; + const document = dom.window.document; + + // waitForDynamicContent가 존재하는지 타입 가드로 안전하게 호출 + if ( + typeof (this as unknown as { waitForDynamicContent?: (doc: Document) => Promise }) + .waitForDynamicContent === 'function' + ) { + await ( + this as unknown as { waitForDynamicContent: (doc: Document) => Promise } + ).waitForDynamicContent(document); + } + // 제목 추출 + const titleOption = extractTitle(document, this.titleConfig.selectors, this.titleConfig.patterns); + const title: string | undefined = titleOption == null ? undefined : titleOption; + // 여러 div를 모두 합쳐서 본문으로 사용 + const elements = Array.from(document.querySelectorAll(this.contentSelectors.join(','))); + const content = elements.map((el) => el.innerHTML).join('\n'); + if (!content || content.trim().length < 10) { + this.logger.debug(`${this.handlerName} 본문 요소를 찾지 못해 body로 fallback`); + return { success: true, data: { title, contentType: 'text/html', url: url.href } }; + } + // 콘텐츠 정제 + const cleaningPipeline = createContentCleaningPipeline(this.cleaningConfig); + const fakeElement = document.createElement('div'); + fakeElement.innerHTML = content; + const cleanedElement = cleaningPipeline(fakeElement, { + baseUrl: url.href, + config: this.cleaningConfig, + logger: this.logger, + }); + const result: ContentExtractionResult = { + title, + content: cleanedElement.outerHTML, + contentType: 'text/html', + url: url.href, + }; + return { success: true, data: result }; + } catch (error) { + return { success: false, error: error instanceof Error ? 
error : new Error(String(error)) }; + } + } +} diff --git a/src/modules/pre-handler/handlers/domain-specific.handler.ts b/src/modules/pre-handler/handlers/domain-specific.handler.ts index ccddfa6..84aa28a 100644 --- a/src/modules/pre-handler/handlers/domain-specific.handler.ts +++ b/src/modules/pre-handler/handlers/domain-specific.handler.ts @@ -1,7 +1,6 @@ import { Injectable, Logger } from '@nestjs/common'; import { IContentHandler } from '../interfaces/content-handler.interface'; import { PreHandleResult } from '../dto/pre-handle-result.dto'; -import { JSDOM } from 'jsdom'; /** * A map of domain names to their URL transformation functions. @@ -70,10 +69,6 @@ const DOMAIN_TRANSFORMATIONS: Record URL> = { newUrl.hostname = 'm.post.naver.com'; return newUrl; }, - 'tistory.com': (url) => { - // Tistory Blog: Keep original, usually accessible - return url; - }, 'stackoverflow.com': (url) => { // Stack Overflow: Keep original, it's usually accessible return url; @@ -369,11 +364,11 @@ export class DomainSpecificHandler implements IContentHandler { * @param url - The URL of the content to handle. * @returns A `PreHandleResult` with the new URL, or `null` on failure. 
*/ - public async handle(url: URL): Promise { + public handle(url: URL): Promise { const domain = Object.keys(DOMAIN_TRANSFORMATIONS).find((d) => url.hostname.endsWith(d)); if (!domain) { - return null; + return Promise.resolve(null); } try { @@ -383,19 +378,9 @@ export class DomainSpecificHandler implements IContentHandler { // Extract title and content from the original URL for specific domains let title: string | undefined; let content: string | undefined; - let contentType = 'text/html'; // Default content type + const contentType = 'text/html'; // Default content type - if (domain === 'medium.com') { - const mediumResult = await this.extractMediumContent(newUrl); - title = mediumResult.title; - content = mediumResult.content; - contentType = 'text/html'; // Medium content comes as HTML - } else if (domain === 'blog.naver.com') { - const naverBlogResult = await this.extractNaverBlogContent(url); - title = naverBlogResult.title; - content = naverBlogResult.content; - contentType = 'text/html'; - } else if (domain === 'substack.com') { + if (domain === 'substack.com') { title = this.extractSubstackTitle(url); } else if (domain === 'github.com') { title = this.extractGitHubTitle(url); @@ -405,423 +390,16 @@ export class DomainSpecificHandler implements IContentHandler { title = this.extractWikipediaTitle(url); } - return { + return Promise.resolve({ url: newUrl.href, title, content, contentType, - }; - } catch (error) { - this.logger.warn(`DomainSpecificHandler failed for ${url.href}: ${(error as Error).message}`); - return null; - } - } - - /** - * Extracts title and content from Medium URL by fetching and parsing HTML directly. - * @param url - The cleaned Medium URL. - * @returns The extracted title and content. 
- */ - private async extractMediumContent(url: URL): Promise<{ - title?: string; - content?: string; - }> { - try { - this.logger.debug(`Extracting Medium content from: ${url.href}`); - - // Create an AbortController for timeout - const controller = new AbortController(); - const timeoutId = setTimeout(() => controller.abort(), 15000); // 15 second timeout - - const response = await fetch(url.href, { - headers: { - 'User-Agent': - 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', - Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', - 'Accept-Language': 'en-US,en;q=0.5', - 'Accept-Encoding': 'gzip, deflate', - Connection: 'keep-alive', - 'Cache-Control': 'no-cache', - }, - redirect: 'follow', - signal: controller.signal, - }); - - clearTimeout(timeoutId); - - if (!response.ok) { - throw new Error(`HTTP ${response.status}: ${response.statusText}`); - } - - const html = await response.text(); - this.logger.debug(`Successfully fetched Medium HTML, length: ${html.length}`); - - const dom = new JSDOM(html); - const document = dom.window.document; - - // Extract title - const title = document.title?.trim() || this.extractTitleFromUrl(url); - - // Process images: convert picture tags to img tags with optimal source - this.optimizeMediumImages(document); - - // Extract the main content - const content = document.body?.outerHTML; - - this.logger.log(`Successfully extracted Medium content: title="${title?.substring(0, 50)}"`); - - return { - title, - content, - }; - } catch (error) { - this.logger.warn(`Failed to extract Medium content from ${url.href}: ${(error as Error).message}`); - - // Fallback to URL-based title extraction - const fallbackTitle = this.extractTitleFromUrl(url); - return { - title: fallbackTitle, - content: undefined, - }; - } - } - - /** - * Optimizes Medium images by converting picture tags to img tags. 
- * Selects the largest image from srcSet for better quality. - * @param document - The DOM document to process. - */ - private optimizeMediumImages(document: Document): void { - const pictures = document.querySelectorAll('picture'); - - pictures.forEach((picture) => { - const source = picture.querySelector('source'); - if (source) { - const srcSet = source.getAttribute('srcSet'); - - if (srcSet) { - // Parse srcSet and sort by image width (descending) - const sources = srcSet - .split(', ') - .map((src) => src.trim().split(' ')) - .filter((parts) => parts.length >= 2) - .sort((a, b) => { - const widthA = Number(a[1].replace('w', '')); - const widthB = Number(b[1].replace('w', '')); - return widthB - widthA; // Sort descending (largest first) - }); - - // Use the largest image from the source set - if (sources.length > 0 && sources[0].length > 0) { - const imageUrl = sources[0][0]; - const img = document.createElement('img'); - img.src = imageUrl; - - // Copy any existing attributes from the picture element - const existingImg = picture.querySelector('img'); - if (existingImg) { - if (existingImg.alt) img.alt = existingImg.alt; - if (existingImg.title) img.title = existingImg.title; - } - - // Replace picture with img - picture.parentNode?.replaceChild(img, picture); - } - } - } - }); - } - - /** - * 네이버 블로그 콘텐츠와 이미지를 추출합니다. 
- * @param url - 네이버 블로그 URL - * @returns 추출된 타이틀과 콘텐츠 - */ - private async extractNaverBlogContent(url: URL): Promise<{ - title?: string; - content?: string; - }> { - try { - this.logger.debug(`Extracting Naver Blog content from: ${url.href}`); - - // 네이버 블로그 접근을 위한 특수 헤더 설정 - const response = await fetch(url.href, { - headers: { - 'User-Agent': - 'Mozilla/5.0 (iPhone; CPU iPhone OS 14_7_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Mobile/15E148 Safari/604.1', - Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', - 'Accept-Language': 'ko-KR,ko;q=0.9,en;q=0.8', - 'Accept-Encoding': 'gzip, deflate', - Connection: 'keep-alive', - Referer: 'https://blog.naver.com/', - }, - redirect: 'follow', }); - - if (!response.ok) { - throw new Error(`HTTP ${response.status}: ${response.statusText}`); - } - - const html = await response.text(); - this.logger.debug(`Successfully fetched Naver Blog HTML, length: ${html.length}`); - - const dom = new JSDOM(html); - const document = dom.window.document; - - // 타이틀 추출 - const title = this.extractNaverBlogTitleFromDocument(document); - - // 네이버 블로그 이미지 최적화 - this.optimizeNaverBlogImages(document); - - // 콘텐츠 추출 - const content = document.body?.outerHTML; - - this.logger.log(`Successfully extracted Naver Blog content: title="${title?.substring(0, 50)}"`); - - return { - title, - content, - }; } catch (error) { - this.logger.warn(`Failed to extract Naver Blog content from ${url.href}: ${(error as Error).message}`); - - // 실패 시 기존 타이틀 추출 로직 사용 - const fallbackTitle = await this.extractNaverBlogTitle(url); - return { - title: fallbackTitle, - content: undefined, - }; - } - } - - /** - * 네이버 블로그 문서에서 타이틀을 추출합니다. 
- * @param document - DOM 문서 - * @returns 추출된 타이틀 - */ - private extractNaverBlogTitleFromDocument(document: Document): string | undefined { - // 다양한 방법으로 타이틀 추출 시도 - const titleSelectors = [ - 'meta[property="og:title"]', - 'meta[name="title"]', - 'title', - '.se-title-text', - '.pcol1 .title', - '.blog-title', - ]; - - for (const selector of titleSelectors) { - const element = document.querySelector(selector); - if (element) { - const title = element.getAttribute('content') || element.textContent; - if (title?.trim()) { - // 네이버 블로그 타이틀 정리 - let cleanTitle = title.trim(); - cleanTitle = cleanTitle.replace(/\s*:\s*네이버 블로그$/, ''); - cleanTitle = cleanTitle.replace(/\s*\|\s*네이버 블로그$/, ''); - return cleanTitle.trim(); - } - } - } - - return undefined; - } - - /** - * 네이버 블로그의 이미지를 최적화합니다. - * @param document - DOM 문서 - */ - private optimizeNaverBlogImages(document: Document): void { - // 네이버 블로그 이미지 처리 - const images = document.querySelectorAll('img'); - - images.forEach((img) => { - // 네이버 블로그 썸네일 URL을 원본 이미지 URL로 변환 - const src = img.getAttribute('src') || img.getAttribute('data-src'); - if (src) { - // 네이버 블로그 이미지 URL 패턴 처리 - let optimizedSrc = src; - - // 썸네일 URL을 원본 URL로 변환 - if (src.includes('blogfiles.naver.net')) { - // 썸네일 파라미터 제거하여 원본 이미지 획득 - optimizedSrc = src.replace(/\?.*$/, ''); - } - - // 상대 경로를 절대 경로로 변환 - if (optimizedSrc.startsWith('//')) { - optimizedSrc = 'https:' + optimizedSrc; - } else if (optimizedSrc.startsWith('/')) { - optimizedSrc = 'https://blog.naver.com' + optimizedSrc; - } - - // 최적화된 src 설정 - img.setAttribute('src', optimizedSrc); - - // lazy loading 속성 제거 - img.removeAttribute('data-src'); - img.removeAttribute('loading'); - } - }); - - // 네이버 블로그 특수 이미지 태그 처리 - const specialImages = document.querySelectorAll('[data-ke-src]'); - specialImages.forEach((element) => { - const dataSrc = element.getAttribute('data-ke-src'); - if (dataSrc) { - let optimizedSrc = dataSrc; - - // 상대 경로를 절대 경로로 변환 - if (optimizedSrc.startsWith('//')) { - 
optimizedSrc = 'https:' + optimizedSrc; - } else if (optimizedSrc.startsWith('/')) { - optimizedSrc = 'https://blog.naver.com' + optimizedSrc; - } - - // img 태그로 변환 - const img = document.createElement('img'); - img.src = optimizedSrc; - img.alt = element.getAttribute('alt') || ''; - - // 기존 요소를 새로운 img 태그로 교체 - element.parentNode?.replaceChild(img, element); - } - }); - } - - /** - * Extracts title from Naver Blog URL by fetching and parsing meta tags. - * @param url - The Naver Blog URL. - * @returns The extracted title or undefined. - */ - private async extractNaverBlogTitle(url: URL): Promise { - try { - // Try to extract from the original URL first - const metaInfo = await this.fetchNaverBlogMeta(url.href); - - if (metaInfo.title) { - this.logger.log(`Successfully extracted Naver Blog title: ${metaInfo.title}`); - return metaInfo.title; - } - - // Fallback to URL-based extraction - const fallbackTitle = this.extractNaverBlogTitleFromUrl(url); - this.logger.debug(`Using fallback title for Naver Blog: ${fallbackTitle}`); - return fallbackTitle; - } catch (error) { - this.logger.warn(`Failed to extract Naver Blog title from ${url.href}: ${(error as Error).message}`); - return this.extractNaverBlogTitleFromUrl(url); - } - } - - /** - * Fetches Naver Blog HTML with special headers and extracts meta information. - * @param urlString - The Naver Blog URL to fetch. - * @returns Meta information object. 
- */ - private async fetchNaverBlogMeta(urlString: string): Promise<{ - title?: string; - description?: string; - }> { - // Special headers for Naver Blog - const response = await fetch(urlString, { - headers: { - 'User-Agent': - 'Mozilla/5.0 (iPhone; CPU iPhone OS 14_7_1 like Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Mobile/15E148 Safari/604.1', - Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', - 'Accept-Language': 'ko-KR,ko;q=0.9,en;q=0.8', - 'Accept-Encoding': 'gzip, deflate', - Connection: 'keep-alive', - Referer: 'https://blog.naver.com/', - }, - redirect: 'follow', - }); - - if (!response.ok) { - throw new Error(`HTTP ${response.status}: ${response.statusText}`); - } - - const html = await response.text(); - this.logger.debug(`Successfully fetched Naver Blog HTML, length: ${html.length}`); - - const dom = new JSDOM(html); - const document = dom.window.document; - - // Extract meta information - const title = document.querySelector('title')?.textContent?.trim(); - const description = document.querySelector('meta[name="description"]')?.getAttribute('content')?.trim(); - const ogTitle = document.querySelector('meta[property="og:title"]')?.getAttribute('content')?.trim(); - const ogDescription = document - .querySelector('meta[property="og:description"]') - ?.getAttribute('content') - ?.trim(); - - // Clean up Naver Blog title (remove ":" and blog name) - let cleanTitle = ogTitle || title; - if (cleanTitle) { - // Remove common Naver Blog suffixes - cleanTitle = cleanTitle.replace(/\s*:\s*네이버 블로그$/, ''); - cleanTitle = cleanTitle.replace(/\s*\|\s*네이버 블로그$/, ''); - cleanTitle = cleanTitle.trim(); - } - - this.logger.debug(`Extracted Naver Blog meta: title="${cleanTitle?.substring(0, 50)}"`); - - return { - title: cleanTitle, - description: ogDescription || description, - }; - } - - /** - * Fallback method to extract title from Naver Blog URL pattern. - * @param url - The Naver Blog URL to extract title from. 
- * @returns The extracted title or undefined. - */ - private extractNaverBlogTitleFromUrl(url: URL): string | undefined { - // Naver Blog URL pattern: https://blog.naver.com/username/postid - const pathParts = url.pathname.split('/').filter((part) => part.length > 0); - - if (pathParts.length >= 2) { - const username = pathParts[0]; - const postId = pathParts[1]; - return `${username}의 블로그 - ${postId}`; - } - - return undefined; - } - - /** - * Fallback method to extract title from URL pattern. - * @param url - The URL to extract title from. - * @returns The extracted title or undefined. - */ - private extractTitleFromUrl(url: URL): string | undefined { - // Medium URL patterns: - // https://medium.com/@username/article-title-123abc - // https://medium.com/publication/article-title-123abc - // https://username.medium.com/article-title-123abc - - const pathParts = url.pathname.split('/').filter((part) => part.length > 0); - - if (pathParts.length >= 2) { - // Get the last part which should be the article slug - const articleSlug = pathParts[pathParts.length - 1]; - - // Remove hash-like ending (e.g., -123abc) - const cleanSlug = articleSlug.replace(/-[a-f0-9]{6,}$/i, ''); - - // Convert slug to title - const title = cleanSlug - .split('-') - .map((word) => word.charAt(0).toUpperCase() + word.slice(1)) - .join(' '); - - return title.length > 5 ? 
title : undefined; + this.logger.warn(`DomainSpecificHandler failed for ${url.href}: ${(error as Error).message}`); + return Promise.resolve(null); } - - return undefined; } /** diff --git a/src/modules/pre-handler/handlers/medium.handler.ts b/src/modules/pre-handler/handlers/medium.handler.ts new file mode 100644 index 0000000..490e468 --- /dev/null +++ b/src/modules/pre-handler/handlers/medium.handler.ts @@ -0,0 +1,91 @@ +import { Injectable } from '@nestjs/common'; +import { AbstractContentHandler } from '../base/abstract-content-handler'; +import { + HttpRequestConfig, + DomConfig, + ContentCleaningConfig, + TitleExtractionConfig, +} from '../types/content-extraction.types'; +import { PreHandleResult } from '../dto/pre-handle-result.dto'; +import { JSDOM } from 'jsdom'; +import { postProcessDom } from '../utils/content-cleaning-pipeline'; + +/** + * Medium 전용 핸들러 + * - SOLID 원칙 기반, AbstractContentHandler 상속 + */ +@Injectable() +export class MediumHandler extends AbstractContentHandler { + public canHandle(url: URL): boolean { + return url.hostname.endsWith('medium.com'); + } + + protected get handlerName(): string { + return 'Medium 핸들러'; + } + + protected get httpConfig(): HttpRequestConfig { + return { + userAgent: + 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36', + timeout: 15000, + headers: { + Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'Accept-Language': 'en-US,en;q=0.5', + 'Accept-Encoding': 'gzip, deflate', + Connection: 'keep-alive', + 'Cache-Control': 'no-cache', + }, + redirect: 'follow', + }; + } + + protected get domConfig(): DomConfig { + return { + userAgent: this.httpConfig.userAgent, + resources: 'usable', + runScripts: 'outside-only', + pretendToBeVisual: true, + }; + } + + protected get cleaningConfig(): ContentCleaningConfig { + return { + removeUnwantedElements: true, + cleanupStyles: true, + cleanupLinks: true, + cleanupImages: true, + 
cleanupText: false, + refineTitle: true, + }; + } + + protected get titleConfig(): TitleExtractionConfig { + return { + selectors: ['meta[property="og:title"]', 'meta[name="title"]', 'title', 'h1'], + patterns: [], + siteSpecificPatterns: {}, + }; + } + + protected get contentSelectors(): readonly string[] { + return ['article', '.section-content', '.postArticle-content', '.meteredContent', '.main-content']; + } + + /** + * Medium 콘텐츠를 처리하여 후처리된 결과를 반환합니다. + * @param url 처리할 URL + * @returns 후처리된 PreHandleResult 또는 null + */ + public async handle(url: URL): Promise { + const result = await super.handle(url); + if (!result || !result.content) return result; + const dom = new JSDOM(result.content); + const document = dom.window.document; + postProcessDom(document); + return { + ...result, + content: document.body?.outerHTML ?? result.content, + }; + } +} diff --git a/src/modules/pre-handler/handlers/naver-blog.handler.ts b/src/modules/pre-handler/handlers/naver-blog.handler.ts new file mode 100644 index 0000000..0dbd34d --- /dev/null +++ b/src/modules/pre-handler/handlers/naver-blog.handler.ts @@ -0,0 +1,108 @@ +import { Injectable } from '@nestjs/common'; +import { AbstractContentHandler } from '../base/abstract-content-handler'; +import { + HttpRequestConfig, + DomConfig, + ContentCleaningConfig, + TitleExtractionConfig, +} from '../types/content-extraction.types'; +import { PreHandleResult } from '../dto/pre-handle-result.dto'; +import { JSDOM } from 'jsdom'; +import { postProcessDom } from '../utils/content-cleaning-pipeline'; + +/** + * 네이버 블로그 전용 핸들러 + * - SOLID 원칙 기반, AbstractContentHandler 상속 + */ +@Injectable() +export class NaverBlogHandler extends AbstractContentHandler { + public canHandle(url: URL): boolean { + return url.hostname.endsWith('blog.naver.com'); + } + + protected get handlerName(): string { + return '네이버블로그 핸들러'; + } + + protected get httpConfig(): HttpRequestConfig { + return { + userAgent: + 'Mozilla/5.0 (iPhone; CPU iPhone OS 14_7_1 like 
Mac OS X) AppleWebKit/605.1.15 (KHTML, like Gecko) Version/14.1.2 Mobile/15E148 Safari/604.1', + timeout: 15000, + headers: { + Accept: 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', + 'Accept-Language': 'ko-KR,ko;q=0.9,en;q=0.8', + 'Accept-Encoding': 'gzip, deflate', + Connection: 'keep-alive', + Referer: 'https://blog.naver.com/', + }, + redirect: 'follow', + }; + } + + protected get domConfig(): DomConfig { + return { + userAgent: this.httpConfig.userAgent, + resources: 'usable', + runScripts: 'outside-only', + pretendToBeVisual: true, + }; + } + + protected get cleaningConfig(): ContentCleaningConfig { + return { + removeUnwantedElements: true, + cleanupStyles: true, + cleanupLinks: true, + cleanupImages: true, + cleanupText: false, + refineTitle: true, + }; + } + + protected get titleConfig(): TitleExtractionConfig { + return { + selectors: [ + 'meta[property="og:title"]', + 'meta[name="title"]', + 'title', + '.se-title-text', + '.pcol1 .title', + '.blog-title', + ], + patterns: [], + siteSpecificPatterns: {}, + }; + } + + protected get contentSelectors(): readonly string[] { + return [ + '#postViewArea', + '.se-main-container', + '.post-view', + '.se_component_wrap', + '.se_textView', + '.blog2_container', + '.se_content', + '.view', + '.post', + ]; + } + + /** + * 네이버 블로그 콘텐츠를 처리하여 후처리된 결과를 반환합니다. + * @param url 처리할 URL + * @returns 후처리된 PreHandleResult 또는 null + */ + public async handle(url: URL): Promise { + const result = await super.handle(url); + if (!result || !result.content) return result; + const dom = new JSDOM(result.content); + const document = dom.window.document; + postProcessDom(document, { baseUrl: 'https://blog.naver.com' }); + return { + ...result, + content: document.body?.outerHTML ?? 
result.content, + }; + } +} diff --git a/src/modules/pre-handler/handlers/pdf.handler.ts b/src/modules/pre-handler/handlers/pdf.handler.ts index 5f79add..b549b32 100644 --- a/src/modules/pre-handler/handlers/pdf.handler.ts +++ b/src/modules/pre-handler/handlers/pdf.handler.ts @@ -12,6 +12,7 @@ import { TitleExtractionConfig, } from '../types/content-extraction.types'; import { PreHandleResult } from '../dto/pre-handle-result.dto'; +import { extractTitleFromPath } from '../utils/functional-utils'; /** * PDF 파일 핸들러 @@ -97,18 +98,15 @@ export class PdfHandler extends AbstractContentHandler { /** * PDF는 별도 본문 추출 없이 타입 마킹만 수행 + * @param url 처리할 URL + * @returns PreHandleResult 또는 null */ public handle(url: URL): Promise { try { let title: string | undefined; - const pathParts = url.pathname.split('/'); - const filename = pathParts[pathParts.length - 1]; + const filename = url.pathname.split('/').pop(); if (filename && filename.includes('.pdf')) { - title = filename - .replace(/\.pdf$/i, '') - .replace(/[-_]/g, ' ') - .replace(/\b\w/g, (l) => l.toUpperCase()) - .trim(); + title = extractTitleFromPath(filename, { removeExtension: true }); } return Promise.resolve({ url: url.href, diff --git a/src/modules/pre-handler/handlers/readability.handler.ts b/src/modules/pre-handler/handlers/readability.handler.ts index 8d80f11..b9acf8d 100644 --- a/src/modules/pre-handler/handlers/readability.handler.ts +++ b/src/modules/pre-handler/handlers/readability.handler.ts @@ -128,24 +128,36 @@ export class ReadabilityHandler extends AbstractContentHandler { * 스크립트 활성화 JSDOM 생성 */ private async createDOMWithScripts(url: string): Promise { - return JSDOM.fromURL(url, { + const dom = await JSDOM.fromURL(url, { userAgent: this.httpConfig.userAgent, resources: 'usable', runScripts: 'dangerously', pretendToBeVisual: true, }); + this.removeAllScripts(dom.window.document); + return dom; } /** * 스크립트 비활성화 JSDOM 생성 */ private async createDOMWithoutScripts(url: string): Promise { - return 
JSDOM.fromURL(url, { + const dom = await JSDOM.fromURL(url, { userAgent: this.httpConfig.userAgent, resources: 'usable', runScripts: 'outside-only', pretendToBeVisual: true, }); + this.removeAllScripts(dom.window.document); + return dom; + } + + /** + * DOM에서 모든