From c34afa8db78943db2d1c551f0def959ef57c16d8 Mon Sep 17 00:00:00 2001 From: TechQuery Date: Thu, 4 May 2023 22:51:55 +0800 Subject: [PATCH] [add] Table & Heading detection of Non-semantic Agenda pages --- package.json | 4 +- pnpm-lock.yaml | 78 ++++++++++++++++++++++++++++++------ source/Agenda/HuoDongXing.ts | 6 +++ source/Agenda/common.ts | 72 ++++++++++++++++++++++++++++----- source/Agenda/core.ts | 2 +- source/utility.ts | 16 ++++++-- 6 files changed, 148 insertions(+), 30 deletions(-) diff --git a/package.json b/package.json index f1729e3..f86f20f 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@fcc-cdc/it-events", - "version": "1.3.0-alpha.0", + "version": "1.3.0", "license": "MIT", "author": "shiy2008@gmail.com", "description": "IT Events Crawler of China", @@ -29,7 +29,7 @@ "commander-jsx": "^0.6.1", "file-type": "^16.5.4", "fs-extra": "^11.1.1", - "jsdom": "^22.0.0", + "jsdom": "^21.1.2", "node-fetch": "^2.6.9", "string-similarity": "^4.0.4", "urlpattern-polyfill": "^8.0.2", diff --git a/pnpm-lock.yaml b/pnpm-lock.yaml index a4daa8a..efbd42b 100644 --- a/pnpm-lock.yaml +++ b/pnpm-lock.yaml @@ -14,8 +14,8 @@ dependencies: specifier: ^11.1.1 version: 11.1.1 jsdom: - specifier: ^22.0.0 - version: 22.0.0 + specifier: ^21.1.2 + version: 21.1.2 node-fetch: specifier: ^2.6.9 version: 2.6.9 @@ -1065,6 +1065,13 @@ packages: resolution: {integrity: sha512-j2afSsaIENvHZN2B8GOpF566vZ5WVk5opAiMTvWgaQT8DkbOqsTfvNAvHoRGU2zzP8cPoqys+xHTRDWW8L+/BA==} dev: false + /acorn-globals@7.0.1: + resolution: {integrity: sha512-umOSDSDrfHbTNPuNpC2NSnnA3LUrqpevPb4T9jRx4MagXNS0rs+gwiTcAvqCRmsD6utzsrzNt+ebm00SNWiC3Q==} + dependencies: + acorn: 8.8.2 + acorn-walk: 8.2.0 + dev: false + /acorn-jsx@5.3.2(acorn@8.8.2): resolution: {integrity: sha512-rq9s+JNhf0IChjtDXxllJ7g41oZk5SlXtp0LHwyA5cejwn7vKmKp4pPri6YEePv2PU65sAsegbXtIinmDFDXgQ==} peerDependencies: @@ -1073,11 +1080,15 @@ packages: acorn: 8.8.2 dev: true + /acorn-walk@8.2.0: + resolution: {integrity: sha512-k+iyHEuPgSw6SbuDpGQM+06HQUa04DZ3o+F6CSzXMvvI5KMvnaEqXe+YVe555R9nn6GPt404fos4wcgpw12SDA==} + engines: {node: '>=0.4.0'} + dev: false + /acorn@8.8.2: resolution: {integrity: sha512-xjIYgE8HBrkpd/sJqOGNspf8uHG+NOHGOw6a/Urj8taM2EXfdNAH2oFcPeIFfsv3+kz/mJrS5VuMqbNLjCa2vw==} engines: {node: '>=0.4.0'} hasBin: true - dev: true /agent-base@6.0.2: resolution: {integrity: sha512-RZNwNclF7+MS/8bDg70amg32dyeZGZxiDuQmZxKLAlQjr3jGyLx+4Kkk58UO7D2QdgFIQCovuSuZESne6RG6XQ==} @@ -1508,7 +1519,6 @@ packages: /deep-is@0.1.4: resolution: {integrity: sha512-oIPzksmTg4/MriiaYGO+okXDT7ztn/w3Eptv/+gSIdMdKsJo0u4CfYNFJPy+4SKMuCqGw2wxnA+URMg3t8a/bQ==} - dev: true /deepmerge@4.3.1: resolution: {integrity: sha512-3sUqbMEc77XqpdNO7FRyRog+eW3ph+GYCbj+rK+uYyRMuwsVy0rMiVtPn+QJlKFvWP/1PYpapqYn0Me2knFn+A==} @@ -1605,6 +1615,19 @@ packages: engines: {node: '>=10'} dev: true + /escodegen@2.0.0: + resolution: {integrity: sha512-mmHKys/C8BFUGI+MAWNcSYoORYLMdPzjrknd2Vc+bUsjN5bXcr8EhrNB+UTqfL1y3I9c4fw2ihgtMPQLBRiQxw==} + engines: {node: '>=6.0'} + hasBin: true + dependencies: + esprima: 4.0.1 + estraverse: 5.3.0 + esutils: 2.0.3 + optionator: 0.8.3 + optionalDependencies: + source-map: 0.6.1 + dev: false + /eslint-config-prettier@8.8.0(eslint@8.39.0): resolution: {integrity: sha512-wLbQiFre3tdGgpDv67NQKnJuTlcUVYHas3k+DZCc2U2BadthoEY4B7hLPvAxaqdyOGCzuLfii2fqGph10va7oA==} hasBin: true @@ -1697,7 +1720,6 @@ packages: resolution: {integrity: sha512-eGuFFw7Upda+g4p+QHvnW0RyTX/SVeJBDM/gCtMARO0cLuT2HcEKnTPvhjV6aGeqrCB/sbNop0Kszm0jsaWU4A==} engines: {node: '>=4'} hasBin: true - dev: true /esquery@1.5.0: resolution: {integrity: sha512-YQLXUplAwJgCydQ78IMJywZCceoqk1oH01OERdSAJc/7U2AylwjhSCLDEtqwg811idIS/9fIU5GjG73IgjKMVg==} @@ -1721,12 +1743,10 @@ packages: /estraverse@5.3.0: resolution: {integrity: sha512-MMdARuVEQziNTeJD8DgMqmhwR11BRQ/cBP+pLtYdSTnf3MIO8fFeiINEbX36ZdNlfU/7A9f3gUw49B3oQsvwBA==} engines: {node: '>=4.0'} - dev: true /esutils@2.0.3: resolution: {integrity: sha512-kVscqXk4OCp68SZ0dkgEKVi6/8ij300KBWTJq32P/dYeWTSwK41WyTxalN1eRmA5Z9UU/LX9D7FWSmV9SAYx6g==} engines: {node: '>=0.10.0'} - dev: true /execa@5.1.1: resolution: {integrity: sha512-8uSpZZocAZRBAPIEINJj3Lo9HyGitllczc27Eh5YYojjMFMn8yHMDMaUHE2Jqfq05D/wucwI4JGURyXt1vchyg==} @@ -1795,7 +1815,6 @@ packages: /fast-levenshtein@2.0.6: resolution: {integrity: sha512-DCXu6Ifhqcks7TZKY3Hxp3y6qphY5SJZmrWMDrKcERSOXWQdMhU9Ig/PYrzyw/ul9jOIyh0N4M0tbC5hodg8dw==} - dev: true /fastq@1.15.0: resolution: {integrity: sha512-wBrocU2LCXXa+lWBt8RoIRD89Fi8OdABODa/kEnyeyjS5aZO5/GNvI5sEINADqP/h8M29UHTHUb53sUu5Ihqdw==} @@ -2640,9 +2659,9 @@ packages: argparse: 2.0.1 dev: true - /jsdom@22.0.0: - resolution: {integrity: sha512-p5ZTEb5h+O+iU02t0GfEjAnkdYPrQSkfuTSMkMYyIoMvUNEHsbG0bHHbfXIcfTqD2UfvjQX7mmgiFsyRwGscVw==} - engines: {node: '>=16'} + /jsdom@21.1.2: + resolution: {integrity: sha512-sCpFmK2jv+1sjff4u7fzft+pUh2KSUbUrEHYHyfSIbGTIcmnjyp83qg6qLwdJ/I3LpTXx33ACxeRL7Lsyc6lGQ==} + engines: {node: '>=14'} peerDependencies: canvas: ^2.5.0 peerDependenciesMeta: @@ -2650,10 +2669,13 @@ packages: optional: true dependencies: abab: 2.0.6 + acorn: 8.8.2 + acorn-globals: 7.0.1 cssstyle: 3.0.0 data-urls: 4.0.0 decimal.js: 10.4.3 domexception: 4.0.0 + escodegen: 2.0.0 form-data: 4.0.0 html-encoding-sniffer: 3.0.0 http-proxy-agent: 5.0.0 @@ -2724,6 +2746,14 @@ packages: engines: {node: '>=6'} dev: true + /levn@0.3.0: + resolution: {integrity: sha512-0OO4y2iOHix2W6ujICbKIaEQXvFQHue65vUG3pb5EUomzPI90z9hsA1VsO/dbIIpC53J8gxM9Q4Oho0jrCM/yA==} + engines: {node: '>= 0.8.0'} + dependencies: + prelude-ls: 1.1.2 + type-check: 0.3.2 + dev: false + /levn@0.4.1: resolution: {integrity: sha512-+bT2uH4E5LGE7h/n3evcS/sQlJXCpIp6ym8OWJ5eV6+67Dsql/LaaT7qJBAt2rzfoa/5QBGBhxDix1dMt2kQKQ==} engines: {node: '>= 0.8.0'} @@ -2983,6 +3013,18 @@ packages: mimic-fn: 4.0.0 dev: true + /optionator@0.8.3: + resolution: {integrity: sha512-+IW9pACdk3XWmmTXG8m3upGUJst5XRGzxMRjXzAuJ1XnIFNvfhjjIuYkDvysnPQ7qzqVzLt78BCruntqRhWQbA==} + engines: {node: '>= 0.8.0'} + dependencies: + deep-is: 0.1.4 + fast-levenshtein: 2.0.6 + levn: 0.3.0 + prelude-ls: 1.1.2 + type-check: 0.3.2 + word-wrap: 1.2.3 + dev: false + /optionator@0.9.1: resolution: {integrity: sha512-74RlY5FCnhq4jRxVUPKDaRwrVNXMqsGsiW6AJw4XK8hmtm10wC0ypZBLw5IIp85NZMr91+qd1RvvENwg7jjRFw==} engines: {node: '>= 0.8.0'} @@ -3118,6 +3160,11 @@ packages: find-up: 4.1.0 dev: true + /prelude-ls@1.1.2: + resolution: {integrity: sha512-ESF23V4SKG6lVSGZgYNpbsiaAkdab6ZgOxe52p7+Kid3W3u3bxR4Vfd/o21dmN7jSt0IwgZ4v5MUd26FEtXE9w==} + engines: {node: '>= 0.8.0'} + dev: false + /prelude-ls@1.2.1: resolution: {integrity: sha512-vkcDPrRZo1QZLbn5RLGPpg/WmIQ65qoWWhcGKf/b5eplkkarX0m9z8ppCat4mlOqUsWpyNuYgO3VRyrYHSzX5g==} engines: {node: '>= 0.8.0'} @@ -3368,7 +3415,6 @@ packages: /source-map@0.6.1: resolution: {integrity: sha512-UjgapumWlbMhkBgzT7Ykc5YXUT46F0iKu8SGXq0bcwP5dz/h0Plj6enJqjz1Zbq2l5WaqYnrVbwWOWMyF3F47g==} engines: {node: '>=0.10.0'} - dev: true /sprintf-js@1.0.3: resolution: {integrity: sha512-D9cPgkvLlV3t3IzL0D0YLvGA9Ahk4PcvVwUbN0dSGr1aP0Nrt4AEnTUbuGvquEC0mA64Gqt1fzirlRs5ibXx8g==} @@ -3607,6 +3653,13 @@ packages: typescript: 5.0.4 dev: true + /type-check@0.3.2: + resolution: {integrity: sha512-ZCmOJdvOWDBYJlzAoFkC+Q0+bUyEOS1ltgp1MGU03fqHG+dbi9tBFU2Rd9QKiDZFAYrhPh2JUf7rZRIuHRKtOg==} + engines: {node: '>= 0.8.0'} + dependencies: + prelude-ls: 1.1.2 + dev: false + /type-check@0.4.0: resolution: {integrity: sha512-XleUoc9uwGXqjWwXaUTZAmzMcFZ5858QA2vvx1Ur5xIcixXIP+8LnFDgRplU30us6teqdlskFfu+ae4K79Ooew==} engines: {node: '>= 0.8.0'} @@ -3786,7 +3839,6 @@ packages: /word-wrap@1.2.3: resolution: {integrity: sha512-Hz/mrNwitNRh/HUAtM/VT/5VH+ygD6DV7mYKZAtHOrbs8U7lvPS6xf7EJKMF0uW1KJCl0H701g3ZGus+muE5vQ==} engines: {node: '>=0.10.0'} - dev: true /wrap-ansi@6.2.0: resolution: {integrity: sha512-r6lPcBGxZXlIcymEu7InxDMhdW0KDxpLgoFLcguasxCaJ/SOIZwINatK9KY/tf+ZrlywOKU0UDj3ATXUBfxJXA==} diff --git a/source/Agenda/HuoDongXing.ts b/source/Agenda/HuoDongXing.ts index 19e6afe..0a393b5 100644 --- a/source/Agenda/HuoDongXing.ts +++ b/source/Agenda/HuoDongXing.ts @@ -5,3 +5,9 @@ export class HuoDongXingAgenda extends CommonAgendaCrawler { static schema = new URLPattern(`${this.baseURI}/:event(\\d+)`); } + +export class OldHuoDongXingAgenda extends CommonAgendaCrawler { + static baseURI = 'https://www.huodongxing.com/go'; + + static schema = new URLPattern(`${this.baseURI}/:event(\\w+)`); +} diff --git a/source/Agenda/common.ts b/source/Agenda/common.ts index 143ebb3..5e74306 100644 --- a/source/Agenda/common.ts +++ b/source/Agenda/common.ts @@ -1,11 +1,13 @@ import { JSDOM } from 'jsdom'; -import { walkDOM, countBy } from 'web-utility'; +import { byteLength, countBy, walkDOM } from 'web-utility'; +import { CSSSelectorPrecision, getCSSSelector, sameParentOf } from '../utility'; import { Agenda, AgendaCrawler } from './core'; -import { getCSSSelector, sameParentOf } from '../utility'; export const TimePattern = /\d{1,2}\s*[::]\s*\d{2}/; +const HeadingSelector = `h1, h2, h3, h4, h5, h6, strong, b`; + export abstract class CommonAgendaCrawler extends AgendaCrawler { document?: Document; @@ -20,7 +22,11 @@ export abstract class CommonAgendaCrawler extends AgendaCrawler { walkDOM(document.body, 3), ({ nodeValue, parentElement }) => TimePattern.test(nodeValue) && { - selector: getCSSSelector(parentElement, document.body) + selector: getCSSSelector( + parentElement, + document.body, + CSSSelectorPrecision.Medium + ) } ).filter(Boolean); @@ -32,12 +38,21 @@ export abstract class CommonAgendaCrawler extends AgendaCrawler { const [first, second] = document.querySelectorAll(agendaTimeSelector); const agendaBox = sameParentOf(first, second) as Element; - const agendaBoxSelector = getCSSSelector(agendaBox); + const agendaBoxSelector = getCSSSelector( + agendaBox, + document.body, + CSSSelectorPrecision.High + ); for (let i = 0; i < agendaBox.childElementCount; i++) - yield await this.getItem( - `${agendaBoxSelector} > :nth-child(${i + 1})` - ); + if (agendaBox.tagName.toLowerCase() === 'tbody') + yield* this.getItems( + agendaBox.children[i] as HTMLTableRowElement + ); + else + yield await this.getItem( + `${agendaBoxSelector} > :nth-child(${i + 1})` + ); this.document = undefined; } @@ -48,20 +63,55 @@ export abstract class CommonAgendaCrawler extends AgendaCrawler { let time = ''; - const [title, name, position, summary] = Array.from( - walkDOM(agendaItem, 3), - ({ nodeValue }) => nodeValue.trim() - ).filter(text => !TimePattern.test(text) || !(time = text)); + const [head, body] = Array.from(walkDOM(agendaItem, 3)).reduce( + (group, { parentElement, nodeValue }) => { + const isHeading = + parentElement.matches(HeadingSelector) || + !!parentElement.closest(HeadingSelector); + + if (TimePattern.test(nodeValue)) time = nodeValue.trim(); + else group[isHeading ? 0 : 1].push(nodeValue.trim()); + return group; + }, + [[], []] as string[][] + ); const [startTime, endTime] = time.split(/[^\d::]+/), + [name, title] = head.sort((a, b) => byteLength(a) - byteLength(b)), + [position, summary] = body.sort( + (a, b) => byteLength(a) - byteLength(b) + ), avatar = agendaItem.querySelector('img[src]')?.src; return { mentor: { name, position, avatar }, title, + summary, startTime, endTime }; } + + protected getItems({ children }: HTMLTableRowElement): Agenda[] { + const [time, ...agendas] = [...children]; + const [startTime, endTime] = time.textContent.trim().split(/[^\d::]+/); + + return agendas.map(agendaItem => { + const [name, position, title, summary] = agendaItem.textContent + .trim() + .split('\n') + .sort((a, b) => byteLength(a) - byteLength(b)), + avatar = + agendaItem.querySelector('img[src]')?.src; + + return { + mentor: { name, position, avatar }, + title, + summary, + startTime, + endTime + }; + }); + } } diff --git a/source/Agenda/core.ts b/source/Agenda/core.ts index 034417d..d8732c8 100644 --- a/source/Agenda/core.ts +++ b/source/Agenda/core.ts @@ -13,7 +13,7 @@ export type Duration = Partial>; export type Forum = Duration & Pick; -export interface Agenda extends Duration { +export interface Agenda extends Duration, Pick { title?: string; mentor?: Mentor; forum?: Forum; diff --git a/source/utility.ts b/source/utility.ts index 254874b..ab52c61 100644 --- a/source/utility.ts +++ b/source/utility.ts @@ -75,9 +75,16 @@ export async function saveFile( return path; } +export enum CSSSelectorPrecision { + Low, + Medium, + High +} + export function getCSSSelector( toElement: Element, - fromElement = toElement.getRootNode() + fromElement = toElement.getRootNode(), + precision = CSSSelectorPrecision.Low ) { const selectors: string[] = []; @@ -88,9 +95,12 @@ export function getCSSSelector( tagName.toLowerCase() + (className.trim() ? '.' + className.split(/\s+/).filter(Boolean).join('.') - : `:nth-child(${ + : precision === CSSSelectorPrecision.High || + (precision === CSSSelectorPrecision.Medium && !selectors[0]) + ? `:nth-child(${ [...parentNode.children].indexOf(toElement) + 1 - })`); + })` + : ''); selectors.unshift(selector); toElement = parentNode as Element;