Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: facebook group crawler #34

Open
wants to merge 3 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions lib/__tests__/fbgroup.spec.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
import { getPosts } from "@/lib/fbPublicGroupCrawler";

// Integration test: getPosts is called against a real public group, so this
// test needs network access to mbasic.facebook.com and depends on that group
// still having at least one post.
test("getPosts should return array of post data", async function () {
  // https://mbasic.facebook.com/groups/464870710346711
  const posts = await getPosts("464870710346711");

  expect(Array.isArray(posts)).toBe(true);
  expect(posts.length).toBeGreaterThan(0);

  // Only the first post is inspected; it stands in for the shape of all posts.
  const post = posts[0];

  expect(post).toHaveProperty("id");
  expect(post).toHaveProperty("photoUrls");
  expect(post).toHaveProperty("author");
  expect(post).toHaveProperty("content");
  expect(post).toHaveProperty("publishTime");

  expect(typeof post.id).toBe("string");
  expect(Array.isArray(post.photoUrls)).toBe(true);
  expect(typeof post.author).toBe("string");
  expect(typeof post.content).toBe("string");
  // getPosts may return null here when no group insight is found; this
  // assertion expects the first post of the fixture group to carry one.
  expect(typeof post.publishTime).toBe("number");
});
74 changes: 74 additions & 0 deletions lib/fbPublicGroupCrawler.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,74 @@
import * as cheerio from "cheerio";
import createDOMPurify from "dompurify";
import { bootstrap } from "global-agent";
import { got } from "got";
import { JSDOM } from "jsdom";
import unescape from "lodash/unescape";

import { FacebookGroupPostData, GroupInsight } from "./types";

// Module-level setup shared by every call into this file.
// Create the window of an empty jsdom document and bind a DOMPurify instance to it.
const window = new JSDOM("").window;
// NOTE(review): the `as any` cast presumably papers over a mismatch between
// jsdom's window type and the parameter type createDOMPurify declares — confirm.
const DOMPurify = createDOMPurify(window as any);

// Call global-agent's bootstrap() only when GLOBAL_AGENT_HTTP_PROXY is set to a
// non-empty value; presumably this routes outgoing HTTP requests through that
// proxy (see the global-agent documentation).
if (process.env.GLOBAL_AGENT_HTTP_PROXY) {
bootstrap();
}

/** Matches the literal `<br>` tags that are converted to newlines before sanitising. */
const LINE_BREAK_TAG = /<br>/g;

/** A single post extracted from a group's mbasic page by {@link getPosts}. */
export interface FacebookGroupPost {
  /** The post's `mf_story_key`, taken from its `data-ft` metadata. */
  id: string;
  /** `src` of every `<img>` found in the post's image container; empty when there are none. */
  photoUrls: string[];
  /** Text of the author element in the post header. */
  author: string;
  /** Plain-text body: paragraphs joined by blank lines, with all HTML tags stripped. */
  content: string;
  /** `publish_time` multiplied by 1000, or `null` when the metadata does not provide it. */
  publishTime: number | null;
  /** Link to the post on www.facebook.com, built from the group id and the post id. */
  permalink: string;
}

/**
 * Downloads the mbasic page of a Facebook group and extracts the posts listed on it.
 *
 * Elements inside `#m_group_stories_container` that carry no `data-ft` metadata
 * are skipped: they hold no post information, and previously made the whole
 * call fail with a TypeError.
 *
 * @param groupId - Group identifier as used in `https://mbasic.facebook.com/groups/<groupId>`.
 * @returns The posts found on the page, in document order.
 * @throws If the HTTP request made by `got` rejects, or if a `data-ft`
 *   attribute does not contain valid JSON (`SyntaxError` from `JSON.parse`).
 */
export async function getPosts(groupId: string): Promise<FacebookGroupPost[]> {
  const groupUrl = `https://mbasic.facebook.com/groups/${groupId}`;
  const { body: html } = await got(groupUrl);

  const $ = cheerio.load(html);

  return (
    $("#m_group_stories_container")
      .find("> div > div")
      .toArray()
      // Without a non-empty `data-ft` payload there is nothing to build a post from.
      .filter((element) => Boolean($(element).attr("data-ft")))
      .map((element) => {
        const post = $(element);

        // `data-ft` holds HTML-escaped JSON describing the post.
        const postData = JSON.parse(unescape(post.attr("data-ft"))) as FacebookGroupPostData;

        const author = post.find("h3 > span strong:first-child").text();

        // The insight entry for this group may be absent, or may lack
        // `post_context`; in either case the publish time is reported as null
        // instead of throwing or producing NaN.
        const groupInsight = postData.page_insights?.[groupId] as GroupInsight | undefined;
        const publishSeconds = groupInsight?.post_context?.publish_time;
        const publishTime = typeof publishSeconds === "number" ? publishSeconds * 1000 : null;

        // Turn <br> into newlines first, then let DOMPurify strip every remaining tag.
        const content = post
          .find("div span > p")
          .toArray()
          .map((paragraph) =>
            DOMPurify.sanitize($(paragraph).html()?.replace(LINE_BREAK_TAG, "\n") ?? "", {
              ALLOWED_TAGS: [],
            })
          )
          .join("\n\n");

        // An empty selection simply yields an empty array, so no length guard is needed.
        const photoUrls = post
          .find("div > div:last-child")
          .find("img")
          .toArray()
          .map((img) => $(img).attr("src") || "")
          .filter(Boolean);

        return {
          id: postData.mf_story_key,
          photoUrls,
          author,
          content,
          publishTime,
          permalink: `https://www.facebook.com/groups/${groupId}/permalink/${postData.mf_story_key}`,
        };
      })
  );
}
86 changes: 86 additions & 0 deletions lib/types.ts
Original file line number Diff line number Diff line change
Expand Up @@ -312,3 +312,89 @@ interface FavData {
posttime: number;
count: number;
}

/**
 * Metadata object attached to a Facebook group post.
 *
 * The group crawler uses this as the type of the JSON it parses from a post
 * element's `data-ft` attribute. Field meanings are not documented here beyond
 * what the crawler relies on.
 */
export interface FacebookGroupPostData {
  qid: string;
  /** Used by the crawler as the post id and as the last segment of the permalink. */
  mf_story_key: string;
  top_level_post_id: string;
  tl_objid: string;
  content_owner_id_new: string;
  original_content_id: string;
  original_content_owner_id: string;
  page_id: string;
  src: number;
  photo_id: string;
  story_location: number;
  attached_story_attachment_style: string;
  filter: string;
  ott: string;
  sty: number;
  attached_story_type: string;
  attached_story_attachment_type: string;
  /** Insight entries keyed by id; the crawler looks up the entry for the group id. */
  page_insights: FacebookPageInsights;
  actrs: string;
  tds_flgs: number;
  ftmd_400706: string;
  tn: string;
}

/** Map from an id string to the insight entry recorded for it. */
interface FacebookPageInsights {
  [key: string]: PageInsight | GroupInsight;
}

/** Insight entry variant that nests its post context inside `attached_story`. */
export interface PageInsight {
  page_id: string;
  page_id_type: string;
  actor_id: string;
  attached_story: FacebookAttachedStory;
  dm: Dm;
  psn: string;
  role: number;
  sl: number;
  targets: TargetsItem[];
}

/** Story attached to a {@link PageInsight}; carries its own post context. */
interface FacebookAttachedStory {
  page_id: string;
  page_id_type: string;
  actor_id: string;
  dm: Dm;
  psn: string;
  post_context: FacebookPostContext;
  role: number;
  sl: number;
}

/** Share-related flags of an insight entry. */
interface Dm {
  isShare: number;
  originalPostOwnerID: number;
}

/**
 * Publication details of a story. Shared by attached stories and group
 * insights, which previously declared two identical copies of this shape.
 */
interface FacebookPostContext {
  object_fbtype: number;
  /**
   * The crawler multiplies this by 1000 before returning it — presumably a
   * Unix timestamp in seconds; confirm against real payloads.
   */
  publish_time: number;
  story_name: string;
  story_fbid: string[];
}

/** One entry of {@link PageInsight.targets}. */
interface TargetsItem {
  actor_id: string;
  page_id: string;
  post_id: string;
  role: number;
  share_id: number;
}

/** Insight entry variant that carries `post_context` directly; read by the group crawler. */
export interface GroupInsight {
  page_id: string;
  page_id_type: string;
  actor_id: string;
  dm: Dm;
  psn: string;
  post_context: FacebookPostContext;
  role: number;
  sl: number;
}
8 changes: 8 additions & 0 deletions package.json
Original file line number Diff line number Diff line change
Expand Up @@ -21,10 +21,14 @@
"@types/react": "18.0.27",
"@types/react-dom": "18.0.10",
"cheerio": "1.0.0-rc.12",
"dompurify": "^3.0.0",
"eslint": "8.33.0",
"eslint-config-next": "13.1.6",
"global-agent": "^3.0.0",
"got": "npm:got-cjs@^12.5.4",
"jsdom": "^21.1.0",
"jsx-slack": "^5.3.0",
"lodash": "^4.17.21",
"mongodb": "^5.0.0",
"next": "13.1.6",
"react": "18.2.0",
Expand All @@ -36,7 +40,11 @@
"devDependencies": {
"@faker-js/faker": "^7.6.0",
"@trunkio/launcher": "^1.2.3",
"@types/dompurify": "^2.4.0",
"@types/global-agent": "^2.1.1",
"@types/jest": "^29.4.0",
"@types/jsdom": "^21.1.0",
"@types/lodash": "^4.14.191",
"@types/tough-cookie": "^4.0.2",
"eslint-config-prettier": "^8.6.0",
"eslint-plugin-import": "^2.27.5",
Expand Down
Loading