diff --git a/.gitignore b/.gitignore index aa5166f..239fc0d 100644 --- a/.gitignore +++ b/.gitignore @@ -4,7 +4,6 @@ config/* services/mailer/config.json npm-debug.log _* -secret output #Allow diff --git a/Graph.js b/Graph.js new file mode 100644 index 0000000..64c5226 --- /dev/null +++ b/Graph.js @@ -0,0 +1,136 @@ +'use strict'; + +let co = require('co'); +let fs = require('fs'); +let denodeify = require('denodeify'); + +let unlink = denodeify(fs.unlink); +let writeFile = denodeify(fs.writeFile); + +let User = require('./User'); + +class Graph { + constructor() { + this._users = []; + this._links = []; + } + + get users() { + return this._users; + } + + get links() { + //get connections of every user (and not duplicate, or yes?) + let links = []; + let users = this.users; + //create connections from the user and her contacts + for(let user of users) { + let contacts = user.contacts; + for(let contact of contacts) { + links.push([user.username, contact.username]); + } + } + return links; + } + + //returns boolean indicating whether user is already in the graph._users + hasUser (username) { + for(let usr of this._users) { + if(usr.username === username) { + return true; + } + } + return false; + } + + //async + + //filling the graph with the data + scrape(usernames) { + return co.call(this, function * () { + //TODO there may be also ids stored for efficiency + + let currentLevel = usernames; + let nextLevel; + while(currentLevel.length > 0) { + nextLevel = []; + for(let username of currentLevel) { + let user = new User(username); + + //get connections of each user from nextLevel + try { + yield user.scrape(); + + //save the user to this._users; + + this._users.push(user); + console.log(`${this.users.length} ${user.username} ${user.contacts.length}`); + + //fill first-time users to _nextLevel; + let contacts = user.contacts; + for(let contact of contacts) { + let isNotAnywhereYet = nextLevel.indexOf(contact.username) === -1 && currentLevel.indexOf(contact.username) === -1 && this.hasUser(contact.username) === false; + if(isNotAnywhereYet) { + nextLevel.push(contact.username); + } + } + } + catch(e) { + //there are some inconsistencies in the database - some contacts don't exist + if(e.status === 404) {} + else{ + throw e; + } + } + } + currentLevel = nextLevel; + } + }); + } + + outputGraph (filename) { + return co.call(this, function * () { + var path = `./output/${filename || 'graph'}.gdf`; + + //outputString will be eventually written to the gdf file + //first the node header + var outputString = 'nodedef>name VARCHAR,label VARCHAR\n'; + //write users to the outputString; + let users = this.users; + for(let user of users) { + outputString += `${user.username},${user.username}\n`; + } + + //write edge header to the outputString + outputString += 'edgedef>node1 VARCHAR,node2 VARCHAR\n' + //write links to the outputString + let links = this.links; + for(let link of links) { + outputString += `${link[0]},${link[1]}\n`; + } + + //write the data to the file + yield unlink(path); + yield writeFile(path, outputString); + }); + } + + outputUsers (filename) { + return co.call(this, function * () { + var path = `./output/${filename || 'users'}.txt`; + + //get usernames from the list of user objects + var usernames = []; + var users = this.users; + for(let user of users) { + usernames.push(user.username); + } + + //write to output file + yield unlink(path); + yield writeFile(path, usernames.join('\n')+'\n'); + }); + } +} + +module.exports = Graph; diff --git a/README.md b/README.md index 2aeaa17..cc00e80 100644 --- a/README.md +++ b/README.md @@ -7,9 +7,9 @@ Users without connection to the main network will not be found. ##Prerequisities -nodejs installed +Node.js supporting ES2016 installed -node package manager (npm) installed +Node Package Manager (npm) installed ##Installation @@ -17,30 +17,20 @@ node package manager (npm) installed 2. run the following commands in terminal in the folder of the repository npm install - mkdir secret - mkdir output - touch output/users.txt - touch output/graph.gdf - -3. create a file secret/login.json with following content - - { - "username": "[valid trustroots username]", - "password": "[valid password]" - } - + npm run prepare + ##Usage - run `npm start` -- watch the data scraper work. When it finishes, you'll find list of found users in `output/users.txt` and a graph in `.gdf` format in `output/graph.gdf` +- provide your login data +- provide a username where the scraper should start (defaults to your username) +- watch the data scraper work. When it finishes, you'll find list of connected users in `output/users.txt` and a graph in `.gdf` format in `output/graph.gdf` - play with the data (i.e. with [gephi](https://gephi.org/)) ##To do - time development of the network - shortest path between users -- easier installation -- comments and nicer code ##License MIT diff --git a/TR.js b/TR.js new file mode 100644 index 0000000..607076c --- /dev/null +++ b/TR.js @@ -0,0 +1,96 @@ +'use strict'; + +let denodeify = require('denodeify'); +let request = require('request').defaults({jar: true}); +let co = require('co'); + +let get = denodeify(request.get, function (err, resp, body) { + return [err, body]; +}); + +let post = denodeify(request.post, function (err, resp, body) { + return [err, body]; +}); + +class TR { + //async + static login(username, password) { + return co(function * () { + let body = yield post({url: 'https://www.trustroots.org/api/auth/signin', form:{username: username, password: password}}); + let jsonBody = JSON.parse(body); + let isNotLoggedIn = jsonBody && jsonBody.hasOwnProperty('message') && jsonBody.message === 'Unknown user or invalid password'; + let isLoggedIn = jsonBody && jsonBody.hasOwnProperty('_id'); + + if(isNotLoggedIn) { + let e = new Error('login not successful'); + e.status = 403; + throw e; + } + else if (isLoggedIn) { + return; + } + else { + let e = new Error(jsonBody.message || 'other error'); + e.status = 500; + throw e; + } + }); + } + + //async + static logout() { + + } + + //show all existing hosts + static hosts() { + } + + static user(username) { + return co(function * () { + let body = yield get({url: `https://www.trustroots.org/api/users/${username}`}); + let jsonBody = JSON.parse(body); + if(jsonBody && jsonBody.hasOwnProperty('message') && jsonBody.message === 'Not found.') { + let e = new Error('Not found.'); + e.status = 404; + throw e; + } + return ({ + id: jsonBody._id, + username: jsonBody.username, + name: jsonBody.displayName, + created: new Date(jsonBody.created), + gender: jsonBody.gender + }); + }); + } + + static contacts(id) { + return co(function * () { + let body = yield get({url: `https://www.trustroots.org/api/contacts/${id}`}); + let jsonBody = JSON.parse(body); + + let contacts = []; + for(let rawContact of jsonBody){ + //include only confirmed and not deleted + if(rawContact && rawContact.confirmed === true) { + let users = rawContact.users; + if(users.length === 2) { //protection from deleted contact + let contact = users[0]._id === id ? users[1] : users[0]; + delete rawContact.users; + let finalContact = { + id: contact._id, + username: contact.username, + name: contact.displayName, + created: new Date(rawContact.created) + } + contacts.push(finalContact); + } + } + } + return contacts; + }); + } +} + +module.exports = TR; diff --git a/User.js b/User.js new file mode 100644 index 0000000..e421272 --- /dev/null +++ b/User.js @@ -0,0 +1,41 @@ +'use strict'; + +let co = require('co'); + +let TR = require('./TR'); +//object User represents each user +//async User.scrape(): read User data +class User { + constructor (username) { + this._username = username; + this._contacts = []; + } + + get username () { + return this._username; + } + + get contacts () { + let contacts = []; + for(let contact of this._contacts) { + contacts.push({ + username: contact.username, + id: contact.id + }); + } + return contacts; + } + + //async + scrape () { + return co.call(this, function * () { + let user = yield TR.user(this._username); + let contacts = yield TR.contacts(user.id); + + this._id = user.id; + this._contacts = contacts; + }); + } +} + +module.exports = User; diff --git a/example/2016-09-20.png b/example/2016-09-20.png new file mode 100644 index 0000000..f07c676 Binary files /dev/null and b/example/2016-09-20.png differ diff --git a/functions.js b/functions.js deleted file mode 100644 index 8ccdea9..0000000 --- a/functions.js +++ /dev/null @@ -1,99 +0,0 @@ -'use strict'; - -var request = require('request').defaults({jar: true}); -var fs = require('fs'); - -function outputGraph(nodes, edges, filename) { - return new Promise(function (resolve, reject) { - var wholeName = (filename || 'graph')+'.gdf'; - var wholePath = './output/'+wholeName; - - var gdfString = 'nodedef>name VARCHAR,label VARCHAR\n'; - for(let un of nodes) { - gdfString+=un+','+un+'\n'; - } - - gdfString += 'edgedef>node1 VARCHAR,node2 VARCHAR\n' - - for(let cn of edges) { - gdfString += cn[0]+','+cn[1]+'\n'; - } - - fs.unlink(wholePath, function (err) { - if(err) return reject(err); - fs.writeFile(wholePath, gdfString, function (err) { - if(err) return reject(err); - return resolve(null); - }); - }); - }); -} - -function outputUsers(users, filename) { - return new Promise(function (resolve, reject) { - var filename = filename || 'users'; - var wholeFilename = filename + '.txt'; - var wholePath = './output/' + wholeFilename; - fs.unlink(wholePath, function (err) { - if(err) return reject(err); - fs.writeFile(wholePath, users.join('\n'), function (err) { - if(err) return reject(err); - return resolve(null); - }); - }); - }); -} - -function getUser(username) { -// console.log('search user', username); - return new Promise(function (resolve, reject) { - try{ - request.get({url: 'https://www.trustroots.org/api/users/'+username}, function (err, resp, body) { - // console.log(resp); - if(err) return reject(err); - var jb = JSON.parse(body); - resolve({_id: jb._id, username: jb.username}); - }); - } - catch(err) { - reject(err); - } - }); -} - -function getConnections(id) { -// console.log('getting connections of', id) - return new Promise(function (resolve, reject) { - try { - request.get({url: 'https://www.trustroots.org/api/contacts/'+id}, function (err, resp, body) { - if(err) return reject(err); - var jb = JSON.parse(body); - var cts = []; - - for(let cnt of jb){ - //include only confirmed contacts - if(cnt.confirmed === true || cnt.confirmed === undefined) { - let us = cnt.users; - let usr = cnt.users[0]._id === id ? cnt.users[1] : cnt.users[0]; - cts.push(usr); - } - } - resolve(cts); - }); - } - catch(err) { - reject(err); - } - }); -} - -module.exports = { - output: { - graph: outputGraph, - users: outputUsers - }, - get: { - user: getUser, - connections: getConnections - } -}; diff --git a/index.js b/index.js index 288025e..6558349 100644 --- a/index.js +++ b/index.js @@ -1,103 +1,75 @@ 'use strict'; -var fs = require('fs'); -var request = require('request').defaults({jar: true}); -var functions = require('./functions'); -var output = functions.output;//graph, users -var get = functions.get; //user, connections -var login = require('./secret/login'); +let TR = require('./TR'); +let Graph = require('./Graph'); +let co = require('co'); +let prompt = require('prompt'); +let denodeify = require('denodeify'); +let promptGet = denodeify(prompt.get); -request.post({url: 'https://www.trustroots.org/api/auth/signin', form:{username: login.username, password: login.password}}, function (err, resp, body) { - //initial username to start the search from - //change according to your taste - var checkuser = 'mrkvon'; - var nodes = [checkuser]; - var links = []; +return co(function * () { + var loginSchema = { + properties: { + username: { + required: true + }, + password: { + hidden: true, + replace: '*' + } + } + }; - return scrape([checkuser], nodes, links)//, paths, connections) - .then(null, function (err) { - if(err) console.org(err); - }); -}); + var startSchema = { + properties: { + username: { + } + } + } + + // + // prompt for username and password + // + console.log('\n**************************************************\n') + console.log('\tWelcome to the Trustroots scraper!'); + console.log('\n**************************************************\n\n') + console.log('Write your login data for trustroots.org:'); + prompt.start(); + let result = yield promptGet(loginSchema); + let username = result.username; + let password = result.password -//this function should get users connected to previous row of users -function scrape(usersToResearch, foundUsers, links) { - //console.log(usersToResearch, 'scrape!'); - //console.log(foundUsers, 'found users'); + yield TR.login(username, password); + console.log('successfully logged in as', username); - var contactUsernames = []; - var promiseChain = Promise.resolve(); - for(let usrnm of usersToResearch) { - promiseChain = promiseChain.then(researchFunction(usrnm, contactUsernames, links)); - + console.log(`\n\nWrite the username from which you wish to start crawling:\ndefault: ${username}`); + prompt.start(); + let startResult = yield promptGet(startSchema); + let startUsername = startResult.username || username; - } + console.log('\nscraping started. be patient. it can take a while (around 1000 connected users).\n'); + console.log('\noutput:\ncount username count contacts\n'); - return promiseChain - .then(function () { - var newLevel = []; - //console.log(contactArrays); - //if users were not scraped yet - //add them to usersToScrape - //and add the to newLevel - //console.log(ca); - //console.log(contactUsernames); - for(let c of contactUsernames) { - if(foundUsers.indexOf(c) === -1) { - foundUsers.push(c); - newLevel.push(c); - console.log(c); - } - else{ - console.log('***', c); - } - } - //console.log(newLevel); - return newLevel; - }) - .then(function (nl) { - //console.log(nl); - console.log(foundUsers.length, '*******'); - console.log(nl.length, 'next level'); - if(nl.length > 0) { - scrape(nl, foundUsers, links); - } - else { - console.log(foundUsers.length); - console.log('writing graph'); - return output.graph(foundUsers.sort(), links, 'graph') - .then(function () { - console.log('writing users') - return output.users(foundUsers.sort(), 'users'); - }) - .then(function () { - console.log('finished!'); - }) - .then(null, function (err){ - console.log(err); - }); - } - }); -} + var graph = new Graph(); + yield graph.scrape([startUsername]); -//this returns a function to put to .then() to create a for cycle. -function researchFunction(username, contactUsernames, links) { - return function () { - return get.user(username) - .then(function (_usr) { - return get.connections(_usr._id); - }) - .then(function (cts) { - for (let contact of cts) { - if(contact) { - links.push([contact.username, username]); - if(contactUsernames.indexOf(contact.username) === -1) - contactUsernames.push(contact.username); - } - } - return; - }) - .then(null, function (err) {console.log(err, username);}); - }; -} + //write the data to files + yield graph.outputUsers(); + yield graph.outputGraph(); + + console.log('\n\n********************************************************\n'); + console.log('\tfinished successfully!\n'); + console.log('\tyou will find the output in'); + console.log('\t./output/graph.gdt'); + console.log('\t./output/users.txt\n'); + console.log('\tyou can try Gephi to visualise them\n'); + console.log('\tGoodbye! ^_^\n'); + console.log('********************************************************\n'); +}) + .catch(function (e) { + if(e.status === 403) { + console.log('login not successful.'); + } + else console.error(e); + }); diff --git a/package.json b/package.json index 7338156..3c28cf1 100644 --- a/package.json +++ b/package.json @@ -1,15 +1,28 @@ { - "name": "scraper", - "version": "1.0.0", + "name": "tr_scraper", + "version": "1.1.0", "description": "information scraper for trustroots", "main": "index.js", + "dependencies": { + "co": "^4.6.0", + "denodeify": "^1.2.1", + "prompt": "^1.0.0", + "request": "^2.75.0" + }, + "devDependencies": {}, "scripts": { "test": "echo \"Error: no test specified\" && exit 1", - "start": "node ./index.js" + "start": "node ./index.js", + "prepare": ". ./prepare.sh" + }, + "repository": { + "type": "git", + "url": "git+https://github.com/mrkvon/trustroots-graph.git" }, - "author": "", + "author": "mrkvon", "license": "MIT", - "dependencies": { - "request": "latest" - } + "bugs": { + "url": "https://github.com/mrkvon/trustroots-graph/issues" + }, + "homepage": "https://github.com/mrkvon/trustroots-graph#readme" } diff --git a/prepare.sh b/prepare.sh new file mode 100755 index 0000000..feab99a --- /dev/null +++ b/prepare.sh @@ -0,0 +1,5 @@ +#!/usr/bin/env bash + +mkdir output +touch output/users.txt +touch output/graph.gdf