Pages/groups scraper, to programmatically get the Columbia groups
ionox0 committed Nov 1, 2016
1 parent bbc18ff commit d150e2f
Showing 2,169 changed files with 166,762 additions and 152 deletions.
98 changes: 0 additions & 98 deletions scraper/data/output.json

This file was deleted.

2 changes: 1 addition & 1 deletion scraper/dep/get_events.py
@@ -51,7 +51,7 @@

 response = requests.post('https://www.facebook.com/api/graphqlbatch/', headers=headers, data=data)
 if response.content:
-    if len(response.content.split('\n')[0]) > 5: # yes we are hacking
+    if len(response.content.split('\n')[0]) > 5:
         print(response.content.split('\n')[0])
         json_data = json.loads(response.content.split('\n')[0])
     else:
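
The line changed here drops a "# yes we are hacking" comment; the hack in question is taking only the first newline-separated chunk of the graphqlbatch response, which appears to contain several JSON payloads per response. A minimal sketch of that parsing step in isolation, assuming a Python 2-style text response as in the surrounding code (the helper name is illustrative, not part of the file):

import json

def parse_first_chunk(raw_content):
    # Illustrative helper: keep only the first newline-separated payload,
    # mirroring the length guard used in get_events.py above.
    first_line = raw_content.split('\n')[0]
    if len(first_line) <= 5:
        return None
    try:
        return json.loads(first_line)
    except ValueError:  # first chunk was not valid JSON
        return None
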
4 changes: 2 additions & 2 deletions scraper/dep/scraper.js
@@ -56,8 +56,8 @@ function getEvents(url){
     })
     .replaceWith(function(){return this.data;});

-    //console.log(html);
-    savePage(html);
+    // console.log(html);
+    // savePage(html);
     console.log('found the events div? ', $('._3j40').length);

     $('._4dmd._4eok').each(function(){ // (Event div)
7 changes: 0 additions & 7 deletions scraper/dep/scraper_cheerio.js
@@ -68,19 +68,12 @@ app.get('/scrape', function(req, res){

         }

-        // To write to the system we will use the built in 'fs' library.
-        // In this example we will pass 3 parameters to the writeFile function
-        // Parameter 1 : output.json - this is what the created filename will be called
-        // Parameter 2 : JSON.stringify(json, null, 4) - the data to write, here we do an extra step by calling JSON.stringify to make our JSON easier to read
-        // Parameter 3 : callback function - a callback function to let us know the status of our function
-
         fs.writeFile('output.json', JSON.stringify(events, null, 4), function(err){

             console.log('File successfully written! - Check your project directory for the output.json file');

         });

-        // Finally, we'll just send out a message to the browser reminding you that this app does not have a UI.
         res.send('Check your console!')

     });
110 changes: 66 additions & 44 deletions scraper/get_events_2.py
@@ -21,48 +21,70 @@
fileDir = os.path.dirname(os.path.realpath('__file__'))
filename = os.path.join(fileDir, 'data/pages_with_ids.json')

-start_date = datetime.datetime.now().strftime("%Y-%m-%d")
-print(start_date)
-events = []
-i = 0
-with open(filename) as data_file:
-    pages_data = json.load(data_file)
-
-    # For every Columbia Page:
-    for page in pages_data:
-        pprint(page['node_id'])
-        pprint(page['url'])
-
-        response = requests.get('https://graph.facebook.com/v2.8/' + page['node_id'] + '/events?' + 'since=' + start_date + '&access_token=EAACEdEose0cBALFkO6rUmGl01Qt864YOXOWv67Lg2FgRQbsqeq8B3HnevZCFlTsW9jmuIX4nMedvZALi9DBLXj06O5K8b9AA3hazmm4UUAsXDcl5hZBEFHx6ZCiiEDFBi4peoF8Pxj7yyhPYqtmnv0x8m5JcGlBu7LfQtS6HywZDZD&debug=all&format=json&method=get&pretty=0&suppress_http_code=1&fields=name,place,start_time,description,cover,photos.limit(1),picture', headers=headers)
-        data = response.json()
-        print(response.json())
-
-        # Hack for skipping some nodes that have urls as ids:
-        if 'http' in page['node_id']:
-            continue
-
-        # Add all that Page's events:
-        for d in response.json()['data']:
-            event = {}
-            event['id'] = d['id']
-            event['title'] = d['name']
-            event['page_id'] = page['node_id']
-            event['group_url'] = page['url']
-            if 'description' in d:
-                event['description'] = d['description']
-            event['datetime'] = d['start_time']
-            if 'place' in d:
-                event['location'] = d['place']['name']
-            if 'cover' in d:
-                event['photo_url'] = d['cover']['source']
-
-            events.append(event)
-
-        i = i + 1
-        if i % 10 == 0:
-            print('\n\n\ndone with ' + str(i) + ' pages')
-            print('events count: ' + str(len(events)) + '\n\n\n')
-
-with open('events_data.json', 'w') as outfile:
-    json.dump(events, outfile)



+class FacebookScraper:
+
+    def get_groups(self):
+
+        return
+
+    def get_events(self, groups_file):
+        start_date = datetime.datetime.now().strftime("%Y-%m-%d")
+        events = []
+
+        with open(filename) as data_file:
+            pages_data = json.load(data_file)
+
+            # For every Columbia Page:
+            for i, page in enumerate(pages_data):
+                pprint(page['node_id'])
+                pprint(page['url'])
+
+                url = 'https://graph.facebook.com/v2.8/'
+                url = url + page['node_id']
+                url = url + '/events?'
+                url = url + 'since='
+                url = url + start_date
+                url = url + '&access_token=EAACEdEose0cBALFkO6rUmGl01Qt864YOXOWv67Lg2FgRQbsqeq8B3HnevZCFlTsW9jmuIX4nMedvZALi9DBLXj06O5K8b9AA3hazmm4UUAsXDcl5hZBEFHx6ZCiiEDFBi4peoF8Pxj7yyhPYqtmnv0x8m5JcGlBu7LfQtS6HywZDZD'
+                url = url + '&debug=all&format=json&method=get&pretty=0&suppress_http_code=1'
+                url = url + '&fields=name,place,start_time,description,cover,photos.limit(1),picture'
+
+                response = requests.get(url, headers=headers)
+                data = response.json()
+                print(response.json())
+
+                # Skipping some nodes that have urls instead of ids:
+                if 'http' in page['node_id']:
+                    continue
+
+                # Add all that Page's events:
+                for d in response.json()['data']:
+                    event = {}
+                    event['id'] = d['id']
+                    event['title'] = d['name']
+                    event['page_id'] = page['node_id']
+                    event['group_url'] = page['url']
+                    if 'description' in d:
+                        event['description'] = d['description']
+                    event['datetime'] = d['start_time']
+                    if 'place' in d:
+                        event['location'] = d['place']['name']
+                    if 'cover' in d:
+                        event['photo_url'] = d['cover']['source']
+
+                    events.append(event)
+
+                if i % 10 == 0:
+                    print('\n\n\nFinished ' + str(i) + ' pages')
+                    print('Events count: ' + str(len(events)) + '\n\n\n')
+
+        with open('events_data.json', 'w') as outfile:
+            json.dump(events, outfile)
+
+
+
+if __name__ == "__main__":
+    scraper = FacebookScraper()
+    scraper.get_events('dummy')
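
The new get_events method queries the Graph API v2.8 /{node-id}/events edge with a since date, an access token, and a fields list, assembling the query string by concatenation. An equivalent request can be sketched with the query passed through requests' params argument, which handles URL-encoding; the helper name and token placeholder below are illustrative, not part of the commit:

import requests

GRAPH_URL = 'https://graph.facebook.com/v2.8/'

def fetch_page_events(node_id, since, access_token):
    # Same endpoint and fields as get_events_2.py, with requests building
    # and encoding the query string from a dict.
    params = {
        'since': since,
        'access_token': access_token,  # supply a real token here
        'fields': 'name,place,start_time,description,cover,photos.limit(1),picture',
    }
    response = requests.get(GRAPH_URL + node_id + '/events', params=params)
    return response.json().get('data', [])

Either form issues the same request; the params dict just avoids hand-escaping the fields list and keeps the token out of the hard-coded URL string.
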
125 changes: 125 additions & 0 deletions scraper/groups_scraper.js
@@ -0,0 +1,125 @@
var fs = require('fs');
var page = require('webpage').create();



page.open("http://www.facebook.com/login.php", function(status) {

    if (status === "success") {

        page.onConsoleMessage = function(msg, lineNum, sourceId) {
            console.log('CONSOLE: ' + msg + ' (from line #' + lineNum + ' in "' + sourceId + '")');
        };

        page.evaluate(function() {
            document.getElementById("email").value = "[email protected]";
            document.getElementById("pass").value = "";
            document.getElementById("loginbutton").click();
        });

        setTimeout(function() {
            page.evaluate(function() {
                console.log('At Homepage');
            });
            page.render("home_page.png");

            routeToSearch();
        }, 2000);

        function routeToSearch() {
            page.evaluate(function() {
                document.getElementsByClassName("_1frb")[0].value = "columbia";
                document.getElementsByClassName("_42ft _4jy0 _4w98 _4jy3 _517h _51sy")[0].click();
                document.querySelector('button').click();
            });

            setTimeout(function() {
                page.evaluate(function() {
                    console.log('At Search Page');
                });
                page.render("search_page.png");

                routeToPages();
            }, 5000)
        }

        function routeToPages() {
            page.evaluate(function() {

                function eventFire(el, etype){
                    if (el.fireEvent) {
                        el.fireEvent('on' + etype);
                    } else {
                        var evObj = document.createEvent('Events');
                        evObj.initEvent(etype, true, false);
                        el.dispatchEvent(evObj);
                    }
                }

                eventFire(document.querySelectorAll("._4xjz")[6], 'click');
            });

            setTimeout(function() {
                page.evaluate(function() {
                    console.log('At Pages Page');
                });
                page.render("pages_page.png");

                scrollLoop();
            }, 5000)
        }

        var i = 1;
        function scrollLoop() {
            if (i === 10) {
                grabGroups();
            }
            page.evaluate(function() {
                window.document.body.scrollTop = document.body.scrollHeight;
            });

            i = i + 1;

            setTimeout(function() {
                page.evaluate(function() {
                    console.log('Scrolling iteration...');
                });
                page.render("pages_page_scroll_" + i + ".png");

                scrollLoop();
            }, 5000)
        }

        function grabGroups() {
            var pages = page.evaluate(function() {
                var pages_els = document.querySelectorAll("._5und");
                var pages = [];
                var i = 0;

                for (var i = 0; i < pages_els.length; i++) {
                    pages.push({
                        group_id: JSON.parse(pages_els[i].getAttribute('data-bt')).id,
                        group_name: pages_els[i].querySelector('._5d-5').innerHTML,
                        group_url: pages_els[i].querySelector('a').href
                    })
                }

                console.log(pages_els);
                console.log(pages_els.length);
                return JSON.stringify(pages)
            });

            writeResults(pages);
        }

        function writeResults(pages) {
            console.log(pages);
            var path = 'pages_data.json';
            fs.write(path, pages, 'w');

            phantom.exit();
        }

    }

});
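
The PhantomJS script above logs in, searches for "columbia", clicks through to what appears to be the Pages filter of the results, scrolls the page about ten times to trigger lazy loading, and writes the collected pages to pages_data.json as a JSON array of {group_id, group_name, group_url} objects. A small sketch of reading that file back in Python and reshaping it into the node_id / url form that get_events_2.py expects; this glue step is an assumption and is not shown anywhere in the commit:

import json

# Illustrative glue: map the PhantomJS output onto the keys that
# scraper/get_events_2.py reads from data/pages_with_ids.json.
with open('pages_data.json') as f:
    pages = json.load(f)

pages_with_ids = [
    {'node_id': p['group_id'], 'url': p['group_url'], 'name': p['group_name']}
    for p in pages
]

with open('data/pages_with_ids.json', 'w') as f:
    json.dump(pages_with_ids, f, indent=4)
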
Binary file added scraper/home_page.png
1 change: 1 addition & 0 deletions scraper/node_modules/.bin/phantomjs

