Skip to content

Commit

Permalink
[#71, #72] Added duplicate-ID checks to the scraping loops to avoid duplicate entries in the database
Browse files: browse the repository at this point in the history
  • Loading branch information
j-h-s committed Jan 4, 2018
1 parent 774e189 commit fa6596e
Show file tree
Hide file tree
Showing 6 changed files with 122 additions and 14 deletions.
2 changes: 1 addition & 1 deletion app/config/config_prod.yml
Original file line number Diff line number Diff line change
Expand Up @@ -31,4 +31,4 @@ monolog:
sentry:
type: raven
dsn: "%sentry_dsn%"
level: error
level: warning
10 changes: 5 additions & 5 deletions src/AppBundle/Command/ScraperCommand.php
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ protected function execute(InputInterface $input, OutputInterface $output)

$socialNetworks = $party->getSocialNetworks();
if (empty($socialNetworks)) {
$this->log->warning("- Social Network information missing for " . $partyCode);
$this->log->notice(" - Social Network information missing for " . $partyCode);
continue;
}

Expand Down Expand Up @@ -135,7 +135,7 @@ public function processParty($partyCode, $socialNetworks) {
public function scrapeFacebook($partyCode, $socialNetworks)
{
if (empty($socialNetworks['facebook']) || empty($socialNetworks['facebook']['username'])) {
$this->log->warning(" - Facebook data not found for " . $partyCode);
$this->log->notice(" - Facebook data not found for " . $partyCode);
return false;
}

Expand All @@ -162,7 +162,7 @@ public function scrapeFacebook($partyCode, $socialNetworks)
public function scrapeTwitter($partyCode, $socialNetworks)
{
if (empty($socialNetworks['twitter']) || empty($socialNetworks['twitter']['username'])) {
$this->log->warning(" - Twitter data not found for " . $partyCode);
$this->log->notice(" - Twitter data not found for " . $partyCode);
return false;
}

Expand All @@ -189,7 +189,7 @@ public function scrapeTwitter($partyCode, $socialNetworks)
public function scrapeGooglePlus($partyCode, $socialNetworks)
{
if (empty($socialNetworks['googlePlus'])) {
$this->log->warning(" - Google+ data not found for " . $partyCode);
$this->log->notice(" - Google+ data not found for " . $partyCode);
return false;
}

Expand All @@ -215,7 +215,7 @@ public function scrapeGooglePlus($partyCode, $socialNetworks)
public function scrapeYoutube($partyCode, $socialNetworks)
{
if (empty($socialNetworks['youtube'])) {
$this->log->warning(" - Youtube data not found for " . $partyCode);
$this->log->notice(" - Youtube data not found for " . $partyCode);
return false;
}

Expand Down
39 changes: 39 additions & 0 deletions src/AppBundle/Service/FacebookService.php
Original file line number Diff line number Diff line change
Expand Up @@ -111,11 +111,20 @@ public function getPosts() {
$pageCount = 0;
$txtCount = 0;
$vidCount = 0;
$loopCount = 0;
$temp = [];

do {
$this->log->debug(" + Page " . $pageCount);

foreach ($fdPosts as $key => $post) {
if (in_array($post->getField('id'), $temp, true)) {
// if post was already scraped this session
$loopCount++;
continue;
}
$temp[] = $post->getField('id');

$type = $post->getField('type');
// types = 'status', 'link', 'photo', 'video', 'event'

Expand All @@ -138,6 +147,10 @@ public function getPosts() {
} while ($timeCheck > $timeLimit && $fdPosts = $this->fb->next($fdPosts));
// while next page is not null and within our time limit

if ($loopCount > 0) {
$this->log->warning(" - Facebook post scraping for " . $this->partyCode . " looped " . $loopCount . " times");
}

$out['posts'] = $txtCount;
$out['videos'] = $vidCount;
$this->log->info(" + " . $txtCount . " text posts and " . $vidCount . " videos since " . date('d/m/Y', $timeCheck) . " processed");
Expand Down Expand Up @@ -214,6 +227,8 @@ public function getImages() {

$pageCount = 0;
$imgCount = 0;
$loopCount = 0;
$temp = [];

foreach ($fdAlbums as $key => $album) {
$photoCount[] = $album->getField('photo_count');
Expand All @@ -226,6 +241,13 @@ public function getImages() {
do {
$this->log->debug(" + Page " . $pageCount);
foreach ($fdPhotos as $key => $photo) {
if (in_array($photo->getField('picture'), $temp, true)) {
// if image was already scraped this session
$loopCount++;
continue;
}
$temp[] = $photo->getField('picture');

$this->getImageDetails($photo, $album);
$imgCount++;
}
Expand All @@ -237,6 +259,10 @@ public function getImages() {
// while next page is not null and within our time limit
}

if ($loopCount > 0) {
$this->log->warning(" - Facebook image scraping for " . $this->partyCode . " looped " . $loopCount . " times");
}

$out['imageCount'] = array_sum($photoCount);
$out['images'] = $imgCount;
$this->log->info(" + " . $out['imageCount'] . " images found, " . $imgCount . " since " . date('d/m/Y', $timeCheck) . " processed");
Expand Down Expand Up @@ -313,10 +339,19 @@ public function getEvents() {

$pageCount = 0;
$eveCount = 0;
$loopCount = 0;
$temp = [];

do { // process current page of results
$this->log->debug(" + Page " . $pageCount);
foreach ($fdEvents as $key => $event) {
if (in_array($event->getField('id'), $temp, true)) {
// if event was already scraped this session
$loopCount++;
continue;
}
$temp[] = $event->getField('id');

$this->getEventDetails($event);
$eveCount++;
}
Expand All @@ -327,6 +362,10 @@ public function getEvents() {
} while ($timeCheck > $timeLimit && $fdEvents = $this->fb->next($fdEvents));
// while next page is not null and within our time limit

if ($loopCount > 0) {
$this->log->warning(" - Facebook event scraping for " . $this->partyCode . " looped " . $loopCount . " times");
}

$out['eventCount'] = $eveCount;
$out['events'] = true;
$this->log->info(" + " . $out['eventCount'] . " events found and processed");
Expand Down
53 changes: 48 additions & 5 deletions src/AppBundle/Service/FbStatService.php
Original file line number Diff line number Diff line change
Expand Up @@ -144,7 +144,7 @@ public function getCover($graphNode) {
* @return int
*/
public function getPostCount() {
$requestFields = 'posts{created_time}';
$requestFields = 'posts{id,created_time}';
$graphNode = $this->connect->getFbGraphNode($this->fbPageId, $requestFields);

if (empty($graphNode) || is_null($graphNode->getField('posts'))) {
Expand All @@ -162,6 +162,7 @@ public function getPostCount() {
$this->log->info(" + Counting text posts...");
$oldCount = $this->db->getStatLimit($this->partyCode, 'fb', 'T');
$pageCount = 0;
$loopCount = 0;
$temp = [];

do {
Expand All @@ -170,15 +171,25 @@ public function getPostCount() {
foreach ($fdPcount as $key => $post) {
$timeCheck = $post->getField('created_time')->getTimestamp(); // check time of last scraped post

if (in_array($post->getField('id'), $temp, true)) {
// if post was already counted this session
$loopCount++;
continue;
}

if ($timeCheck > $oldCount['time']) {
$temp['posts'][] = ['time' => $timeCheck];
$temp['posts'][] = $post->getField('id');
}
}

$pageCount++;

} while ($timeCheck > $oldCount['time'] && $fdPcount = $this->fb->next($fdPcount)); // while next page is not null

if ($loopCount > 0) {
$this->log->warning(" - Facebook post counting for " . $this->partyCode . " looped " . $loopCount . " times");
}

$postCount = isset($temp['posts']) ? count($temp['posts']) : 0;
$totalCount = $oldCount['value'] + $postCount;

Expand All @@ -205,7 +216,7 @@ public function getPostCount() {
* @return int
*/
public function getImageCount() {
$requestFields = 'albums{count}';
$requestFields = 'albums{id,count}';
$graphNode = $this->connect->getFbGraphNode($this->fbPageId, $requestFields);

if (empty($graphNode) || is_null($graphNode->getField('albums'))) {
Expand All @@ -217,14 +228,26 @@ public function getImageCount() {
$this->log->info(" + Counting images...");
$fdAlbums = $graphNode->getField('albums');
$pageCount = 0;
$loopCount = 0;
$photoCount = [];

foreach ($fdAlbums as $key => $album) {
if (in_array($album->getField('id'), $temp, true)) {
// if album was already counted this session
$loopCount++;
continue;
}
$temp[] = $album->getField('id');

$this->log->debug(" + Page " . $pageCount);
$photoCount[] = $album->getField('count');
$pageCount++;
}

if ($loopCount > 0) {
$this->log->warning(" - Facebook image counting for " . $this->partyCode . " looped " . $loopCount . " times");
}

$imageCount = array_sum($photoCount);
if ($imageCount == 0) {
return false;
Expand Down Expand Up @@ -260,16 +283,26 @@ public function getVideoCount() {
$this->log->info(" + Counting videos...");
$fdVcount = $graphNode->getField('videos');
$pageCount = 0;
$loopCount = 0;
$temp = [];

do {
$this->log->debug(" + Page " . $pageCount);
foreach ($fdVcount as $key => $post) {
$temp['videos'][] = ['id' => $post->getField('id')];
if (in_array($post->getField('id'), $temp, true)) {
// if video was already counted this session
$loopCount++;
continue;
}
$temp['videos'][] = $post->getField('id');
}
$pageCount++;
} while ($fdVcount = $this->fb->next($fdVcount)); // while next page is not null

if ($loopCount > 0) {
$this->log->warning(" - Facebook video counting for " . $this->partyCode . " looped " . $loopCount . " times");
}

$videoCount = isset($temp['videos']) ? count($temp['videos']) : 0;
if ($videoCount == 0) {
return false;
Expand Down Expand Up @@ -305,16 +338,26 @@ public function getEventCount() {
$this->log->info(" + Counting events...");
$fdEvents = $graphNode->getField('events');
$pageCount = 0;
$loopCount = 0;
$temp = [];

do {
$this->log->debug(" + Page " . $pageCount);
foreach ($fdEvents as $key => $event) {
$temp['events'][] = ['id' => $event->getField('id')];
if (in_array($event->getField('id'), $temp, true)) {
// if event was already counted this session
$loopCount++;
continue;
}
$temp['events'][] = $event->getField('id');
}
$pageCount++;
} while ($fdEvents = $this->fb->next($fdEvents)); // while next page is not null

if ($loopCount > 0) {
$this->log->warning(" - Facebook event counting for " . $this->partyCode . " looped " . $loopCount . " times");
}

$eventCount = isset($temp['events']) ? count($temp['events']) : 0;
if ($eventCount == 0) {
return false;
Expand Down
19 changes: 16 additions & 3 deletions src/AppBundle/Service/GoogleService.php
Original file line number Diff line number Diff line change
Expand Up @@ -58,9 +58,11 @@ public function getYoutubeData($partyCode, $googleId, $scrapeData = false) {
return $out;
}

$playlist = $data->contentDetails->relatedPlaylists->uploads;
$videos = $this->yt->getPlaylistItemsByPlaylistId($playlist);
$vidCount = 0;
$playlist = $data->contentDetails->relatedPlaylists->uploads;
$videos = $this->yt->getPlaylistItemsByPlaylistId($playlist);
$vidCount = 0;
$loopCount = 0;
$temp = [];

if (empty($videos)) {
$this->log->notice(" - Youtube videos not found for " . $this->partyCode);
Expand All @@ -69,10 +71,21 @@ public function getYoutubeData($partyCode, $googleId, $scrapeData = false) {

$this->log->info(" + Getting video details...");
foreach ($videos as $key => $vid) {
if (in_array($vid->snippet->resourceId->videoId, $temp, true)) {
// if video was already scraped this session
$loopCount++;
continue;
}
$temp[] = $vid->snippet->resourceId->videoId;

$this->getVideoDetails($vid);
$vidCount++;
}

if ($loopCount > 0) {
$this->log->warning(" - Youtube video scraping for " . $this->partyCode . " looped " . $loopCount . " times");
}

$this->log->info(" + " . $vidCount . " videos found and processed");
$out['videos'] = $vidCount;

Expand Down
13 changes: 13 additions & 0 deletions src/AppBundle/Service/TwitterService.php
Original file line number Diff line number Diff line change
Expand Up @@ -154,11 +154,20 @@ public function getTweets() {
$txtCount = 0;
$imgCount = 0;
$vidCount = 0;
$loopCount = 0;
$temp = [];

do { // process current page of results
$this->log->debug(" + Page " . $pageCount);

foreach($tweetData as $item) {
if (in_array($item->id, $temp, true)) {
// if tweet was already scraped this session
$loopCount++;
continue;
}
$temp[] = $item->id;

$twTime = \DateTime::createFromFormat('D M d H:i:s P Y', $item->created_at);
// original string e.g. 'Mon Sep 08 15:19:11 +0000 2014'

Expand All @@ -182,6 +191,10 @@ public function getTweets() {
} while ($timeCheck > $timeLimit && $pageCount < 100);
// while tweet times are more recent than the limit as set above, up to 5000

if ($loopCount > 0) {
$this->log->warning(" - Tweet scraping for " . $this->partyCode . " looped " . $loopCount . " times");
}

$out['posts'] = $txtCount;
$out['images'] = $imgCount;
$out['videos'] = $vidCount;
Expand Down

0 comments on commit fa6596e

Please sign in to comment.