Skip to content

Commit

Permalink
[#71] simplified patch for duplicate entries
Browse files Browse the repository at this point in the history
  • Loading branch information
j-h-s committed Feb 19, 2018
1 parent 5f4f7ad commit 9b726de
Showing 1 changed file with 11 additions and 105 deletions.
116 changes: 11 additions & 105 deletions src/AppBundle/Command/PatchCommand.php
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
use Symfony\Component\Console\Input\InputOption;
use Symfony\Component\Console\Output\OutputInterface;


use AppBundle\Entity\SocialMedia;
use Pirates\PapiInfo\Compile;

Expand All @@ -24,8 +23,6 @@ protected function configure() {
->addOption('stats', 'x', InputOption::VALUE_NONE, "Alter Twitter and Facebook stat codes for consistency")
->addOption('exturls', 'u', InputOption::VALUE_NONE, "Decode Facebook's external image urls")
->addOption('duplicates', 'd', InputOption::VALUE_NONE, "Scan the social media database for duplicate entries")
->addOption('party', 'y', InputOption::VALUE_OPTIONAL, "Choose a single party to patch, by code")
->addOption('resume', 'z', InputOption::VALUE_OPTIONAL, "Choose a party to resume patching from, if interrupted")
;
}

Expand All @@ -36,7 +33,6 @@ protected function execute(InputInterface $input, OutputInterface $output) {
$this->output = $output;
$this->log = $this->getContainer()->get('logger');


switch (true) { // add more options here
case $input->getOption('twitter'):
$this->log->notice("##### Patching Twitter images #####");
Expand All @@ -62,9 +58,7 @@ protected function execute(InputInterface $input, OutputInterface $output) {
break;
case $input->getOption('duplicates');
$this->log->notice('##### Patching duplicate social media posts #####');
$partyCode = $input->getOption('party');
$resumePoint = $input->getOption('resume');
$this->patchDuplicateEntries($partyCode, $resumePoint);
$this->patchDuplicateEntries();
break;
case $input->getOption('metadata');
$this->log->notice('##### Patching metadata charset #####');
Expand Down Expand Up @@ -95,108 +89,20 @@ public function getConfirmation() {
//
// Occasionally the scraper will bug out and add duplicate entries of the same social media posts.
// This patch locates those posts in the database and deletes the duplicates, leaving only the most recent copy.
// Only run this if you know there are duplicate entries; it is too slow to be worth running needlessly.
/////
// Removes duplicate social media rows, keeping the copy with the highest id.
//
// NOTE(review): this span is taken from a rendered diff whose +/- markers were
// lost. Two signature lines appear back to back, and the body mixes a per-party
// scan-and-delete loop with a single bulk DELETE statement. Which lines belong
// to which revision should be confirmed against the repository before editing.
//
// $partyCode   - optional party code; when null, all parties are processed.
// $resumePoint - optional party code; parties whose code compares lower than
//                its upper-cased value are skipped.
public function patchDuplicateEntries($partyCode = null, $resumePoint = null) {
public function patchDuplicateEntries() {
// Ask the operator to confirm before anything is touched.
$this->getConfirmation();
// Start time, used for the log message below and for the elapsed-time reports.
$time = new \DateTime('now');
$this->log->notice("# NOTE: This will take a long time. Go and make yourself a cup of tea. The time is now " . $time->format('H:i:s') . ".");
$this->log->info("Checking database... ");

// No party given: load every SocialMedia row just to count it and log a
// whole-run estimate, then fetch all parties. Otherwise fetch only the one party.
// NOTE(review): execute() obtains services via $this->getContainer(); this code
// reads $this->container directly - confirm that property exists.
if (is_null($partyCode)) {
$social = $this->em->getRepository('AppBundle:SocialMedia')->findAll();

$size = sizeof($social);
$this->log->info($size . " total posts found...");

// The two values below are in minutes; they are divided by 60 again when
// logged so that the message is expressed in hours.
$estLow = ($size / 4) / 60; // estimation of minutes based on 4 posts per second
$estHigh = ($size / 3) / 60; // estimation of minutes based on 3 posts per second
$this->log->info("Estimated time to process all posts... " . ceil($estLow / 60) . "-" . ceil($estHigh / 60) . " hours...");
$this->log->notice("Deleting duplicates...");

$parties = $this->container->get('DatabaseService')->getAllParties();
} else {
$parties = $this->container->get('DatabaseService')->getOneParty($partyCode);
}

foreach ($parties as $party) {
// Resume support: skip parties whose code compares lower than the
// upper-cased resume point.
if (!is_null($resumePoint) && ($party->getCode() < strtoupper($resumePoint))) {
$this->log->debug("Skipping " . $party->getCode());
continue;
}

$this->log->info("Getting posts from " . $party->getCode());

// Only posts newer than six months ago are examined.
$dateLimit = strtotime("-6 months");
$dateString = date('Y-m-d H:i:s', $dateLimit);

// Fetch this party's recent posts, highest id first.
// NOTE(review): the party code and date are interpolated into the query text
// with sprintf instead of being bound as parameters.
$posts = $this->em->createQueryBuilder()->select('p')
->from('AppBundle:SocialMedia', 'p')
->where(sprintf("p.code = '%s'", $party->getCode()))
->andWhere(sprintf("p.postTime > '%s'", $dateString))
->orderBy("p.id", 'DESC')
->getQuery()->getResult();

// $posts = $this->em->getRepository('AppBundle:SocialMedia')->findBy(['code' => $party->getCode()], ['id' => 'DESC']);

$size = sizeof($posts);
$this->log->info($size . " posts found from the past 6 months");

$estLow = ($size / 4) / 60; // estimation of minutes based on 4 posts per second
$estHigh = ($size / 3) / 60; // estimation of minutes based on 3 posts per second
$this->log->info("Estimated time to process " . $party->getCode() . "... " . ceil($estLow) . "-" . ceil($estHigh) . " minutes...");

$this->output->write("Processing...");
$postCount = 0;
foreach ($posts as $prime) {
$postCount++;
// A duplicate is a row that matches $prime on all six of these fields.
$terms = [
'code' => $party->getCode(),
'type' => $prime->getType(),
'subType' => $prime->getSubType(),
'postId' => $prime->getPostId(),
'postText' => $prime->getPostText(),
'postImage' => $prime->getPostImage()
];

$dupes = $this->em->getRepository('AppBundle:SocialMedia')->findBy($terms, ['id' => 'DESC']);

// Exactly one match means nothing to delete for this post
// (presumably the single match is $prime itself).
if (sizeof($dupes) == 1) {
// $this->log->debug($postCount . " - No duplicates found for " . $prime->getPostId());
$this->output->write($postCount . ",");
continue;
}
// Bulk statement: delete every row s1 for which another row s2 has the same
// postId and postImage and a higher id, so the highest-id copy is kept.
// NOTE(review): as shown here the assignment sits inside the per-post loop and
// after a `continue`; confirm where it really lives. Also, `=` does not match
// NULL to NULL in SQL, so if postImage can be NULL those duplicates would not
// be removed - verify against the schema.
$sql = "DELETE s1 FROM social_media s1
INNER JOIN social_media s2
WHERE s1.postId = s2.postId
AND s1.postImage = s2.postImage
AND s1.id < s2.id;";

$this->output->writeln("");
$this->log->notice($postCount . " - " . (sizeof($dupes) -1) . " duplicates found for " . $prime->getPostId());

// Print both rows side by side, then remove each match with a lower id than
// $prime. The entity manager is flushed after every single removal.
foreach ($dupes as $dupe) {
if ($dupe->getId() < $prime->getId()) {
$this->output->write("prime type = " . $prime->getType() . "-" . $prime->getSubType());
$this->output->write(", id = " . $prime->getId() . ", post id = " . $prime->getPostId());
$this->output->writeln(", time = " . $prime->getPostTime()->format('Y-m-d H:i:s'));
$this->output->write(" dupe type = " . $dupe->getType() . "-" . $dupe->getSubType());
$this->output->write(", id = " . $dupe->getId() . ", post id = " . $dupe->getPostId());
$this->output->writeln(", time = " . $dupe->getPostTime()->format('Y-m-d H:i:s'));
$this->output->writeln("prime text = " . $prime->getPostText());
$this->output->writeln(" dupe text = " . $dupe->getPostText());
$this->output->writeln("prime image = " . $prime->getPostImage());
$this->output->writeln(" dupe image = " . $dupe->getPostImage());
$this->log->info("Deleting...");
$this->em->remove($dupe);
$this->em->flush();
$this->log->info("Done...");
}
}
}

// Elapsed time since the start of the whole run, logged once per party.
$mid = new \Datetime('now');
$diff = $time->diff($mid);
$this->log->info("Done with " . $party->getCode() . " in " . $diff->format('%H:%I:%S'));
}

$end = new \Datetime('now');
$diff = $time->diff($end);
// NOTE(review): the format below starts with 'H' rather than '%H' (compare the
// per-party message above), so the hours are not substituted and a literal "H"
// is logged.
$this->log->notice("All done in " . $diff->format('H:%I:%S'));
// Run the bulk DELETE held in $sql through the entity manager's connection.
$query = $this->em->getConnection()->prepare($sql);
$query->execute();
$this->log->notice("Done.");
}


Expand Down

0 comments on commit 9b726de

Please sign in to comment.