LB-1382: Place private dumps into wholly separate file system paths
To avoid accidentally leaking private data, make the separation between public
and private dumps more explicit. Such mistakes often happen when running scripts
by hand, so add a few more safeguards to the dump-running code as well.
amCap1712 committed Dec 27, 2023
1 parent 60ab3c3 commit 7d8c087
Showing 7 changed files with 196 additions and 54 deletions.
3 changes: 3 additions & 0 deletions admin/config.sh.ctmpl
@@ -5,6 +5,7 @@

DUMP_THREADS="{{template "KEY" "dump_threads"}}"
DUMP_BASE_DIR="{{template "KEY" "base_dir"}}"
PRIVATE_DUMP_BASE_DIR="{{template "KEY" "private_base_dir"}}"

# Where to back things up to, who should own the backup files, and what mode
# those files should have.
@@ -15,6 +16,8 @@ BACKUP_GROUP="{{template "KEY" "group"}}"
BACKUP_DIR_MODE=700
BACKUP_FILE_MODE=600

PRIVATE_BACKUP_DIR="{{template "KEY" "private_backup_dir"}}"

# Same, but for the files that need to be copied to the FTP server,
# for public consumption
FTP_DIR="{{template "KEY" "ftp_dir"}}"
3 changes: 3 additions & 0 deletions admin/config.sh.sample
@@ -2,6 +2,7 @@

DUMP_THREADS=4
DUMP_BASE_DIR='/mnt/dumps'
PRIVATE_DUMP_BASE_DIR='/private/dumps'

# Where to back things up to, who should own the backup files, and what mode
# those files should have.
@@ -12,6 +13,8 @@ BACKUP_GROUP=root
BACKUP_DIR_MODE=700
BACKUP_FILE_MODE=600

PRIVATE_BACKUP_DIR=/private/backup

# Same, but for the files that need to be copied to the FTP server,
# for public consumption
FTP_DIR='/mnt/ftp/'
40 changes: 39 additions & 1 deletion admin/create-dumps.sh
Expand Up @@ -83,6 +83,11 @@ if [ -z $DUMP_BASE_DIR ]; then
exit 1
fi

if [ -z "$PRIVATE_DUMP_BASE_DIR" ]; then
echo "PRIVATE_DUMP_BASE_DIR isn't set"
exit 1
fi

DUMP_TYPE="${1:-full}"
# consume dump type argument so that we can pass the remaining arguments to
# the python dump manager script
@@ -112,8 +117,13 @@ echo "DUMP_BASE_DIR is $DUMP_BASE_DIR"
echo "creating DUMP_TEMP_DIR $DUMP_TEMP_DIR"
mkdir -p "$DUMP_TEMP_DIR"

PRIVATE_DUMP_BASE_DIR="$PRIVATE_DUMP_BASE_DIR/$SUB_DIR.$$"
echo "DUMP_BASE_DIR is $PRIVATE_DUMP_BASE_DIR"
echo "creating PRIVATE_DUMP_BASE_DIR $PRIVATE_DUMP_BASE_DIR"
mkdir -p "$PRIVATE_DUMP_BASE_DIR"

if [ "$DUMP_TYPE" == "full" ]; then
if ! /usr/local/bin/python manage.py dump create_full -l "$DUMP_TEMP_DIR" -t "$DUMP_THREADS" "$@"; then
if ! /usr/local/bin/python manage.py dump create_full -l "$DUMP_TEMP_DIR" -lp "$PRIVATE_DUMP_BASE_DIR" -t "$DUMP_THREADS" "$@"; then
echo "Full dump failed, exiting!"
exit 1
fi
@@ -169,6 +179,34 @@ retry rsync -a "$DUMP_DIR/" "$BACKUP_DIR/$SUB_DIR/$DUMP_NAME/"
chmod "$BACKUP_FILE_MODE" "$BACKUP_DIR/$SUB_DIR/$DUMP_NAME/"*
echo "Dumps copied to backup directory!"

HAS_EMPTY_PRIVATE_DIRS_OR_FILES=$(find "$PRIVATE_DUMP_BASE_DIR" -empty)
if [ -z "$HAS_EMPTY_PRIVATE_DIRS_OR_FILES" ]; then
# no empty files or directories were found, so private dumps were actually created

PRIVATE_DUMP_ID_FILE=$(find "$PRIVATE_DUMP_BASE_DIR" -type f -name 'DUMP_ID.txt')
if [ -z "$PRIVATE_DUMP_ID_FILE" ]; then
echo "DUMP_ID.txt not found, exiting."
exit 1
fi

read -r PRIVATE_DUMP_TIMESTAMP PRIVATE_DUMP_ID PRIVATE_DUMP_TYPE < "$PRIVATE_DUMP_ID_FILE"
echo "Dump created with timestamp $PRIVATE_DUMP_TIMESTAMP"
PRIVATE_DUMP_DIR=$(dirname "$PRIVATE_DUMP_ID_FILE")
PRIVATE_DUMP_NAME=$(basename "$PRIVATE_DUMP_DIR")

# Back up the private dumps to the backup volume
echo "Creating private dumps backup directories..."
mkdir -m "$BACKUP_FILE_MODE" -p \
"$PRIVATE_BACKUP_DIR/$SUB_DIR" \
"$PRIVATE_BACKUP_DIR/$SUB_DIR/$PRIVATE_DUMP_NAME"
echo "Private dumps backup directories created!"

# Copy the files into the backup directory
echo "Begin copying private dumps to private backup directory..."
retry rsync -a "$PRIVATE_DUMP_DIR/" "$PRIVATE_BACKUP_DIR/$SUB_DIR/$PRIVATE_DUMP_NAME/"
chmod "$BACKUP_FILE_MODE" "$PRIVATE_BACKUP_DIR/$SUB_DIR/$PRIVATE_DUMP_NAME/"*
echo "Dumps copied to backup directory!"
fi

# rsync the files into the FTP server
FTP_CURRENT_DUMP_DIR="$FTP_DIR/$SUB_DIR/$DUMP_NAME"
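For reference, the private backup block above hinges on DUMP_ID.txt, a one-line file of the form "TIMESTAMP DUMP_ID TYPE" that the dump manager writes (see listenbrainz/db/dump_manager.py below) and the shell `read -r` destructures. A minimal Python sketch of that parsing, using a hypothetical path:

import os

def read_dump_id_file(dump_dir):
    # DUMP_ID.txt holds a single line, e.g. "20231227-120000 42 full"
    with open(os.path.join(dump_dir, "DUMP_ID.txt")) as f:
        timestamp, dump_id, dump_type = f.read().split()
    return timestamp, int(dump_id), dump_type

# Hypothetical usage:
# ts, dump_id, kind = read_dump_id_file("/private/dumps/fullexport.123/listenbrainz-dump-42-20231227-120000-full")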
24 changes: 13 additions & 11 deletions listenbrainz/db/dump.py
@@ -240,30 +240,32 @@
}


def dump_postgres_db(location, dump_time=datetime.today(), threads=DUMP_DEFAULT_THREAD_COUNT):
def dump_postgres_db(location, location_private, dump_time=datetime.today(), threads=DUMP_DEFAULT_THREAD_COUNT):
""" Create postgres database dump in the specified location
Arguments:
location: Directory where the final dump will be stored
location: Directory where the final public dump will be stored
location_private: Directory where the final private dump will be stored
dump_time: datetime object representing when the dump was started
threads: Maximum number of threads to use during compression
Returns:
a tuple: (path to private dump, path to public dump)
"""
current_app.logger.info('Beginning dump of PostgreSQL database...')
current_app.logger.info('dump path: %s', location)
current_app.logger.info('private dump path: %s', location_private)

current_app.logger.info('Creating dump of private data...')
try:
private_dump = create_private_dump(location, dump_time, threads)
private_dump = create_private_dump(location_private, dump_time, threads)
except Exception:
current_app.logger.critical('Unable to create private db dump due to error: ', exc_info=True)
current_app.logger.info('Removing created files and giving up...')
shutil.rmtree(location)
shutil.rmtree(location_private)
return
current_app.logger.info('Dump of private data created at %s!', private_dump)

current_app.logger.info('public dump path: %s', location)
current_app.logger.info('Creating dump of public data...')
try:
public_dump = create_public_dump(location, dump_time, threads)
@@ -277,12 +279,13 @@ def dump_postgres_db(location, dump_time=datetime.today(), threads=DUMP_DEFAULT_
return private_dump, public_dump


def dump_timescale_db(location: str, dump_time: datetime = datetime.today(),
def dump_timescale_db(location: str, location_private: str, dump_time: datetime = datetime.today(),
threads: int = DUMP_DEFAULT_THREAD_COUNT) -> Optional[Tuple[str, str]]:
""" Create timescale database (excluding listens) dump in the specified location
Arguments:
location: Directory where the final dump will be stored
location: Directory where the final public dump will be stored
location_private: Directory where the final private dump will be stored
dump_time: datetime object representing when the dump was started
threads: Maximum number of threads to use during compression
@@ -293,14 +296,13 @@ def dump_timescale_db(location: str, dump_time: datetime = datetime.today(),

current_app.logger.info('Creating dump of timescale private data...')
try:
private_timescale_dump = create_private_timescale_dump(location, dump_time, threads)
private_timescale_dump = create_private_timescale_dump(location_private, dump_time, threads)
except Exception:
current_app.logger.critical('Unable to create private timescale db dump due to error: ', exc_info=True)
current_app.logger.info('Removing created files and giving up...')
shutil.rmtree(location)
shutil.rmtree(location_private)
return
current_app.logger.info(
'Dump of private timescale data created at %s!', private_timescale_dump)
current_app.logger.info('Dump of private timescale data created at %s!', private_timescale_dump)

current_app.logger.info('Creating dump of timescale public data...')
try:
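A sketch of the new calling convention, with hypothetical paths: public and private locations are passed separately, and each function returns a (private dump path, public dump path) tuple, or None if a step failed and the tree was removed:

from datetime import datetime
from listenbrainz.db import dump as db_dump

public_location = "/mnt/dumps/fullexport"        # hypothetical public tree
private_location = "/private/dumps/fullexport"   # hypothetical private tree

result = db_dump.dump_postgres_db(public_location, private_location,
                                  dump_time=datetime.now(), threads=4)
if result is not None:
    private_dump, public_dump = result
# dump_timescale_db takes the same (location, location_private, ...) arguments.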
83 changes: 51 additions & 32 deletions listenbrainz/db/dump_manager.py
@@ -1,6 +1,7 @@
""" This module contains a click group with commands to
create and import postgres data dumps.
"""
from pathlib import PurePath

# listenbrainz-server - Server for the ListenBrainz project
#
@@ -92,8 +93,7 @@ def create_mbcanonical(location, use_lb_conn):
try:
write_hashes(dump_path)
except IOError as e:
current_app.logger.error(
'Unable to create hash files! Error: %s', str(e), exc_info=True)
current_app.logger.error('Unable to create hash files! Error: %s', str(e), exc_info=True)
sys.exit(-1)

try:
@@ -109,13 +109,14 @@ def create_mbcanonical(location, use_lb_conn):
# Mapping dumps don't have a dump id (second field) as they are standalone
f.write("%s 0 mbcanonical\n" % (ts, ))

current_app.logger.info(
'Dumps created and hashes written at %s' % dump_path)
current_app.logger.info('Dumps created and hashes written at %s' % dump_path)


@cli.command(name="create_full")
@click.option('--location', '-l', default=os.path.join(os.getcwd(), 'listenbrainz-export'),
help="path to the directory where the dump should be made")
@click.option('--location-private', '-lp', default=None,
help="path to the directory where the private dumps should be made")
@click.option('--threads', '-t', type=int, default=DUMP_DEFAULT_THREAD_COUNT,
help="the number of threads to be used while compression")
@click.option('--dump-id', type=int, default=None,
@@ -125,13 +126,14 @@ def create_mbcanonical(location, use_lb_conn):
@click.option('--db/--no-db', 'do_db_dump', type=bool, default=True)
@click.option('--timescale/--no-timescale', 'do_timescale_dump', type=bool, default=True)
@click.option('--stats/--no-stats', 'do_stats_dump', type=bool, default=True)
def create_full(location, threads, dump_id, do_listen_dump: bool, do_spark_dump: bool,
do_db_dump: bool, do_timescale_dump: bool, do_stats_dump: bool):
def create_full(location, location_private, threads, dump_id, do_listen_dump: bool,
do_spark_dump: bool, do_db_dump: bool, do_timescale_dump: bool, do_stats_dump: bool):
""" Create a ListenBrainz data dump which includes a private dump, a statistics dump
and a dump of the actual listens from the listenstore.
Args:
location (str): path to the directory where the dump should be made
location_private (str): path to the directory where the private dumps should be made
threads (int): the number of threads to be used while compression
dump_id (int): the ID of the ListenBrainz data dump
do_listen_dump: If True, make a listens dump
@@ -142,6 +144,15 @@ def create_full(location, threads, dump_id, do_listen_dump: bool, do_spark_dump:
"""
app = create_app()
with app.app_context():
if not location_private and (do_db_dump or do_timescale_dump):
current_app.logger.error("No location specified for creating private database and timescale dumps")
sys.exit(-1)
if location_private and os.path.normpath(location_private) == os.path.normpath(location):
current_app.logger.error("Locations specified for the public and private dumps cannot be the same")
sys.exit(-1)
if location_private and PurePath(location_private).is_relative_to(PurePath(location)):
current_app.logger.error("Private dumps location cannot be a subdirectory of the public dumps location")
sys.exit(-1)
ls = DumpListenStore(app)
if dump_id is None:
end_time = datetime.now()
@@ -153,19 +164,25 @@ def create_full(location, threads, dump_id, do_listen_dump: bool, do_spark_dump:
sys.exit(-1)
end_time = dump_entry['created']

ts = end_time.strftime('%Y%m%d-%H%M%S')
dump_name = 'listenbrainz-dump-{dump_id}-{time}-full'.format(
dump_id=dump_id, time=ts)
dump_name = f'listenbrainz-dump-{dump_id}-{end_time.strftime("%Y%m%d-%H%M%S")}-full'
dump_path = os.path.join(location, dump_name)
create_path(dump_path)

private_dump_path = None
if location_private:
private_dump_path = os.path.join(location_private, dump_name)
create_path(private_dump_path)

expected_num_dumps = 0
expected_num_private_dumps = 0
if do_db_dump:
db_dump.dump_postgres_db(dump_path, end_time, threads)
expected_num_dumps += 2
db_dump.dump_postgres_db(dump_path, private_dump_path, end_time, threads)
expected_num_dumps += 1
expected_num_private_dumps += 1
if do_timescale_dump:
db_dump.dump_timescale_db(dump_path, end_time, threads)
expected_num_dumps += 2
db_dump.dump_timescale_db(dump_path, private_dump_path, end_time, threads)
expected_num_dumps += 1
expected_num_private_dumps += 1
if do_listen_dump:
ls.dump_listens(dump_path, dump_id=dump_id, end_time=end_time, threads=threads)
expected_num_dumps += 1
@@ -178,25 +195,34 @@ def create_full(location, threads, dump_id, do_listen_dump: bool, do_spark_dump:

try:
write_hashes(dump_path)
if private_dump_path:
write_hashes(private_dump_path)
except IOError as e:
current_app.logger.error(
'Unable to create hash files! Error: %s', str(e), exc_info=True)
current_app.logger.error('Unable to create hash files! Error: %s', str(e), exc_info=True)
sys.exit(-1)

try:
# each dump produces 3 files: the archive plus its md5 and sha256 hashes
expected_num_dump_files = expected_num_dumps * 3
expected_num_private_dump_files = expected_num_private_dumps * 3
if not sanity_check_dumps(dump_path, expected_num_dump_files):
return sys.exit(-1)
if private_dump_path and not sanity_check_dumps(private_dump_path, expected_num_private_dump_files):
return sys.exit(-1)
except OSError:
sys.exit(-1)

current_app.logger.info(
'Dumps created and hashes written at %s' % dump_path)
current_app.logger.info('Dumps created and hashes written at %s' % dump_path)
if private_dump_path:
current_app.logger.info('Private dumps created and hashes written at %s' % private_dump_path)

# Write the DUMP_ID file so that the FTP sync scripts can be more robust
with open(os.path.join(dump_path, "DUMP_ID.txt"), "w") as f:
f.write("%s %s full\n" % (ts, dump_id))
f.write("%s %s full\n" % (end_time.strftime('%Y%m%d-%H%M%S'), dump_id))
if private_dump_path:
# Write the DUMP_ID file so that the backup sync scripts can be more robust
with open(os.path.join(private_dump_path, "DUMP_ID.txt"), "w") as f:
f.write("%s %s full\n" % (end_time.strftime('%Y%m%d-%H%M%S'), dump_id))

sys.exit(0)

@@ -215,22 +241,18 @@ def create_incremental(location, threads, dump_id):
else:
dump_entry = db_dump.get_dump_entry(dump_id)
if dump_entry is None:
current_app.logger.error(
"No dump with ID %d found, exiting!", dump_id)
current_app.logger.error("No dump with ID %d found, exiting!", dump_id)
sys.exit(-1)
end_time = dump_entry['created']

prev_dump_entry = db_dump.get_dump_entry(dump_id - 1)
if prev_dump_entry is None: # incremental dumps must have a previous dump in the series
current_app.logger.error(
"Invalid dump ID %d, could not find previous dump", dump_id)
current_app.logger.error("Invalid dump ID %d, could not find previous dump", dump_id)
sys.exit(-1)
start_time = prev_dump_entry['created']
current_app.logger.info(
"Dumping data from %s to %s", start_time, end_time)
current_app.logger.info("Dumping data from %s to %s", start_time, end_time)

dump_name = 'listenbrainz-dump-{dump_id}-{time}-incremental'.format(
dump_id=dump_id, time=end_time.strftime('%Y%m%d-%H%M%S'))
dump_name = f'listenbrainz-dump-{dump_id}-{end_time.strftime("%Y%m%d-%H%M%S")}-incremental'
dump_path = os.path.join(location, dump_name)
create_path(dump_path)

@@ -241,8 +263,7 @@ def create_incremental(location, threads, dump_id):
try:
write_hashes(dump_path)
except IOError as e:
current_app.logger.error(
'Unable to create hash files! Error: %s', str(e), exc_info=True)
current_app.logger.error('Unable to create hash files! Error: %s', str(e), exc_info=True)
sys.exit(-1)

try:
@@ -253,11 +274,9 @@ def create_incremental(location, threads, dump_id):

# Write the DUMP_ID file so that the FTP sync scripts can be more robust
with open(os.path.join(dump_path, "DUMP_ID.txt"), "w") as f:
f.write("%s %s incremental\n" %
(end_time.strftime('%Y%m%d-%H%M%S'), dump_id))
f.write("%s %s incremental\n" % (end_time.strftime('%Y%m%d-%H%M%S'), dump_id))

current_app.logger.info(
'Dumps created and hashes written at %s' % dump_path)
current_app.logger.info('Dumps created and hashes written at %s' % dump_path)
sys.exit(0)


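The location guard in create_full combines an equality check on normalized paths with PurePath.is_relative_to (available since Python 3.9) to reject nested trees. A standalone sketch of the same logic, with hypothetical paths:

import os
from pathlib import PurePath

def check_dump_locations(location: str, location_private: str) -> None:
    # "/mnt/dumps" and "/mnt/dumps/" normalize to the same path
    if os.path.normpath(location_private) == os.path.normpath(location):
        raise ValueError("public and private dump locations cannot be the same")
    # a private tree nested inside the public tree could still be rsynced publicly
    if PurePath(location_private).is_relative_to(PurePath(location)):
        raise ValueError("private dumps location cannot be inside the public dumps location")

check_dump_locations("/mnt/dumps", "/private/dumps")        # passes
# check_dump_locations("/mnt/dumps", "/mnt/dumps/private")  # raises ValueError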
5 changes: 3 additions & 2 deletions listenbrainz/db/tests/test_dump.py
@@ -48,6 +48,7 @@ class DumpTestCase(DatabaseTestCase):
def setUp(self):
super().setUp()
self.tempdir = tempfile.mkdtemp()
self.tempdir_private = tempfile.mkdtemp()
self.app = create_app()

def tearDown(self):
@@ -125,7 +126,7 @@ def test_import_postgres_db(self):
self.assertEqual(user_count, 1)

# do a db dump and reset the db
private_dump, public_dump = db_dump.dump_postgres_db(self.tempdir)
private_dump, public_dump = db_dump.dump_postgres_db(self.tempdir, self.tempdir_private)
self.reset_db()
user_count = db_user.get_user_count()
self.assertEqual(user_count, 0)
@@ -163,7 +164,7 @@ def test_dump_recording_feedback(self):
db_feedback.insert(feedback)

# do a db dump and reset the db
private_dump, public_dump = db_dump.dump_postgres_db(self.tempdir)
private_dump, public_dump = db_dump.dump_postgres_db(self.tempdir, self.tempdir_private)
self.reset_db()
user_count = db_user.get_user_count()
self.assertEqual(user_count, 0)
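With the second temporary directory in setUp, the tests can also assert that the two dumps land in separate trees. A minimal sketch of such an assertion (not part of the commit), reusing the names above:

# inside a DumpTestCase test method:
private_dump, public_dump = db_dump.dump_postgres_db(self.tempdir, self.tempdir_private)
self.assertTrue(public_dump.startswith(self.tempdir))
self.assertTrue(private_dump.startswith(self.tempdir_private))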