LB-1382: Place private dumps into wholly separate file system paths
To avoid accidentally leaking private data, make the separation between public
and private dumps more explicit. Such mistakes often happen when running scripts
by hand, so add a few more safeguards to the dump-running code as well.
amCap1712 committed Dec 27, 2023
1 parent 60ab3c3 commit 7d8c087
Showing 7 changed files with 196 additions and 54 deletions.
3 changes: 3 additions & 0 deletions admin/config.sh.ctmpl
@@ -5,6 +5,7 @@

DUMP_THREADS="{{template "KEY" "dump_threads"}}"
DUMP_BASE_DIR="{{template "KEY" "base_dir"}}"
PRIVATE_DUMP_BASE_DIR="{{template "KEY" "private_base_dir"}}"

# Where to back things up to, who should own the backup files, and what mode
# those files should have.
@@ -15,6 +16,8 @@ BACKUP_GROUP="{{template "KEY" "group"}}"
BACKUP_DIR_MODE=700
BACKUP_FILE_MODE=600

PRIVATE_BACKUP_DIR="{{template "KEY" "private_backup_dir"}}"

# Same, but for the files that need to be copied to the FTP server,
# for public consumption
FTP_DIR="{{template "KEY" "ftp_dir"}}"
3 changes: 3 additions & 0 deletions admin/config.sh.sample
@@ -2,6 +2,7 @@

DUMP_THREADS=4
DUMP_BASE_DIR='/mnt/dumps'
PRIVATE_DUMP_BASE_DIR='/private/dumps'

# Where to back things up to, who should own the backup files, and what mode
# those files should have.
@@ -12,6 +13,8 @@ BACKUP_GROUP=root
BACKUP_DIR_MODE=700
BACKUP_FILE_MODE=600

PRIVATE_BACKUP_DIR=/private/backup

# Same, but for the files that need to be copied to the FTP server,
# for public consumption
FTP_DIR='/mnt/ftp/'
40 changes: 39 additions & 1 deletion admin/create-dumps.sh
Expand Up @@ -83,6 +83,11 @@ if [ -z $DUMP_BASE_DIR ]; then
exit 1
fi

if [ -z "$PRIVATE_DUMP_BASE_DIR" ]; then
echo "PRIVATE_DUMP_BASE_DIR isn't set"
exit 1
fi

DUMP_TYPE="${1:-full}"
# consume dump type argument so that we can pass the remaining arguments to
# the python dump manager script
@@ -112,8 +117,13 @@ echo "DUMP_BASE_DIR is $DUMP_BASE_DIR"
echo "creating DUMP_TEMP_DIR $DUMP_TEMP_DIR"
mkdir -p "$DUMP_TEMP_DIR"

PRIVATE_DUMP_BASE_DIR="$PRIVATE_DUMP_BASE_DIR/$SUB_DIR.$$"
echo "DUMP_BASE_DIR is $PRIVATE_DUMP_BASE_DIR"
echo "creating PRIVATE_DUMP_BASE_DIR $PRIVATE_DUMP_BASE_DIR"
mkdir -p "$PRIVATE_DUMP_BASE_DIR"

if [ "$DUMP_TYPE" == "full" ]; then
if ! /usr/local/bin/python manage.py dump create_full -l "$DUMP_TEMP_DIR" -t "$DUMP_THREADS" "$@"; then
if ! /usr/local/bin/python manage.py dump create_full -l "$DUMP_TEMP_DIR" -lp "$PRIVATE_DUMP_BASE_DIR" -t "$DUMP_THREADS" "$@"; then
echo "Full dump failed, exiting!"
exit 1
fi
@@ -169,6 +179,34 @@ retry rsync -a "$DUMP_DIR/" "$BACKUP_DIR/$SUB_DIR/$DUMP_NAME/"
chmod "$BACKUP_FILE_MODE" "$BACKUP_DIR/$SUB_DIR/$DUMP_NAME/"*
echo "Dumps copied to backup directory!"

HAS_EMPTY_PRIVATE_DIRS_OR_FILES=$(find "$PRIVATE_DUMP_BASE_DIR" -empty)
if [ -z "$HAS_EMPTY_PRIVATE_DIRS_OR_FILES" ]; then
# no empty files or directories were found, so private dumps were actually created

PRIVATE_DUMP_ID_FILE=$(find "$PRIVATE_DUMP_BASE_DIR" -type f -name 'DUMP_ID.txt')
if [ -z "$PRIVATE_DUMP_ID_FILE" ]; then
echo "DUMP_ID.txt not found, exiting."
exit 1
fi

read -r PRIVATE_DUMP_TIMESTAMP PRIVATE_DUMP_ID PRIVATE_DUMP_TYPE < "$PRIVATE_DUMP_ID_FILE"
echo "Dump created with timestamp $PRIVATE_DUMP_TIMESTAMP"
PRIVATE_DUMP_DIR=$(dirname "$PRIVATE_DUMP_ID_FILE")
PRIVATE_DUMP_NAME=$(basename "$PRIVATE_DUMP_DIR")

# Back up the private dumps to the backup volume
echo "Creating private dumps backup directories..."
mkdir -m "$BACKUP_FILE_MODE" -p \
"$PRIVATE_BACKUP_DIR/$SUB_DIR" \
"$PRIVATE_BACKUP_DIR/$SUB_DIR/$PRIVATE_DUMP_NAME"
echo "Private dumps backup directories created!"

# Copy the files into the backup directory
echo "Begin copying private dumps to private backup directory..."
retry rsync -a "$PRIVATE_DUMP_DIR/" "$PRIVATE_BACKUP_DIR/$SUB_DIR/$PRIVATE_DUMP_NAME/"
chmod "$BACKUP_FILE_MODE" "$PRIVATE_BACKUP_DIR/$SUB_DIR/$PRIVATE_DUMP_NAME/"*
echo "Dumps copied to backup directory!"
fi

# rsync the files into the FTP server
FTP_CURRENT_DUMP_DIR="$FTP_DIR/$SUB_DIR/$DUMP_NAME"
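For reference, the private backup block above hinges on DUMP_ID.txt, a one-line file of the form "TIMESTAMP DUMP_ID TYPE" that the dump manager writes (see listenbrainz/db/dump_manager.py below) and the shell `read -r` destructures. A minimal Python sketch of that parsing, using a hypothetical path:

import os

def read_dump_id_file(dump_dir):
    # DUMP_ID.txt holds a single line, e.g. "20231227-120000 42 full"
    with open(os.path.join(dump_dir, "DUMP_ID.txt")) as f:
        timestamp, dump_id, dump_type = f.read().split()
    return timestamp, int(dump_id), dump_type

# Hypothetical usage:
# ts, dump_id, kind = read_dump_id_file("/private/dumps/fullexport.123/listenbrainz-dump-42-20231227-120000-full")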
24 changes: 13 additions & 11 deletions listenbrainz/db/dump.py
@@ -240,30 +240,32 @@
}


def dump_postgres_db(location, dump_time=datetime.today(), threads=DUMP_DEFAULT_THREAD_COUNT):
def dump_postgres_db(location, location_private, dump_time=datetime.today(), threads=DUMP_DEFAULT_THREAD_COUNT):
""" Create postgres database dump in the specified location
Arguments:
location: Directory where the final dump will be stored
location: Directory where the final public dump will be stored
location_private: Directory where the final private dump will be stored
dump_time: datetime object representing when the dump was started
threads: Maximum number of threads to use during compression
Returns:
a tuple: (path to private dump, path to public dump)
"""
current_app.logger.info('Beginning dump of PostgreSQL database...')
current_app.logger.info('dump path: %s', location)
current_app.logger.info('private dump path: %s', location_private)

current_app.logger.info('Creating dump of private data...')
try:
private_dump = create_private_dump(location, dump_time, threads)
private_dump = create_private_dump(location_private, dump_time, threads)
except Exception:
current_app.logger.critical('Unable to create private db dump due to error: ', exc_info=True)
current_app.logger.info('Removing created files and giving up...')
shutil.rmtree(location)
shutil.rmtree(location_private)
return
current_app.logger.info('Dump of private data created at %s!', private_dump)

current_app.logger.info('public dump path: %s', location)
current_app.logger.info('Creating dump of public data...')
try:
public_dump = create_public_dump(location, dump_time, threads)
@@ -277,12 +279,13 @@ def dump_postgres_db(location, dump_time=datetime.today(), threads=DUMP_DEFAULT_
return private_dump, public_dump


def dump_timescale_db(location: str, dump_time: datetime = datetime.today(),
def dump_timescale_db(location: str, location_private: str, dump_time: datetime = datetime.today(),
threads: int = DUMP_DEFAULT_THREAD_COUNT) -> Optional[Tuple[str, str]]:
""" Create timescale database (excluding listens) dump in the specified location
Arguments:
location: Directory where the final dump will be stored
location: Directory where the final public dump will be stored
location_private: Directory where the final private dump will be stored
dump_time: datetime object representing when the dump was started
threads: Maximum number of threads to use during compression
@@ -293,14 +296,13 @@ def dump_timescale_db(location: str, dump_time: datetime = datetime.today(),

current_app.logger.info('Creating dump of timescale private data...')
try:
private_timescale_dump = create_private_timescale_dump(location, dump_time, threads)
private_timescale_dump = create_private_timescale_dump(location_private, dump_time, threads)
except Exception:
current_app.logger.critical('Unable to create private timescale db dump due to error: ', exc_info=True)
current_app.logger.info('Removing created files and giving up...')
shutil.rmtree(location)
shutil.rmtree(location_private)
return
current_app.logger.info(
'Dump of private timescale data created at %s!', private_timescale_dump)
current_app.logger.info('Dump of private timescale data created at %s!', private_timescale_dump)

current_app.logger.info('Creating dump of timescale public data...')
try:
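A sketch of the new calling convention, with hypothetical paths: public and private locations are passed separately, and each function returns a (private dump path, public dump path) tuple, or None if a step failed and the tree was removed:

from datetime import datetime
from listenbrainz.db import dump as db_dump

public_location = "/mnt/dumps/fullexport"        # hypothetical public tree
private_location = "/private/dumps/fullexport"   # hypothetical private tree

result = db_dump.dump_postgres_db(public_location, private_location,
                                  dump_time=datetime.now(), threads=4)
if result is not None:
    private_dump, public_dump = result
# dump_timescale_db takes the same (location, location_private, ...) arguments.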
83 changes: 51 additions & 32 deletions listenbrainz/db/dump_manager.py
@@ -1,6 +1,7 @@
""" This module contains a click group with commands to
create and import postgres data dumps.
"""
from pathlib import PurePath

# listenbrainz-server - Server for the ListenBrainz project
#
@@ -92,8 +93,7 @@ def create_mbcanonical(location, use_lb_conn):
try:
write_hashes(dump_path)
except IOError as e:
current_app.logger.error(
'Unable to create hash files! Error: %s', str(e), exc_info=True)
current_app.logger.error('Unable to create hash files! Error: %s', str(e), exc_info=True)
sys.exit(-1)

try:
@@ -109,13 +109,14 @@ def create_mbcanonical(location, use_lb_conn):
# Mapping dumps don't have a dump id (second field) as they are standalone
f.write("%s 0 mbcanonical\n" % (ts, ))

current_app.logger.info(
'Dumps created and hashes written at %s' % dump_path)
current_app.logger.info('Dumps created and hashes written at %s' % dump_path)


@cli.command(name="create_full")
@click.option('--location', '-l', default=os.path.join(os.getcwd(), 'listenbrainz-export'),
help="path to the directory where the dump should be made")
@click.option('--location-private', '-lp', default=None,
help="path to the directory where the private dumps should be made")
@click.option('--threads', '-t', type=int, default=DUMP_DEFAULT_THREAD_COUNT,
help="the number of threads to be used while compression")
@click.option('--dump-id', type=int, default=None,
@@ -125,13 +126,14 @@ def create_mbcanonical(location, use_lb_conn):
@click.option('--db/--no-db', 'do_db_dump', type=bool, default=True)
@click.option('--timescale/--no-timescale', 'do_timescale_dump', type=bool, default=True)
@click.option('--stats/--no-stats', 'do_stats_dump', type=bool, default=True)
def create_full(location, threads, dump_id, do_listen_dump: bool, do_spark_dump: bool,
do_db_dump: bool, do_timescale_dump: bool, do_stats_dump: bool):
def create_full(location, location_private, threads, dump_id, do_listen_dump: bool,
do_spark_dump: bool, do_db_dump: bool, do_timescale_dump: bool, do_stats_dump: bool):
""" Create a ListenBrainz data dump which includes a private dump, a statistics dump
and a dump of the actual listens from the listenstore.
Args:
location (str): path to the directory where the dump should be made
location_private (str): path to the directory where the private dumps should be made
threads (int): the number of threads to be used while compression
dump_id (int): the ID of the ListenBrainz data dump
do_listen_dump: If True, make a listens dump
@@ -142,6 +144,15 @@ def create_full(location, threads, dump_id, do_listen_dump: bool, do_spark_dump:
"""
app = create_app()
with app.app_context():
if not location_private and (do_db_dump or do_timescale_dump):
current_app.logger.error("No location specified for creating private database and timescale dumps")
sys.exit(-1)
if location_private and os.path.normpath(location_private) == os.path.normpath(location):
current_app.logger.error("Locations specified for the public and private dumps cannot be the same")
sys.exit(-1)
if location_private and PurePath(location_private).is_relative_to(PurePath(location)):
current_app.logger.error("Private dumps location cannot be a subdirectory of the public dumps location")
sys.exit(-1)
ls = DumpListenStore(app)
if dump_id is None:
end_time = datetime.now()
@@ -153,19 +164,25 @@ def create_full(location, threads, dump_id, do_listen_dump: bool, do_spark_dump:
sys.exit(-1)
end_time = dump_entry['created']

ts = end_time.strftime('%Y%m%d-%H%M%S')
dump_name = 'listenbrainz-dump-{dump_id}-{time}-full'.format(
dump_id=dump_id, time=ts)
dump_name = f'listenbrainz-dump-{dump_id}-{end_time.strftime("%Y%m%d-%H%M%S")}-full'
dump_path = os.path.join(location, dump_name)
create_path(dump_path)

private_dump_path = None
if location_private:
private_dump_path = os.path.join(location_private, dump_name)
create_path(private_dump_path)

expected_num_dumps = 0
expected_num_private_dumps = 0
if do_db_dump:
db_dump.dump_postgres_db(dump_path, end_time, threads)
expected_num_dumps += 2
db_dump.dump_postgres_db(dump_path, private_dump_path, end_time, threads)
expected_num_dumps += 1
expected_num_private_dumps += 1
if do_timescale_dump:
db_dump.dump_timescale_db(dump_path, end_time, threads)
expected_num_dumps += 2
db_dump.dump_timescale_db(dump_path, private_dump_path, end_time, threads)
expected_num_dumps += 1
expected_num_private_dumps += 1
if do_listen_dump:
ls.dump_listens(dump_path, dump_id=dump_id, end_time=end_time, threads=threads)
expected_num_dumps += 1
@@ -178,25 +195,34 @@ def create_full(location, threads, dump_id, do_listen_dump: bool, do_spark_dump:

try:
write_hashes(dump_path)
if private_dump_path:
write_hashes(private_dump_path)
except IOError as e:
current_app.logger.error(
'Unable to create hash files! Error: %s', str(e), exc_info=True)
current_app.logger.error('Unable to create hash files! Error: %s', str(e), exc_info=True)
sys.exit(-1)

try:
# each dump produces 3 files: the archive plus its md5 and sha256 hashes
expected_num_dump_files = expected_num_dumps * 3
expected_num_private_dump_files = expected_num_private_dumps * 3
if not sanity_check_dumps(dump_path, expected_num_dump_files):
return sys.exit(-1)
if private_dump_path and not sanity_check_dumps(private_dump_path, expected_num_private_dump_files):
return sys.exit(-1)
except OSError:
sys.exit(-1)

current_app.logger.info(
'Dumps created and hashes written at %s' % dump_path)
current_app.logger.info('Dumps created and hashes written at %s' % dump_path)
if private_dump_path:
current_app.logger.info('Private dumps created and hashes written at %s' % private_dump_path)

# Write the DUMP_ID file so that the FTP sync scripts can be more robust
with open(os.path.join(dump_path, "DUMP_ID.txt"), "w") as f:
f.write("%s %s full\n" % (ts, dump_id))
f.write("%s %s full\n" % (end_time.strftime('%Y%m%d-%H%M%S'), dump_id))
if private_dump_path:
# Write the DUMP_ID file so that the backup sync scripts can be more robust
with open(os.path.join(private_dump_path, "DUMP_ID.txt"), "w") as f:
f.write("%s %s full\n" % (end_time.strftime('%Y%m%d-%H%M%S'), dump_id))

sys.exit(0)

@@ -215,22 +241,18 @@ def create_incremental(location, threads, dump_id):
else:
dump_entry = db_dump.get_dump_entry(dump_id)
if dump_entry is None:
current_app.logger.error(
"No dump with ID %d found, exiting!", dump_id)
current_app.logger.error("No dump with ID %d found, exiting!", dump_id)
sys.exit(-1)
end_time = dump_entry['created']

prev_dump_entry = db_dump.get_dump_entry(dump_id - 1)
if prev_dump_entry is None: # incremental dumps must have a previous dump in the series
current_app.logger.error(
"Invalid dump ID %d, could not find previous dump", dump_id)
current_app.logger.error("Invalid dump ID %d, could not find previous dump", dump_id)
sys.exit(-1)
start_time = prev_dump_entry['created']
current_app.logger.info(
"Dumping data from %s to %s", start_time, end_time)
current_app.logger.info("Dumping data from %s to %s", start_time, end_time)

dump_name = 'listenbrainz-dump-{dump_id}-{time}-incremental'.format(
dump_id=dump_id, time=end_time.strftime('%Y%m%d-%H%M%S'))
dump_name = f'listenbrainz-dump-{dump_id}-{end_time.strftime("%Y%m%d-%H%M%S")}-incremental'
dump_path = os.path.join(location, dump_name)
create_path(dump_path)

@@ -241,8 +263,7 @@ def create_incremental(location, threads, dump_id):
try:
write_hashes(dump_path)
except IOError as e:
current_app.logger.error(
'Unable to create hash files! Error: %s', str(e), exc_info=True)
current_app.logger.error('Unable to create hash files! Error: %s', str(e), exc_info=True)
sys.exit(-1)

try:
@@ -253,11 +274,9 @@ def create_incremental(location, threads, dump_id):

# Write the DUMP_ID file so that the FTP sync scripts can be more robust
with open(os.path.join(dump_path, "DUMP_ID.txt"), "w") as f:
f.write("%s %s incremental\n" %
(end_time.strftime('%Y%m%d-%H%M%S'), dump_id))
f.write("%s %s incremental\n" % (end_time.strftime('%Y%m%d-%H%M%S'), dump_id))

current_app.logger.info(
'Dumps created and hashes written at %s' % dump_path)
current_app.logger.info('Dumps created and hashes written at %s' % dump_path)
sys.exit(0)


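The location guard in create_full combines an equality check on normalized paths with PurePath.is_relative_to (available since Python 3.9) to reject nested trees. A standalone sketch of the same logic, with hypothetical paths:

import os
from pathlib import PurePath

def check_dump_locations(location: str, location_private: str) -> None:
    # "/mnt/dumps" and "/mnt/dumps/" normalize to the same path
    if os.path.normpath(location_private) == os.path.normpath(location):
        raise ValueError("public and private dump locations cannot be the same")
    # a private tree nested inside the public tree could still be rsynced publicly
    if PurePath(location_private).is_relative_to(PurePath(location)):
        raise ValueError("private dumps location cannot be inside the public dumps location")

check_dump_locations("/mnt/dumps", "/private/dumps")        # passes
# check_dump_locations("/mnt/dumps", "/mnt/dumps/private")  # raises ValueError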
5 changes: 3 additions & 2 deletions listenbrainz/db/tests/test_dump.py
@@ -48,6 +48,7 @@ class DumpTestCase(DatabaseTestCase):
def setUp(self):
super().setUp()
self.tempdir = tempfile.mkdtemp()
self.tempdir_private = tempfile.mkdtemp()
self.app = create_app()

def tearDown(self):
@@ -125,7 +126,7 @@ def test_import_postgres_db(self):
self.assertEqual(user_count, 1)

# do a db dump and reset the db
private_dump, public_dump = db_dump.dump_postgres_db(self.tempdir)
private_dump, public_dump = db_dump.dump_postgres_db(self.tempdir, self.tempdir_private)
self.reset_db()
user_count = db_user.get_user_count()
self.assertEqual(user_count, 0)
@@ -163,7 +164,7 @@ def test_dump_recording_feedback(self):
db_feedback.insert(feedback)

# do a db dump and reset the db
private_dump, public_dump = db_dump.dump_postgres_db(self.tempdir)
private_dump, public_dump = db_dump.dump_postgres_db(self.tempdir, self.tempdir_private)
self.reset_db()
user_count = db_user.get_user_count()
self.assertEqual(user_count, 0)
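With the second temporary directory in setUp, the tests can also assert that the two dumps land in separate trees. A minimal sketch of such an assertion (not part of the commit), reusing the names above:

# inside a DumpTestCase test method:
private_dump, public_dump = db_dump.dump_postgres_db(self.tempdir, self.tempdir_private)
self.assertTrue(public_dump.startswith(self.tempdir))
self.assertTrue(private_dump.startswith(self.tempdir_private))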