From d487eb2d02ab51108c43ddf706f9e77d26385235 Mon Sep 17 00:00:00 2001 From: Julien Richard-Foy Date: Sat, 13 Jul 2024 14:42:02 +0200 Subject: [PATCH 1/3] Make some configuration properties optional - `rename`: default to the empty list if not set - `skipTokenRanges`: default to the empty set if not set - `validation`: make it mandatory only when we run the Validator (but not the Migrator) Fixes #164 --- .../scala/com/scylladb/migrator/Migrator.scala | 2 +- .../alternator/AlternatorValidator.scala | 5 +++-- .../migrator/config/MigratorConfig.scala | 13 +++++++++---- .../migrator/scylla/ScyllaMigrator.scala | 6 +++--- .../migrator/scylla/ScyllaValidator.scala | 17 +++++++++++------ 5 files changed, 27 insertions(+), 16 deletions(-) diff --git a/migrator/src/main/scala/com/scylladb/migrator/Migrator.scala b/migrator/src/main/scala/com/scylladb/migrator/Migrator.scala index 45ecb77d..3a84173a 100644 --- a/migrator/src/main/scala/com/scylladb/migrator/Migrator.scala +++ b/migrator/src/main/scala/com/scylladb/migrator/Migrator.scala @@ -36,7 +36,7 @@ object Migrator { spark, cassandraSource, cassandraSource.preserveTimestamps, - migratorConfig.skipTokenRanges) + migratorConfig.getSkipTokenRangesOrEmptySet) ScyllaMigrator.migrate(migratorConfig, scyllaTarget, sourceDF) case (parquetSource: SourceSettings.Parquet, scyllaTarget: TargetSettings.Scylla) => val sourceDF = readers.Parquet.readDataFrame(spark, parquetSource) diff --git a/migrator/src/main/scala/com/scylladb/migrator/alternator/AlternatorValidator.scala b/migrator/src/main/scala/com/scylladb/migrator/alternator/AlternatorValidator.scala index 44530acf..62052018 100644 --- a/migrator/src/main/scala/com/scylladb/migrator/alternator/AlternatorValidator.scala +++ b/migrator/src/main/scala/com/scylladb/migrator/alternator/AlternatorValidator.scala @@ -49,7 +49,8 @@ object AlternatorValidator { // Define some aliases to prevent the Spark engine to try to serialize the whole object graph val renamedColumn = config.renamesMap - val configValidation = config.validation + val configValidation = config.validation.getOrElse( + sys.error("Missing required property 'validation' in the configuration file.")) val targetByKey: RDD[(List[DdbValue], collection.Map[String, DdbValue])] = target @@ -74,7 +75,7 @@ object AlternatorValidator { configValidation.floatingPointTolerance ) } - .take(config.validation.failuresToFetch) + .take(configValidation.failuresToFetch) .toList } diff --git a/migrator/src/main/scala/com/scylladb/migrator/config/MigratorConfig.scala b/migrator/src/main/scala/com/scylladb/migrator/config/MigratorConfig.scala index b5686bea..4932adca 100644 --- a/migrator/src/main/scala/com/scylladb/migrator/config/MigratorConfig.scala +++ b/migrator/src/main/scala/com/scylladb/migrator/config/MigratorConfig.scala @@ -10,15 +10,20 @@ import io.circe.{ Decoder, DecodingFailure, Encoder, Error, Json } case class MigratorConfig(source: SourceSettings, target: TargetSettings, - renames: List[Rename], + renames: Option[List[Rename]], savepoints: Savepoints, - skipTokenRanges: Set[(Token[_], Token[_])], - validation: Validation) { + skipTokenRanges: Option[Set[(Token[_], Token[_])]], + validation: Option[Validation]) { def render: String = this.asJson.asYaml.spaces2 + def getRenamesOrNil: List[Rename] = renames.getOrElse(Nil) + /** The list of renames modelled as a Map from the old column name to the new column name */ lazy val renamesMap: Map[String, String] = - renames.map(rename => rename.from -> rename.to).toMap.withDefault(identity) + getRenamesOrNil.map(rename => rename.from -> rename.to).toMap.withDefault(identity) + + def getSkipTokenRangesOrEmptySet: Set[(Token[_], Token[_])] = skipTokenRanges.getOrElse(Set.empty) + } object MigratorConfig { implicit val tokenEncoder: Encoder[Token[_]] = Encoder.instance { diff --git a/migrator/src/main/scala/com/scylladb/migrator/scylla/ScyllaMigrator.scala b/migrator/src/main/scala/com/scylladb/migrator/scylla/ScyllaMigrator.scala index c2ea17f9..807f8892 100644 --- a/migrator/src/main/scala/com/scylladb/migrator/scylla/ScyllaMigrator.scala +++ b/migrator/src/main/scala/com/scylladb/migrator/scylla/ScyllaMigrator.scala @@ -76,7 +76,7 @@ object ScyllaMigrator { log.info( "Savepoints array defined, size of the array: " + migratorConfig.skipTokenRanges.size) - val diff = allTokenRanges.diff(migratorConfig.skipTokenRanges) + val diff = allTokenRanges.diff(migratorConfig.getSkipTokenRangesOrEmptySet) log.info("Diff ... total diff of full ranges to savepoints is: " + diff.size) log.debug("Dump of the missing tokens: ") log.debug(diff) @@ -88,7 +88,7 @@ object ScyllaMigrator { try { writers.Scylla.writeDataframe( target, - migratorConfig.renames, + migratorConfig.getRenamesOrNil, sourceDF.dataFrame, sourceDF.timestampColumns, tokenRangeAccumulator) @@ -147,7 +147,7 @@ object ScyllaMigrator { (range.range.start.asInstanceOf[Token[_]], range.range.end.asInstanceOf[Token[_]])) val modifiedConfig = config.copy( - skipTokenRanges = config.skipTokenRanges ++ rangesToSkip + skipTokenRanges = Some(config.getSkipTokenRangesOrEmptySet ++ rangesToSkip) ) Files.write(filename, modifiedConfig.render.getBytes(StandardCharsets.UTF_8)) diff --git a/migrator/src/main/scala/com/scylladb/migrator/scylla/ScyllaValidator.scala b/migrator/src/main/scala/com/scylladb/migrator/scylla/ScyllaValidator.scala index 73a07ac4..c637972d 100644 --- a/migrator/src/main/scala/com/scylladb/migrator/scylla/ScyllaValidator.scala +++ b/migrator/src/main/scala/com/scylladb/migrator/scylla/ScyllaValidator.scala @@ -25,6 +25,11 @@ object ScyllaValidator { sourceSettings: SourceSettings.Cassandra, targetSettings: TargetSettings.Scylla, config: MigratorConfig)(implicit spark: SparkSession): List[RowComparisonFailure] = { + + val validationConfig = + config.validation.getOrElse( + sys.error("Missing required property 'validation' in the configuration file.")) + val sourceConnector: CassandraConnector = Connectors.sourceConnector(spark.sparkContext.getConf, sourceSettings) val targetConnector: CassandraConnector = @@ -118,14 +123,14 @@ object ScyllaValidator { RowComparisonFailure.compareCassandraRows( l, r, - config.validation.floatingPointTolerance, - config.validation.timestampMsTolerance, - config.validation.ttlToleranceMillis, - config.validation.writetimeToleranceMillis, - config.validation.compareTimestamps + validationConfig.floatingPointTolerance, + validationConfig.timestampMsTolerance, + validationConfig.ttlToleranceMillis, + validationConfig.writetimeToleranceMillis, + validationConfig.compareTimestamps ) } - .take(config.validation.failuresToFetch) + .take(validationConfig.failuresToFetch) .toList } From 6b1f6b9e1e797a6ee6ea8511777da0d9609b0f9a Mon Sep 17 00:00:00 2001 From: Julien Richard-Foy Date: Sat, 13 Jul 2024 14:52:06 +0200 Subject: [PATCH 2/3] Simplify configuration files --- config.yaml.example | 39 ++++++++++--------- .../cassandra-to-scylla-basic.yaml | 4 +- .../cassandra-to-scylla-renames.yaml | 8 ---- ...ynamodb-s3-export-to-alternator-basic.yaml | 10 ----- .../dynamodb-to-alternator-basic.yaml | 4 +- .../dynamodb-to-alternator-issue-103.yaml | 10 ----- .../dynamodb-to-alternator-renames.yaml | 8 ---- .../parquet-to-scylla-basic.yaml | 10 ----- .../scylla-to-scylla-basic.yaml | 10 ----- 9 files changed, 22 insertions(+), 81 deletions(-) diff --git a/config.yaml.example b/config.yaml.example index e076ec7a..711bcaaf 100644 --- a/config.yaml.example +++ b/config.yaml.example @@ -260,26 +260,27 @@ savepoints: # Interval in which savepoints will be created intervalSeconds: 300 -# Column renaming configuration. If you'd like to rename any columns, specify them like so: +# Optional - Column renaming configuration. If you'd like to rename any columns, specify them like so: # - from: source_column_name # to: dest_column_name -renames: [] -# Which token ranges to skip. You shouldn't need to fill this in normally; the migrator will -# create a savepoint file with this filled. -skipTokenRanges: [] +# renames: [] -# Configuration section for running the validator. The validator is run manually (see README). -validation: - # Should WRITETIMEs and TTLs be compared? - compareTimestamps: true - # What difference should we allow between TTLs? - ttlToleranceMillis: 60000 - # What difference should we allow between WRITETIMEs? - writetimeToleranceMillis: 1000 - # How many differences to fetch and print - failuresToFetch: 100 - # What difference should we allow between floating point numbers? - floatingPointTolerance: 0.001 - # What difference in ms should we allow between timestamps? - timestampMsTolerance: 0 +# Optional - Which token ranges to skip. You shouldn't need to fill this in normally; the migrator will +# create a savepoint file with this filled. +# skipTokenRanges: [] +# Configuration section for running the validator. The validator is run manually (see documentation). +# Mandatory if the application is executed in validation mode. +# validation: +# # Should WRITETIMEs and TTLs be compared? +# compareTimestamps: true +# # What difference should we allow between TTLs? +# ttlToleranceMillis: 60000 +# # What difference should we allow between WRITETIMEs? +# writetimeToleranceMillis: 1000 +# # How many differences to fetch and print +# failuresToFetch: 100 +# # What difference should we allow between floating point numbers? +# floatingPointTolerance: 0.001 +# # What difference in ms should we allow between timestamps? +# timestampMsTolerance: 0 diff --git a/tests/src/test/configurations/cassandra-to-scylla-basic.yaml b/tests/src/test/configurations/cassandra-to-scylla-basic.yaml index b046cac7..de1ee946 100644 --- a/tests/src/test/configurations/cassandra-to-scylla-basic.yaml +++ b/tests/src/test/configurations/cassandra-to-scylla-basic.yaml @@ -28,12 +28,10 @@ target: connections: 16 stripTrailingZerosForDecimals: false -renames: [] - savepoints: path: /app/savepoints intervalSeconds: 300 -skipTokenRanges: [] + validation: compareTimestamps: true ttlToleranceMillis: 60000 diff --git a/tests/src/test/configurations/cassandra-to-scylla-renames.yaml b/tests/src/test/configurations/cassandra-to-scylla-renames.yaml index 437414a0..70eea603 100644 --- a/tests/src/test/configurations/cassandra-to-scylla-renames.yaml +++ b/tests/src/test/configurations/cassandra-to-scylla-renames.yaml @@ -35,11 +35,3 @@ renames: savepoints: path: /app/savepoints intervalSeconds: 300 -skipTokenRanges: [] -validation: - compareTimestamps: true - ttlToleranceMillis: 60000 - writetimeToleranceMillis: 1000 - failuresToFetch: 100 - floatingPointTolerance: 0.001 - timestampMsTolerance: 0 diff --git a/tests/src/test/configurations/dynamodb-s3-export-to-alternator-basic.yaml b/tests/src/test/configurations/dynamodb-s3-export-to-alternator-basic.yaml index f455fbdd..dd36c875 100644 --- a/tests/src/test/configurations/dynamodb-s3-export-to-alternator-basic.yaml +++ b/tests/src/test/configurations/dynamodb-s3-export-to-alternator-basic.yaml @@ -30,17 +30,7 @@ target: secretKey: dummy streamChanges: false -renames: [] - # Below are unused but mandatory settings savepoints: path: /app/savepoints intervalSeconds: 300 -skipTokenRanges: [] -validation: - compareTimestamps: true - ttlToleranceMillis: 60000 - writetimeToleranceMillis: 1000 - failuresToFetch: 100 - floatingPointTolerance: 0.001 - timestampMsTolerance: 0 diff --git a/tests/src/test/configurations/dynamodb-to-alternator-basic.yaml b/tests/src/test/configurations/dynamodb-to-alternator-basic.yaml index c319fd6f..d3da76d2 100644 --- a/tests/src/test/configurations/dynamodb-to-alternator-basic.yaml +++ b/tests/src/test/configurations/dynamodb-to-alternator-basic.yaml @@ -21,13 +21,11 @@ target: secretKey: dummy streamChanges: false -renames: [] - # Below are unused but mandatory settings savepoints: path: /app/savepoints intervalSeconds: 300 -skipTokenRanges: [] + validation: compareTimestamps: true ttlToleranceMillis: 60000 diff --git a/tests/src/test/configurations/dynamodb-to-alternator-issue-103.yaml b/tests/src/test/configurations/dynamodb-to-alternator-issue-103.yaml index be2873af..e729036b 100644 --- a/tests/src/test/configurations/dynamodb-to-alternator-issue-103.yaml +++ b/tests/src/test/configurations/dynamodb-to-alternator-issue-103.yaml @@ -24,17 +24,7 @@ target: maxMapTasks: 1 streamChanges: false -renames: [] - # Below are unused but mandatory settings savepoints: path: /app/savepoints intervalSeconds: 300 -skipTokenRanges: [] -validation: - compareTimestamps: true - ttlToleranceMillis: 60000 - writetimeToleranceMillis: 1000 - failuresToFetch: 100 - floatingPointTolerance: 0.001 - timestampMsTolerance: 0 diff --git a/tests/src/test/configurations/dynamodb-to-alternator-renames.yaml b/tests/src/test/configurations/dynamodb-to-alternator-renames.yaml index 8d1411f7..b4ae1841 100644 --- a/tests/src/test/configurations/dynamodb-to-alternator-renames.yaml +++ b/tests/src/test/configurations/dynamodb-to-alternator-renames.yaml @@ -31,11 +31,3 @@ renames: savepoints: path: /app/savepoints intervalSeconds: 300 -skipTokenRanges: [] -validation: - compareTimestamps: true - ttlToleranceMillis: 60000 - writetimeToleranceMillis: 1000 - failuresToFetch: 100 - floatingPointTolerance: 0.001 - timestampMsTolerance: 0 diff --git a/tests/src/test/configurations/parquet-to-scylla-basic.yaml b/tests/src/test/configurations/parquet-to-scylla-basic.yaml index 42d3fc88..d2870a70 100644 --- a/tests/src/test/configurations/parquet-to-scylla-basic.yaml +++ b/tests/src/test/configurations/parquet-to-scylla-basic.yaml @@ -16,16 +16,6 @@ target: connections: 16 stripTrailingZerosForDecimals: false -renames: [] - savepoints: path: /app/savepoints intervalSeconds: 300 -skipTokenRanges: [] -validation: - compareTimestamps: true - ttlToleranceMillis: 60000 - writetimeToleranceMillis: 1000 - failuresToFetch: 100 - floatingPointTolerance: 0.001 - timestampMsTolerance: 0 diff --git a/tests/src/test/configurations/scylla-to-scylla-basic.yaml b/tests/src/test/configurations/scylla-to-scylla-basic.yaml index 411e0b3e..f5217776 100644 --- a/tests/src/test/configurations/scylla-to-scylla-basic.yaml +++ b/tests/src/test/configurations/scylla-to-scylla-basic.yaml @@ -28,16 +28,6 @@ target: connections: 16 stripTrailingZerosForDecimals: false -renames: [] - savepoints: path: /app/savepoints intervalSeconds: 300 -skipTokenRanges: [] -validation: - compareTimestamps: true - ttlToleranceMillis: 60000 - writetimeToleranceMillis: 1000 - failuresToFetch: 100 - floatingPointTolerance: 0.001 - timestampMsTolerance: 0 From 170f4cbacf8dbf12cfc46c2aab5fae2e11b2a4ae Mon Sep 17 00:00:00 2001 From: Julien Richard-Foy Date: Sat, 13 Jul 2024 14:59:05 +0200 Subject: [PATCH 3/3] Update documentation --- docs/source/configuration.rst | 10 +++++----- docs/source/rename-columns.rst | 2 ++ 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/docs/source/configuration.rst b/docs/source/configuration.rst index 0ffca0d6..ad188854 100644 --- a/docs/source/configuration.rst +++ b/docs/source/configuration.rst @@ -26,16 +26,16 @@ The configuration file requires the following top-level properties (ie, with no # Target configuration target: # ... - # Columns to rename + # Optional - Columns to rename renames: # ... # Savepoints configuration savepoints: # ... - # Validator configuration + # Validator configuration. Required only if the app is executed in validation mode. validation: # ... - # Used internally + # Optional- Used internally skipTokenRanges: [] These top-level properties are documented in the following sections (except ``skipTokenRanges``, which is used internally). @@ -319,7 +319,7 @@ DynamoDB Target Renames ------- -The ``renames`` property lists the item columns to rename along the migration. To not rename any columns, use the empty array ``renames: []``. +The optional ``renames`` property lists the item columns to rename along the migration. .. code-block:: yaml @@ -347,7 +347,7 @@ When migrating data over CQL-compatible storages, the migrator is able to resume Validation ---------- -The properties of the ``validation`` field are used only when the application is executed in :doc:`validation mode `. +The ``validation`` field and its properties are mandatory only when the application is executed in :doc:`validation mode `. .. code-block:: yaml diff --git a/docs/source/rename-columns.rst b/docs/source/rename-columns.rst index 22f0fee9..bf7ce0b3 100644 --- a/docs/source/rename-columns.rst +++ b/docs/source/rename-columns.rst @@ -13,3 +13,5 @@ Indicate in the migration configuration which columns to rename with the ``renam to: bar - from: xxx to: yyy + +To not perform any renames, leave out the ``renames`` property from the configuration file.