diff --git a/3a_error_buildmodel.txt b/3a_error_buildmodel.txt new file mode 100644 index 0000000..de22f8d --- /dev/null +++ b/3a_error_buildmodel.txt @@ -0,0 +1,91 @@ +Training models. Currently training model for time horizon 0: 0%| | 0/17 [00:00 in +----> 1 modeler.build_model() + +/databricks/python/lib/python3.8/site-packages/fifeforspark/lgb_modelers.py in build_model(self, n_intervals, params) + 75 else: + 76 self.n_intervals = self.set_n_intervals() +---> 77 self.model = self.train( + 78 params=params + 79 ) + +/databricks/python/lib/python3.8/site-packages/fifeforspark/lgb_modelers.py in train(self, params, subset) + 98 for time_horizon in pbar: + 99 pbar.set_description(f"Training models. Currently training model for time horizon {time_horizon}") +--> 100 model = self.train_single_model( + 101 time_horizon=time_horizon, + 102 params=params, + +/databricks/python/lib/python3.8/site-packages/fifeforspark/lgb_modelers.py in train_single_model(self, time_horizon, params, subset) + 157 feature_columns = [column + "_index" for column in self.categorical_features] + self.numeric_features + 158 assembler = VectorAssembler(inputCols=feature_columns, outputCol='features').setHandleInvalid("keep") +--> 159 lgb_model = lgb(featuresCol="features", + 160 labelCol="_label", + 161 **params[time_horizon], + +/databricks/spark/python/pyspark/__init__.py in wrapper(self, *args, **kwargs) + 112 raise TypeError("Method %s forces keyword arguments." % func.__name__) + 113 self._input_kwargs = kwargs +--> 114 return func(self, **kwargs) + 115 return wrapper + 116 + +/local_disk0/spark-2f3bfc77-1c83-4bc9-8574-1927148b86e1/userFiles-8733ba33-5b83-482c-8464-95134948642b/addedFile2496447546510535567mmlspark_2_11_1_0_0_rc3-86e1e.jar/mmlspark/lightgbm/_LightGBMClassifier.py in __init__(self, baggingFraction, baggingFreq, baggingSeed, binSampleCount, boostFromAverage, boostingType, categoricalSlotIndexes, categoricalSlotNames, defaultListenPort, driverListenPort, earlyStoppingRound, featureFraction, featuresCol, featuresShapCol, improvementTolerance, initScoreCol, isProvideTrainingMetric, isUnbalance, labelCol, lambdaL1, lambdaL2, leafPredictionCol, learningRate, maxBin, maxBinByFeature, maxDeltaStep, maxDepth, metric, minDataInLeaf, minGainToSplit, minSumHessianInLeaf, modelString, negBaggingFraction, numBatches, numIterations, numLeaves, numTasks, objective, parallelism, posBaggingFraction, predictionCol, probabilityCol, rawPredictionCol, repartitionByGroupingColumn, slotNames, thresholds, timeout, topK, useBarrierExecutionMode, validationIndicatorCol, verbosity, weightCol) + 81 def __init__(self, baggingFraction=1.0, baggingFreq=0, baggingSeed=3, binSampleCount=200000, boostFromAverage=True, boostingType="gbdt", categoricalSlotIndexes=[], categoricalSlotNames=[], defaultListenPort=12400, driverListenPort=0, earlyStoppingRound=0, featureFraction=1.0, featuresCol="features", featuresShapCol="", improvementTolerance=0.0, initScoreCol=None, isProvideTrainingMetric=False, isUnbalance=False, labelCol="label", lambdaL1=0.0, lambdaL2=0.0, leafPredictionCol="", learningRate=0.1, maxBin=255, maxBinByFeature=[], maxDeltaStep=0.0, maxDepth=-1, metric="", minDataInLeaf=20, minGainToSplit=0.0, minSumHessianInLeaf=0.001, modelString="", negBaggingFraction=1.0, numBatches=0, numIterations=100, numLeaves=31, numTasks=0, objective="binary", parallelism="data_parallel", posBaggingFraction=1.0, predictionCol="prediction", probabilityCol="probability", rawPredictionCol="rawPrediction", repartitionByGroupingColumn=True, 
slotNames=[], thresholds=None, timeout=1200.0, topK=20, useBarrierExecutionMode=False, validationIndicatorCol=None, verbosity=1, weightCol=None): + 82 super(_LightGBMClassifier, self).__init__() +---> 83 self._java_obj = self._new_java_obj("com.microsoft.ml.spark.lightgbm.LightGBMClassifier") + 84 self.baggingFraction = Param(self, "baggingFraction", "baggingFraction: Bagging fraction (default: 1.0)") + 85 self._setDefault(baggingFraction=1.0) + +/databricks/spark/python/pyspark/ml/wrapper.py in _new_java_obj(java_class, *args) + 64 java_obj = getattr(java_obj, name) + 65 java_args = [_py2java(sc, arg) for arg in args] +---> 66 return java_obj(*java_args) + 67 + 68 @staticmethod + +/databricks/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py in __call__(self, *args) + 1566 + 1567 answer = self._gateway_client.send_command(command) +-> 1568 return_value = get_return_value( + 1569 answer, self._gateway_client, None, self._fqn) + 1570 + +/databricks/spark/python/pyspark/sql/utils.py in deco(*a, **kw) + 115 def deco(*a, **kw): + 116 try: +--> 117 return f(*a, **kw) + 118 except py4j.protocol.Py4JJavaError as e: + 119 converted = convert_exception(e.java_exception) + +/databricks/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name) + 324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client) + 325 if answer[1] == REFERENCE_TYPE: +--> 326 raise Py4JJavaError( + 327 "An error occurred while calling {0}{1}{2}.\n". + 328 format(target_id, ".", name), value) + +Py4JJavaError: An error occurred while calling None.com.microsoft.ml.spark.lightgbm.LightGBMClassifier. +: java.lang.NoClassDefFoundError: org/apache/spark/ml/util/MLWritable$class + at com.microsoft.ml.spark.lightgbm.LightGBMClassifier.(LightGBMClassifier.scala:25) + at com.microsoft.ml.spark.lightgbm.LightGBMClassifier.(LightGBMClassifier.scala:27) + at sun.reflect.NativeConstructorAccessorImpl.newInstance0(Native Method) + at sun.reflect.NativeConstructorAccessorImpl.newInstance(NativeConstructorAccessorImpl.java:62) + at sun.reflect.DelegatingConstructorAccessorImpl.newInstance(DelegatingConstructorAccessorImpl.java:45) + at java.lang.reflect.Constructor.newInstance(Constructor.java:423) + at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:247) + at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:380) + at py4j.Gateway.invoke(Gateway.java:250) + at py4j.commands.ConstructorCommand.invokeConstructor(ConstructorCommand.java:80) + at py4j.commands.ConstructorCommand.execute(ConstructorCommand.java:69) + at py4j.GatewayConnection.run(GatewayConnection.java:251) + at java.lang.Thread.run(Thread.java:748) +Caused by: java.lang.ClassNotFoundException: org.apache.spark.ml.util.MLWritable$class + at java.net.URLClassLoader.findClass(URLClassLoader.java:382) + at java.lang.ClassLoader.loadClass(ClassLoader.java:419) + at com.databricks.backend.daemon.driver.ClassLoaders$LibraryClassLoader.loadClass(ClassLoaders.scala:151) + at java.lang.ClassLoader.loadClass(ClassLoader.java:352) + ... 13 more \ No newline at end of file diff --git a/5b_error_evaluate.txt b/5b_error_evaluate.txt new file mode 100644 index 0000000..f1097c2 --- /dev/null +++ b/5b_error_evaluate.txt @@ -0,0 +1,17 @@ +Internal error, sorry. Attach your notebook to a different cluster or restart the current cluster. 
+java.lang.RuntimeException: abort: DriverClient destroyed + at com.databricks.backend.daemon.driver.DriverClient.$anonfun$poll$3(DriverClient.scala:440) + at scala.concurrent.Future.$anonfun$flatMap$1(Future.scala:307) + at scala.concurrent.impl.Promise.$anonfun$transformWith$1(Promise.scala:41) + at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:64) + at com.databricks.threading.NamedExecutor$$anon$2.$anonfun$run$1(NamedExecutor.scala:356) + at scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23) + at com.databricks.logging.UsageLogging.$anonfun$withAttributionContext$1(UsageLogging.scala:266) + at scala.util.DynamicVariable.withValue(DynamicVariable.scala:62) + at com.databricks.logging.UsageLogging.withAttributionContext(UsageLogging.scala:261) + at com.databricks.logging.UsageLogging.withAttributionContext$(UsageLogging.scala:258) + at com.databricks.threading.NamedExecutor.withAttributionContext(NamedExecutor.scala:285) + at com.databricks.threading.NamedExecutor$$anon$2.run(NamedExecutor.scala:355) + at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128) + at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628) + at java.base/java.lang.Thread.run(Thread.java:834) \ No newline at end of file diff --git a/5b_p2_error_evaluate.txt b/5b_p2_error_evaluate.txt new file mode 100644 index 0000000..7939a7f --- /dev/null +++ b/5b_p2_error_evaluate.txt @@ -0,0 +1,173 @@ +org.apache.spark.SparkException: Could not execute broadcast in 300 secs. You can increase the timeout for broadcasts via spark.sql.broadcastTimeout or disable broadcast join by setting spark.sql.autoBroadcastJoinThreshold to -1 +--------------------------------------------------------------------------- +Py4JJavaError Traceback (most recent call last) + in +----> 1 modeler.evaluate() + +/databricks/python/lib/python3.8/site-packages/fifeforspark/base_modelers.py in evaluate(self, subset, threshold_positive, share_positive) + 462 total = actuals.count() + 463 metrics.append( +--> 464 compute_metrics_for_binary_outcomes( + 465 actuals, + 466 predictions.select( + +/databricks/python/lib/python3.8/site-packages/fifeforspark/base_modelers.py in compute_metrics_for_binary_outcomes(actuals, predictions, total, threshold_positive, share_positive, cache) + 69 preds_and_labs = preds_and_labs.withColumn('rawPrediction', preds_and_labs.predictions.cast(DoubleType())) + 70 evaluator = BinaryClassificationEvaluator(labelCol='actuals') +---> 71 metrics['AUROC'] = evaluator.evaluate(preds_and_labs, {evaluator.metricName: "areaUnderROC"}) + 72 preds_and_labs = preds_and_labs.drop('rawPrediction') + 73 else: + +/databricks/spark/python/pyspark/ml/evaluation.py in evaluate(self, dataset, params) + 80 if isinstance(params, dict): + 81 if params: +---> 82 return self.copy(params)._evaluate(dataset) + 83 else: + 84 return self._evaluate(dataset) + +/databricks/spark/python/pyspark/ml/evaluation.py in _evaluate(self, dataset) + 118 """ + 119 self._transfer_params_to_java() +--> 120 return self._java_obj.evaluate(dataset._jdf) + 121 + 122 def isLargerBetter(self): + +/databricks/spark/python/lib/py4j-0.10.9-src.zip/py4j/java_gateway.py in __call__(self, *args) + 1302 + 1303 answer = self.gateway_client.send_command(command) +-> 1304 return_value = get_return_value( + 1305 answer, self.gateway_client, self.target_id, self.name) + 1306 + +/databricks/spark/python/pyspark/sql/utils.py in deco(*a, **kw) + 115 def deco(*a, **kw): + 116 try: +--> 117 
return f(*a, **kw) + 118 except py4j.protocol.Py4JJavaError as e: + 119 converted = convert_exception(e.java_exception) + +/databricks/spark/python/lib/py4j-0.10.9-src.zip/py4j/protocol.py in get_return_value(answer, gateway_client, target_id, name) + 324 value = OUTPUT_CONVERTER[type](answer[2:], gateway_client) + 325 if answer[1] == REFERENCE_TYPE: +--> 326 raise Py4JJavaError( + 327 "An error occurred while calling {0}{1}{2}.\n". + 328 format(target_id, ".", name), value) + +Py4JJavaError: An error occurred while calling o45291.evaluate. +: org.apache.spark.SparkException: Could not execute broadcast in 300 secs. You can increase the timeout for broadcasts via spark.sql.broadcastTimeout or disable broadcast join by setting spark.sql.autoBroadcastJoinThreshold to -1 + at org.apache.spark.sql.execution.exchange.BroadcastExchangeExec.doExecuteBroadcast(BroadcastExchangeExec.scala:243) + at org.apache.spark.sql.execution.InputAdapter.doExecuteBroadcast(WholeStageCodegenExec.scala:516) + at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeBroadcast$1(SparkPlan.scala:226) + at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:257) + at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:165) + at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:253) + at org.apache.spark.sql.execution.SparkPlan.executeBroadcast(SparkPlan.scala:222) + at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.prepareBroadcast(BroadcastHashJoinExec.scala:213) + at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.prepareRelation(BroadcastHashJoinExec.scala:231) + at org.apache.spark.sql.execution.joins.HashJoin.codegenInner(HashJoin.scala:450) + at org.apache.spark.sql.execution.joins.HashJoin.codegenInner$(HashJoin.scala:449) + at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.codegenInner(BroadcastHashJoinExec.scala:49) + at org.apache.spark.sql.execution.joins.HashJoin.doConsume(HashJoin.scala:358) + at org.apache.spark.sql.execution.joins.HashJoin.doConsume$(HashJoin.scala:356) + at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.doConsume(BroadcastHashJoinExec.scala:49) + at org.apache.spark.sql.execution.CodegenSupport.consume(WholeStageCodegenExec.scala:195) + at org.apache.spark.sql.execution.CodegenSupport.consume$(WholeStageCodegenExec.scala:150) + at org.apache.spark.sql.execution.ProjectExec.consume(basicPhysicalOperators.scala:43) + at org.apache.spark.sql.execution.ProjectExec.doConsume(basicPhysicalOperators.scala:79) + at org.apache.spark.sql.execution.CodegenSupport.consume(WholeStageCodegenExec.scala:195) + at org.apache.spark.sql.execution.CodegenSupport.consume$(WholeStageCodegenExec.scala:150) + at org.apache.spark.sql.execution.ColumnarToRowExec.consume(Columnar.scala:66) + at org.apache.spark.sql.execution.ColumnarToRowExec.doProduce(Columnar.scala:191) + at org.apache.spark.sql.execution.CodegenSupport.$anonfun$produce$1(WholeStageCodegenExec.scala:96) + at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:257) + at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:165) + at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:253) + at org.apache.spark.sql.execution.CodegenSupport.produce(WholeStageCodegenExec.scala:91) + at org.apache.spark.sql.execution.CodegenSupport.produce$(WholeStageCodegenExec.scala:91) + at org.apache.spark.sql.execution.ColumnarToRowExec.produce(Columnar.scala:66) + at 
org.apache.spark.sql.execution.ProjectExec.doProduce(basicPhysicalOperators.scala:53) + at org.apache.spark.sql.execution.CodegenSupport.$anonfun$produce$1(WholeStageCodegenExec.scala:96) + at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:257) + at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:165) + at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:253) + at org.apache.spark.sql.execution.CodegenSupport.produce(WholeStageCodegenExec.scala:91) + at org.apache.spark.sql.execution.CodegenSupport.produce$(WholeStageCodegenExec.scala:91) + at org.apache.spark.sql.execution.ProjectExec.produce(basicPhysicalOperators.scala:43) + at org.apache.spark.sql.execution.joins.HashJoin.doProduce(HashJoin.scala:353) + at org.apache.spark.sql.execution.joins.HashJoin.doProduce$(HashJoin.scala:352) + at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.doProduce(BroadcastHashJoinExec.scala:49) + at org.apache.spark.sql.execution.CodegenSupport.$anonfun$produce$1(WholeStageCodegenExec.scala:96) + at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:257) + at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:165) + at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:253) + at org.apache.spark.sql.execution.CodegenSupport.produce(WholeStageCodegenExec.scala:91) + at org.apache.spark.sql.execution.CodegenSupport.produce$(WholeStageCodegenExec.scala:91) + at org.apache.spark.sql.execution.joins.BroadcastHashJoinExec.produce(BroadcastHashJoinExec.scala:49) + at org.apache.spark.sql.execution.ProjectExec.doProduce(basicPhysicalOperators.scala:53) + at org.apache.spark.sql.execution.CodegenSupport.$anonfun$produce$1(WholeStageCodegenExec.scala:96) + at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:257) + at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:165) + at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:253) + at org.apache.spark.sql.execution.CodegenSupport.produce(WholeStageCodegenExec.scala:91) + at org.apache.spark.sql.execution.CodegenSupport.produce$(WholeStageCodegenExec.scala:91) + at org.apache.spark.sql.execution.ProjectExec.produce(basicPhysicalOperators.scala:43) + at org.apache.spark.sql.execution.WholeStageCodegenExec.doCodeGen(WholeStageCodegenExec.scala:657) + at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:720) + at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:213) + at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:257) + at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:165) + at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:253) + at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:209) + at org.apache.spark.sql.execution.columnar.CachedRDDBuilder.buildBuffers(InMemoryRelation.scala:253) + at org.apache.spark.sql.execution.columnar.CachedRDDBuilder.cachedColumnBuffers(InMemoryRelation.scala:222) + at org.apache.spark.sql.execution.columnar.InMemoryTableScanExec.filteredCachedBatches(InMemoryTableScanExec.scala:145) + at org.apache.spark.sql.execution.columnar.InMemoryTableScanExec.columnarInputRDD$lzycompute(InMemoryTableScanExec.scala:71) + at org.apache.spark.sql.execution.columnar.InMemoryTableScanExec.columnarInputRDD(InMemoryTableScanExec.scala:69) + at 
org.apache.spark.sql.execution.columnar.InMemoryTableScanExec.doExecuteColumnar(InMemoryTableScanExec.scala:160) + at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeColumnar$1(SparkPlan.scala:240) + at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:257) + at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:165) + at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:253) + at org.apache.spark.sql.execution.SparkPlan.executeColumnar(SparkPlan.scala:236) + at org.apache.spark.sql.execution.InputAdapter.doExecuteColumnar(WholeStageCodegenExec.scala:520) + at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeColumnar$1(SparkPlan.scala:240) + at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:257) + at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:165) + at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:253) + at org.apache.spark.sql.execution.SparkPlan.executeColumnar(SparkPlan.scala:236) + at org.apache.spark.sql.execution.ColumnarToRowExec.inputRDDs(Columnar.scala:202) + at org.apache.spark.sql.execution.ProjectExec.inputRDDs(basicPhysicalOperators.scala:49) + at org.apache.spark.sql.execution.WholeStageCodegenExec.doExecute(WholeStageCodegenExec.scala:748) + at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:213) + at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:257) + at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:165) + at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:253) + at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:209) + at org.apache.spark.sql.execution.DeserializeToObjectExec.doExecute(objects.scala:104) + at org.apache.spark.sql.execution.SparkPlan.$anonfun$execute$1(SparkPlan.scala:213) + at org.apache.spark.sql.execution.SparkPlan.$anonfun$executeQuery$1(SparkPlan.scala:257) + at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:165) + at org.apache.spark.sql.execution.SparkPlan.executeQuery(SparkPlan.scala:253) + at org.apache.spark.sql.execution.SparkPlan.execute(SparkPlan.scala:209) + at org.apache.spark.sql.execution.QueryExecution.toRdd$lzycompute(QueryExecution.scala:167) + at org.apache.spark.sql.execution.QueryExecution.toRdd(QueryExecution.scala:166) + at org.apache.spark.sql.Dataset.rdd$lzycompute(Dataset.scala:3323) + at org.apache.spark.sql.Dataset.rdd(Dataset.scala:3321) + at org.apache.spark.ml.evaluation.BinaryClassificationEvaluator.getMetrics(BinaryClassificationEvaluator.scala:133) + at org.apache.spark.ml.evaluation.BinaryClassificationEvaluator.evaluate(BinaryClassificationEvaluator.scala:100) + at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method) + at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62) + at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43) + at java.lang.reflect.Method.invoke(Method.java:498) + at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244) + at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:380) + at py4j.Gateway.invoke(Gateway.java:295) + at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132) + at py4j.commands.CallCommand.execute(CallCommand.java:79) + at py4j.GatewayConnection.run(GatewayConnection.java:251) + at java.lang.Thread.run(Thread.java:748) +Caused by: 
java.util.concurrent.TimeoutException + at java.util.concurrent.CompletableFuture.timedGet(CompletableFuture.java:1784) + at java.util.concurrent.CompletableFuture.get(CompletableFuture.java:1928) + at org.apache.spark.sql.execution.exchange.BroadcastExchangeExec.doExecuteBroadcast(BroadcastExchangeExec.scala:232) + ... 110 more \ No newline at end of file diff --git a/README.md b/README.md index 2172966..f6fc0eb 100644 --- a/README.md +++ b/README.md @@ -22,22 +22,22 @@ Suppose you have a dataset that looks like this: The entities with IDs 0, 2, and 4 are observed in the dataset in 2019. -While FIFE offers a significantly larger suite of models designed to answer a variety of questions, FIFEforSpark is mainly focused on one question: what are each of their probabilities of being observed in any future year? Fortunately, FIFEforSpark can estimate answers to these questions for any unbalanced panel dataset. +While FIFE offers a significantly larger suite of models designed to answer a variety of questions, FIFEforSpark is mainly focused on one question: what are each of their <[CO comment 1a](_review.md)> probabilities of being observed in any future year? Fortunately, FIFEforSpark can estimate answers to these questions for any unbalanced panel dataset. Exactly like FIFE, FIFEforSpark unifies survival analysis and multivariate time series analysis. Tools for the former neglect future states of survival; tools for the latter neglect the possibility of discontinuation. Traditional forecasting approaches for each, such as proportional hazards and vector autoregression (VAR), respectively, impose restrictive functional forms that limit forecasting performance. FIFEforSpark supports one of the best approaches for maximizing forecasting performance: gradient-boosted trees (using MMLSpark's LightGBM). -FIFEforSpark is simple to use and the syntax is almost identical to that of FIFE; however, given that this is meant to be run in the Spark environment in a Python notebook, there are some notable differences. First, the package 'mmlspark' must already be installed and attached to the cluster. Unfortunately, the PyPI version of MMLSpark is not compatible with FIFEforSpark. As such, FIFE is best utilized in a Databricks notebook. +FIFEforSpark is simple to use and the syntax is almost identical to that of FIFE; however, given that this is meant to be run in the Spark environment in a Python notebook, there are some notable differences. First, the package 'mmlspark' must already be installed and attached to the cluster. <[CO comment 1b](_review.md)> Unfortunately, the PyPI version of MMLSpark is not compatible with FIFEforSpark. As such, FIFE is best utilized in a Databricks notebook. <[CO comment 1c](_review.md)> -If you are working in a Python IDE and have pyspark installed, you can run the following. Again, the PyPI version of MMLSpark is not complete and will cause this code to fail; however, if MMLSpark is installed correctly, the following code should work. +If you are working in a Python IDE and have pyspark installed <[CO comment 1d](_review.md)>, you can run the following. Again, the PyPI version of MMLSpark is not complete and will cause this code to fail; however, if MMLSpark is installed correctly, the following code should work. 
```python -import findspark +import findspark <[CO comment 1e](_review.md)> findspark.init() import pyspark # only run after findspark.init() from pyspark.sql import SparkSession from fifeforspark.processors import PanelDataProcessor -from fifeforspark.lgb_modelers import LGBSurvivalModeler +from fifeforspark.lgb_modelers import LGBSurvivalModeler spark = SparkSession.builder.getOrCreate() data_processor = PanelDataProcessor(data=spark.read.csv(path_to_your_data)) @@ -51,6 +51,7 @@ forecasts = modeler.forecast() If you are working in a Databricks python notebook, you may run something like the following code, where 'your_table' is the name of your table. +<[CO comment 1f](_review.md)> ```python from fifeforspark.processors import PanelDataProcessor from fifeforspark.lgb_modelers import LGBSurvivalModeler diff --git a/_review.md b/_review.md new file mode 100644 index 0000000..3954dc0 --- /dev/null +++ b/_review.md @@ -0,0 +1,111 @@ +CO Review Comments + +General feedback: + +- Recommend making documentation more concise to make it easy for users to find the information they care about +(e.g., you can provide links to sources that give background info on Apache Spark, PySpark, and MMLSpark +for users who may be unfamiliar, and allow users who are familiar to bypass; can also streamline the language +throughout) + +- Recommend clarifying the installation/set-up process (e.g., if you recommend Databricks, provide detailed +instructions for how users can install MMLSpark and FIFEforSpark and attach to cluster in Databricks) + +- Due to the obscurity of error messages in Spark, recommend creating a separate page in the documentation to compile +commonly encountered error messages, along with potential causes and fixes (doesn't have to be comprehensive, +just a place for users to start troubleshooting; can add to the list as users encounter and resolve new errors) + +- Is `rf_modelers.py` ready for primetime? I see it in the `random_forest` branch, but not on `main`. (Also, I suspect +that comment 4b (below) will apply to this module/class as well.) + + +1) README.md + + a) Not obvious who "their" refers to in this statement + + b) It would be very helpful for users if you provide instructions for how to install MMLSpark and how + to attach it to a cluster in Databricks (or at least link to the SynapseML repository on Github) + (edit: found some instructions in the formal docs which were helpful, but think it would still be + good to include in the README) + + c) Recommend including instructions for how to install FIFEforSpark in the README + + d) This statement is confusing to me; is it best to use FIFEforSpark in Databricks, or in a Python IDE? 
+ In either case, it's not clear how to get set up and install FIFEforSpark and all of the required dependencies + + e) I get a "ModuleNotFoundError: No module named 'findspark'" error message when running `import findspark` + in Databricks (using Databricks Runtime 8.3/Spark 3.1.1/Scala 2.12) + + f) This code block is mostly redundant (and the user has to spend time trying to identify the difference); + recommend eliminating all but the line that applies to Databricks + (e.g., `data_processor = PanelDataProcessor(data = spark.sql("select * from your_table"))`; a consolidated sketch is in the addendum at the end of this review) + +2) utils.py + + a) `create_example_data1` function is slow; also not clear to the user what the difference is between + the `create_example_data1` and `create_example_data2` functions + +3) lgb_modelers.py + + a) [9/10 Update] Per Ed's recommendations, switching to the 9.0 Databricks Runtime and the + com.microsoft.ml.spark:mmlspark_2.12:1.0.0-rc3-59-bf337941-SNAPSHOT Maven coordinates resolved the error. + These coordinates are not the most recent listed on the [MMLSpark GitHub repository](https://github.com/microsoft/SynapseML), + though; recommend addressing this in the documentation (in a general way, e.g., "encountering the following error may + indicate an incompatibility between the Databricks Runtime and the MMLSpark Maven coordinates", rather than + trying to list all combinations of runtimes and coordinates that do/don't work) + + [Original comment] Encountering the following error when running `LGBSurvivalModeler(data=data_processor.data).build_model()`: + `java.lang.NoClassDefFoundError: org/apache/spark/ml/util/MLWritable$class`; note that importing + `fifeforspark.lgb_modelers` succeeds without triggering the warning that MMLSpark could not be imported; + using the 8.3 Databricks Runtime and the mmlspark_2.11-1.0.0-rc3 Maven coordinates for MMLSpark; full traceback is in + [3a_error_buildmodel.txt](3a_error_buildmodel.txt) + + b) The `tqdm` package is not included in the 8.3 Databricks Runtime, and trying to import LGBSurvivalModeler + from fifeforspark.lgb_modelers returns the following error: `ModuleNotFoundError: No module named 'tqdm'`; + not a huge issue as users can install it themselves from PyPI (a notebook-scoped install one-liner is in the addendum), but it could be helpful to list in one place + all additional packages that FIFE users need to install to make the switch to FIFEforSpark + +4) gbt_modelers.py + + a) Update the docstring of the `train_single_model` function in the `GBTModeler` class to remove the reference to LightGBM + + b) Assuming that GBTModeler was implemented to provide an alternative to LightGBM (due to the difficulties of + installing MMLSpark), it's a bit strange that GBTModeler inherits from the LGBModeler class, and that + you therefore need to import the LGBModeler class from fifeforspark.lgb_modelers for this module to work. + Is there an easy way to uncouple the LGBModeler and GBTModeler classes? Alternatively, do they really need + to be two separate classes, or should there be one "tree-based modeler" class to which users can pass + an argument for whether they want to use LightGBM or PySpark's GBTClassifier (under the hood)? + +5) base_modelers.py + + a) Is the attribute set in line 176 (`self.spark = SparkSession.builder.getOrCreate()`) used anywhere else?
+ + b) [9/14 Update] Per Ed's recommendations, tried the `evaluate` method after setting + LGBSurvivalModeler(config={'CACHE':True}); encountered a DriverClient destroyed error (may not necessarily + be related); full traceback is in [5b_error_evaluate.txt](5b_error_evaluate.txt); on re-run, encountered the + SparkException broadcast-timeout error in [5b_p2_error_evaluate.txt](5b_p2_error_evaluate.txt) (a sketch of the + mitigations suggested by that error message is in the addendum at the end of this review) + + The `evaluate` method is very slow, even for a relatively small dataset; projected time is 2+ hours for + a (10k person x 48 period) dataset, whereas data processing took 37 seconds and model training/producing + forecasts took 11 minutes (I think this issue exists with base FIFE as well though) + +6) fife-for-spark.readthedocs.io + + a) Recommend removing the instructions for installing fifeforspark from behind a firewall + [here](https://fife-for-spark.readthedocs.io/en/latest/user_guide.html) (the ability to set a trusted host + may depend on the user's particular circumstances, e.g., security considerations) + + b) Recommend moving "Introduction to Survival Analysis" to a separate page, and linking to it from the + Quickstart section. Would also recommend the same for "Configuration Parameters" + (with the goal of organizing information in such a way that users with varying levels + of familiarity can quickly navigate through the information and find what is most relevant to them) + + c) Recommend removing footnotes from the documentation and linking directly to the sources in the text + instead (i.e., in-line) + + d) The link to the image of the MMLSpark_Maven coordinates + ([here](https://fife-for-spark.readthedocs.io/en/latest/spark_help.html#how-to-download-mmlspark)) + directs me to a webpage with the message "SORRY This page does not exist yet." + + e) Nitpick: the user guide notes that the model produced by FIFEforSpark is a list of models, exactly like FIFE + ([here](https://fife-for-spark.readthedocs.io/en/latest/user_guide.html#lgbsurvivalmodeler)); this is + not necessarily the case if FIFE users are using non-tree-based modelers
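
Addendum: suggested snippets (rough, untested sketches)

Re comment 1f: a sketch of what a single, Databricks-only code block in the README might look like. The imports and the `spark.sql("select * from your_table")` line come from the README and comment 1f; the `build_processed_data()` step is assumed to mirror base FIFE's API, so adjust if FIFEforSpark names that step differently.

```python
from fifeforspark.processors import PanelDataProcessor
from fifeforspark.lgb_modelers import LGBSurvivalModeler

# In a Databricks notebook, `spark` (a SparkSession) is already defined,
# so the panel can be read straight from a registered table.
data_processor = PanelDataProcessor(data=spark.sql("select * from your_table"))
data_processor.build_processed_data()  # assumed to mirror base FIFE's processing step

modeler = LGBSurvivalModeler(data=data_processor.data)
modeler.build_model()

forecasts = modeler.forecast()
```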
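Re comment 3b: one low-friction way for users to pull in the missing `tqdm` dependency is a notebook-scoped install in its own Databricks cell (assumes the cluster can reach PyPI); a cluster library install would work as well.

```python
%pip install tqdm
```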
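Re comment 5b: the SparkException in [5b_p2_error_evaluate.txt](5b_p2_error_evaluate.txt) itself names two possible mitigations. Below is a minimal sketch of applying them to the active session before calling `evaluate`; the timeout value is illustrative, and neither setting addresses the underlying slowness of `evaluate`.

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Give broadcast exchanges more than the default 300 seconds (value is illustrative)
spark.conf.set("spark.sql.broadcastTimeout", "1200")

# Or, as the error message suggests, disable automatic broadcast joins entirely
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", "-1")
```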