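Fixed a bug in binary BN data sampling: the CSV header now uses the network's node labels instead of integer indices. Fixed a bug in the graph estimation module: output directories are now identified by graph_id, parameters_id, data_id and seed instead of a running dataset index.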

felixleopoldo · felixleopoldo · commit 080de64d85c3 · 2025-05-31T09:09:28.000+02:00
diff --git a/workflow/rules/data/iid/rules.smk b/workflow/rules/data/iid/rules.smk
@@ -4,7 +4,8 @@
 
 rule sample_bin_bn_data:
     input:
-        bn="{output_dir}/parameters/bin_bn/{bn}/adjmat=/{adjmat}.rds"
+        bn="{output_dir}/parameters/bin_bn/{bn}/adjmat=/{adjmat}.rds",
+        script="workflow/rules/data/iid/sample_data_with_range_header.R"
     output:
         data="{output_dir}/data" \
              "/adjmat=/{adjmat}"\
diff --git a/workflow/rules/data/iid/sample_data_with_range_header.R b/workflow/rules/data/iid/sample_data_with_range_header.R
@@ -38,8 +38,7 @@ bindata <- generatebinaryBN.data(n = n, binaryBN = bn, samplesize = samples)
 myrow <- rep(2, n)
 bindata_range_header <- data.frame(rbind(myrow, as.matrix(bindata)))
 
-# TODO: Should take the colun names from bn
-colnames(bindata_range_header) <- seq(n)
+colnames(bindata_range_header) <- colnames(bn$adj) #seq(n)
 write.table(bindata_range_header,
     file = filename, row.names = FALSE, quote = FALSE,
     col.names = TRUE, sep = ","
diff --git a/workflow/rules/evaluation/graph_estimation/rules.smk b/workflow/rules/evaluation/graph_estimation/rules.smk
@@ -143,11 +143,11 @@ features = {
     "csvs": {"ext":"csv", "argstring":"", "filename":"adjmat"}
 }
 
-# Since we have a lot of different data setups and algs, we need to create a rule for 
+# Since we have a lot of different data setups and algs, we need to create a rule for
 # each combination of them.
 
 for bmark_setup in config["benchmark_setup"]:
-    
+
     graph_estimation = bmark_setup["evaluation"]["graph_estimation"]
     graph_types = graph_estimation["convert_to"] is not None and graph_estimation["convert_to"] or []
     graph_types += ["original"]
@@ -158,14 +158,14 @@ for bmark_setup in config["benchmark_setup"]:
             for alg in active_algorithms(bmark_setup, eval_method="graph_estimation"):
                 data_index = 0
                 # We want one folder per data setup, so we create one rule for each of them.
-                
+
                 for sim_setup in bmark_setup["data"]:
-                    for seed in get_seed_range(sim_setup["seed_range"]):  
-                        
-                        adjmat_strings = gen_adjmat_string_from_conf(sim_setup["graph_id"], seed) 
+                    for seed in get_seed_range(sim_setup["seed_range"]):
+
+                        adjmat_strings = gen_adjmat_string_from_conf(sim_setup["graph_id"], seed)
                         parameters_strings = gen_parameter_string_from_conf(sim_setup["parameters_id"], seed)
                         data_strings = gen_data_string_from_conf(sim_setup["data_id"], seed, seed_in_path=False)
-                        
+
 
                         if adjmat_strings is None:
                             adjmat_strings = [None]
@@ -188,10 +188,9 @@ for bmark_setup in config["benchmark_setup"]:
                         for adjmat_string in adjmat_strings:
                             for parameters_string in parameters_strings:
                                 for data_string in data_strings:
-                                    #print(bmark_setup)
-                                    rule:   
-                                        name: 
-                                            "results/output/"+bmark_setup_title+"/graph_estimation/dataset_"+str(data_index+1)+"/"+alg+"/graph_type="+graph_type+"/"+feature 
+                                    rule:
+                                        name:
+                                            "results/output/"+bmark_setup_title+"/graph_estimation/dataset_"+str(sim_setup["graph_id"]) + "_" + str(sim_setup["parameters_id"]) + "_" + str(sim_setup["data_id"]) + "_" + str(seed)+"/"+alg+"/graph_type="+graph_type+"/"+feature
                                         input:
                                             conf=configfilename,
                                             graphs=eval_module_conf_to_feature_files_data(filename=feature_dict["filename"],
@@ -206,28 +205,31 @@ for bmark_setup in config["benchmark_setup"]:
                                                                                             data_string=data_string,
                                                                                             alg=alg,
                                                                                             bmark_setup=bmark_setup)
-                                                                                            
+
                                         output:
-                                            touch("results/output/"+bmark_setup_title+"/graph_estimation/dataset_"+str(data_index+1)+"/graph_type="+graph_type+"/"+feature+"/"+alg+".done")
+                                            touch("results/output/"+bmark_setup_title+"/graph_estimation/graph_id=" + str(sim_setup["graph_id"]) + "_parameters_id=" + str(sim_setup["parameters_id"]) + "_data_id=" + str(sim_setup["data_id"]) + "_seed=" + str(seed) +"/graph_type="+graph_type+"/"+feature+"/"+alg+".done")
                                             
                                         params:
                                             graph_type=graph_type,
                                             data_index=str(data_index+1),
                                             feature=feature,
                                             ext=feature_dict["ext"],
                                             alg=alg,
-                                            bmark_setup=bmark_setup_title
+                                            bmark_setup=bmark_setup_title,
+                                            output_dir="results/output/"+bmark_setup_title+"/graph_estimation/graph_id="+ str(sim_setup["graph_id"]) + "_parameters_id=" + str(sim_setup["parameters_id"]) + "_data_id=" + str(sim_setup["data_id"]) + "_seed=" + str(seed) +"/graph_type="+graph_type+"/"+feature+"/"+alg
+
+                                        run:
 
-                                        run:                                    
-                                            output_dir = "results/output/{params.bmark_setup}/graph_estimation/dataset_"+params["data_index"]+"/graph_type="+params["graph_type"]+"/"+params["feature"]+"/"+params["alg"]
                                             # clean old file while keeping the directory
                                             # check if the directory exists
-                                            if Path(output_dir).exists():
+                                            if Path(params["output_dir"]).exists():
                                                 # remove all files in the directory
-                                                [f.unlink() for f in Path(output_dir).glob("*.png") ]
-                                            for i, f in enumerate(input.graphs):                                            
-                                                shell("mkdir -p " + output_dir)                                                                                                                                    
-                                                shell("cp "+f+" " + output_dir + "/"+params["alg"]+"_"+params["graph_type"]+"_" +str(i+1) +"."+params["ext"])
+                                                [f.unlink() for f in Path(params["output_dir"]).glob("*.png") ]
+
+                                            # Copy each graph file (one per parameter setting) into the output directory.
+                                            for j, f in enumerate(input.graphs):
+                                                shell("mkdir -p " + params["output_dir"])
+                                                shell("cp "+f+" " + params["output_dir"] + "/"+params["alg"]+"_"+params["graph_type"]+ "_" + str(j+1) +"."+params["ext"])
 
                                     data_index += 1
 
diff --git a/workflow/rules/helper_functions.py b/workflow/rules/helper_functions.py
@@ -89,6 +89,7 @@ def active_algorithms(bmark_setup, eval_method="benchmarks"):
 
     return list(set(algs))
 
+import pprint as pp
 
 def get_active_rules(wildcards):
     """
@@ -106,33 +107,21 @@ def get_active_rules(wildcards):
             graph_types = evaluation["graph_estimation"]["convert_to"] if evaluation["graph_estimation"]["convert_to"] != None else ["original"]
             graph_types += ["original"]
 
-            # go through all active features and create a done file for each.
-
+            # Go through all active features and create a .done file for each.
             for feature, isactive in evaluation["graph_estimation"].items():
 
                 # These are not features, so skip
                 if feature in ["ids", "convert_to"]:
                     continue
 
                 if isactive == True:
-                    # Cound the data setups and create a done file for each.
-                    n_comb = 0
                     for sim_setup in bmark_setup["data"]:
-                        seed=get_seed_range(sim_setup["seed_range"])
-                        adjmat=gen_adjmat_string_from_conf(sim_setup["graph_id"], seed),
-                        parameters=gen_parameter_string_from_conf(sim_setup["parameters_id"], seed),
-                        data=gen_data_string_from_conf(sim_setup["data_id"], seed, seed_in_path=False)
-
-                        # count total number of combinations of the three above
-                        n_data = len(data) if isinstance(data, list) and len(data) != 0 else 1
-                        n_parameters = len(parameters) if isinstance(parameters, list) and parameters != [] else 1
-                        n_adjmat = len(adjmat) if isinstance(adjmat, list) and adjmat != [] else 1
-                        n_comb += n_data*n_parameters*n_adjmat if n_data*n_parameters*n_adjmat != 0 else 1
-
-                    for data_index in range(n_comb):
-                        for alg in active_algorithms(bmark_setup, eval_method="graph_estimation"):
-                            for graph_type in graph_types:
-                                rules.append("results/output/"+bmark_setup_title+"/graph_estimation/dataset_"+str(data_index+1)+"/graph_type="+graph_type+"/"+feature+"/"+alg+".done")
+                        seed_range=get_seed_range(sim_setup["seed_range"])                                                
+                        for seed in seed_range:                    
+                            dataset = str("graph_id=" + str(sim_setup["graph_id"]) + "_parameters_id=" + str(sim_setup["parameters_id"]) + "_data_id=" + str(sim_setup["data_id"]) + "_seed=" + str(seed))
+                            for alg in active_algorithms(bmark_setup, eval_method="graph_estimation"):
+                                for graph_type in graph_types:
+                                    rules.append("results/output/"+bmark_setup_title+"/graph_estimation/"+dataset+"/graph_type="+graph_type+"/"+feature+"/"+alg+".done")
 
         # mcmc_traj_plots
         if "mcmc_traj_plots" in evaluation and len(evaluation["mcmc_traj_plots"]) > 0:
diff --git a/workflow/rules/parameters/bin_bn/sample_bayesian_network_for_dag.R b/workflow/rules/parameters/bin_bn/sample_bayesian_network_for_dag.R
@@ -53,13 +53,19 @@ filename_dag <- argv$filename_dag
 
 adjmat <- read.csv(filename_dag, check.names = FALSE)
 n <- dim(adjmat)[2]
+labels <- colnames(adjmat)
+rownames(adjmat) <- colnames(adjmat)
+
+# Use integer labels while building the DAG; the original labels are restored after the BN is generated (see below).
 rownames(adjmat) <- seq(n)
 colnames(adjmat) <- seq(n)
 
-DAG <- adjacency2dag(adjmat)
+DAG <- adjacency2dag(adjmat)#, nodes = colnames(adjmat))
 
-## TODO: This should pass the column names as well.
 set.seed(seed_number)
 binBN <- generateBinaryBN(DAG, c(argv$min, argv$max))
+# Restore the original node labels on both the DAG and the adjacency matrix.
+nodes(binBN$DAG) <- labels
+colnames(binBN$adj) <- labels
 
 saveRDS(binBN, file = filename)