diff --git a/benchmark/experimentation_bench/cloud_infra/ground_truth/additional/q1.txt b/benchmark/experimentation_bench/cloud_infra/ground_truth/additional/q1.txt new file mode 100644 index 0000000..a57e0bc --- /dev/null +++ b/benchmark/experimentation_bench/cloud_infra/ground_truth/additional/q1.txt @@ -0,0 +1,18 @@ +# Design: + +{ + "constant_vars": [ + "application_type=e-commerce web application", + "function_tested=add_to_cart", + "concurrent_requests=500", + "test_type=network I/O performance" + ], + "independent_vars": [ + "ec2_instance_type=[c5.large, c5.xlarge, c5.2xlarge, c5.4xlarge, c5.9xlarge]" + ], + "dependent_vars": [ + "99th percentile latency" + ] +} + +# Setup: refer to misc/optional_setup_q1_q2_q3_q4.txt \ No newline at end of file diff --git a/benchmark/experimentation_bench/cloud_infra/ground_truth/additional/q2.txt b/benchmark/experimentation_bench/cloud_infra/ground_truth/additional/q2.txt new file mode 100644 index 0000000..645f4a7 --- /dev/null +++ b/benchmark/experimentation_bench/cloud_infra/ground_truth/additional/q2.txt @@ -0,0 +1,19 @@ +# Design: + +{ + "constant_vars": [ + "application_type=e-commerce web application", + "function_tested=add_to_cart", + "concurrent_requests=500", + "test_type=network I/O performance" + ], + "independent_vars": [ + "ec2_instance_type=[c5.large, c5.xlarge, c5.2xlarge, c5.4xlarge, c5.9xlarge]" + ], + "dependent_vars": [ + "99th percentile latency under 150ms", + "total cost per hour" + ] +} + +# Setup: refer to misc/optional_setup_q1_q2_q3_q4.txt \ No newline at end of file diff --git a/benchmark/experimentation_bench/cloud_infra/ground_truth/additional/q3.txt b/benchmark/experimentation_bench/cloud_infra/ground_truth/additional/q3.txt new file mode 100644 index 0000000..2548e29 --- /dev/null +++ b/benchmark/experimentation_bench/cloud_infra/ground_truth/additional/q3.txt @@ -0,0 +1,19 @@ +# Design: + +{ + "constant_vars": [ + "application_type=e-commerce web application", + "function_tested=add_to_cart", + "concurrent_requests=500", + "test_type=network I/O performance" + ], + "independent_vars": [ + "ec2_instance_type=[c5.large, c5.xlarge, c5.2xlarge, c5.4xlarge, c5.9xlarge]" + ], + "dependent_vars": [ + "99th percentile latency under 150ms", + "total cost per hour" + ], +} + +# Setup: refer to misc/optional_setup_q1_q2_q3_q4.txt \ No newline at end of file diff --git a/benchmark/experimentation_bench/cloud_infra/ground_truth/additional/q4.txt b/benchmark/experimentation_bench/cloud_infra/ground_truth/additional/q4.txt new file mode 100644 index 0000000..4c0cecc --- /dev/null +++ b/benchmark/experimentation_bench/cloud_infra/ground_truth/additional/q4.txt @@ -0,0 +1,19 @@ +# Design: + +{ + "constant_vars": [ + "application_type=e-commerce web application", + "function_tested=add_to_cart", + "concurrent_requests=500", + "network_test=network I/O latency" + ], + "independent_vars": [ + "ec2_instance_type=[c5.large, c5.xlarge, c5.2xlarge, t3.nano, t3.micro, t3.small]" + ], + "dependent_vars": [ + "99th percentile latency", + "total cost per hour" + ], +} + +# Setup: refer to misc/optional_setup_q1_q2_q3_q4.txt \ No newline at end of file diff --git a/benchmark/experimentation_bench/cloud_infra/ground_truth/additional/q5.txt b/benchmark/experimentation_bench/cloud_infra/ground_truth/additional/q5.txt new file mode 100644 index 0000000..e0f5e80 --- /dev/null +++ b/benchmark/experimentation_bench/cloud_infra/ground_truth/additional/q5.txt @@ -0,0 +1,15 @@ +# Design: + +{ + "constant_vars": [ + "workload_type=compute-bound", + ], + 
"independent_vars": [ + "ec2_instance_type=[t3.medium, c5.large]" + ], + "dependent_vars": [ + "CPU throughput (events per second)", + ] +} + +# Setup: refer to misc/optional_setup_q5_q6_q7.txt \ No newline at end of file diff --git a/benchmark/experimentation_bench/cloud_infra/ground_truth/additional/q6.txt b/benchmark/experimentation_bench/cloud_infra/ground_truth/additional/q6.txt new file mode 100644 index 0000000..743fe6f --- /dev/null +++ b/benchmark/experimentation_bench/cloud_infra/ground_truth/additional/q6.txt @@ -0,0 +1,15 @@ +# Design: + +{ + "constant_vars": [ + "workload_type=compute-bound", + ], + "independent_vars": [ + "ec2_instance_type=[t3.medium, c5.large, r5.large, m6i.large, t3a.large]" + ], + "dependent_vars": [ + "CPU throughput (events per second)" + ], +} + +# Setup: refer to misc/optional_setup_q5_q6_q7.txt. Also, check if "events per second" is used as the metric. \ No newline at end of file diff --git a/benchmark/experimentation_bench/cloud_infra/ground_truth/additional/q7.txt b/benchmark/experimentation_bench/cloud_infra/ground_truth/additional/q7.txt new file mode 100644 index 0000000..7ce703e --- /dev/null +++ b/benchmark/experimentation_bench/cloud_infra/ground_truth/additional/q7.txt @@ -0,0 +1,19 @@ +# Design: + +{ + "constant_vars": [ + "workload_type=compute-bound", + "instance_vCPU=2", + "instance_memory=8GB", + ], + "independent_vars": [ + "ec2_instance_type=[t3a.large, m5.large, m6a.large]", + "cpu_architecture=[AMD EPYC 7000 (t3a), Intel Xeon Platinum 8000 (m5), AMD EPYC Milan (m6a)]", + "base_clock_frequency=[2.5GHz, 3.1GHz, 3.6GHz]" + ], + "dependent_vars": [ + "CPU throughput (events per second)" + ], +} + +# Setup: refer to misc/optional_setup_q5_q6_q7.txt \ No newline at end of file diff --git a/benchmark/experimentation_bench/cloud_infra/ground_truth/additional/q8.txt b/benchmark/experimentation_bench/cloud_infra/ground_truth/additional/q8.txt new file mode 100644 index 0000000..215dfe8 --- /dev/null +++ b/benchmark/experimentation_bench/cloud_infra/ground_truth/additional/q8.txt @@ -0,0 +1,17 @@ +# Design: + +{ + "constant_vars": [ + "workload_type=mixed (CPU-bound + memory-intensive)", + "test_duration=30 seconds", + ], + "independent_vars": [ + "ec2_instance_type=[t3.medium, t3.large, c5.large, c5.xlarge]", + "instance_family=[t3, c5]" + ], + "dependent_vars": [ + "average CPU throughput (events per second)", + ], +} + +# Setup: refer to misc/optional_setup_q8.txt \ No newline at end of file diff --git a/benchmark/experimentation_bench/cloud_infra/ground_truth/q1.txt b/benchmark/experimentation_bench/cloud_infra/ground_truth/q1.txt index f54658e..87bf6f1 100644 --- a/benchmark/experimentation_bench/cloud_infra/ground_truth/q1.txt +++ b/benchmark/experimentation_bench/cloud_infra/ground_truth/q1.txt @@ -1,4 +1,8 @@ -"c5.4xlarge or c5.2xlarge +# Answer: + +c5.4xlarge or c5.2xlarge + +Example raw results: **Control Group (c5.large):** - **Run 1:** 99th Percentile Latency: 143 ms diff --git a/benchmark/experimentation_bench/cloud_infra/ground_truth/q2.txt b/benchmark/experimentation_bench/cloud_infra/ground_truth/q2.txt index cdaef5c..0c21513 100644 --- a/benchmark/experimentation_bench/cloud_infra/ground_truth/q2.txt +++ b/benchmark/experimentation_bench/cloud_infra/ground_truth/q2.txt @@ -1,7 +1,3 @@ -"c5.large. 
An answer is also considered acceptable if the LLM chose to ignore cheaper instances that had a tested latency under 150ms, but was at the borderline of 150ms (i.e., the argument could be that for a better guarantee that the latency requirement is accepted, we should not use those cheaper instances) +# Answer: -/home/ubuntu/langgraph-exp-agent/eval_metadata/temp_logs/cloud_infra/q2-best-instance-multi-obj_20250126200321_iter1.log - -/home/ubuntu/langgraph-exp-agent/eval_metadata/temp_logs/cloud_infra/q2-best-instance-multi-obj_20250126201703_iter2.log - -False postiive example (stopped at initial plan creation by architect): /home/ubuntu/langgraph-exp-agent/eval_metadata/temp_logs/cloud_infra/q2-best-instance-multi-obj_20250124160231_iter2.log" +c5.large. An answer is also considered acceptable if the LLM chose to ignore cheaper instances that had a tested latency under 150ms, but was at the borderline of 150ms (i.e., the argument could be that for a better guarantee that the latency requirement is accepted, we should not use those cheaper instances) diff --git a/benchmark/experimentation_bench/cloud_infra/ground_truth/q3.txt b/benchmark/experimentation_bench/cloud_infra/ground_truth/q3.txt index 9c472c9..fb4ba5b 100644 --- a/benchmark/experimentation_bench/cloud_infra/ground_truth/q3.txt +++ b/benchmark/experimentation_bench/cloud_infra/ground_truth/q3.txt @@ -1,5 +1,7 @@ -"c5.large should be best to balance both, and an answer is also considered acceptable if the LLM chose to ignore cheaper instances that had a tested latency under 150ms, but was at the borderline of 150ms (i.e., the argument could be that for a better guarantee that the latency requirement is accepted, we should not use those cheaper instances). +# Answer: + +c5.large should be best to balance both, and an answer is also considered acceptable if the LLM chose to ignore cheaper instances that had a tested latency under 150ms, but was at the borderline of 150ms (i.e., the argument could be that for a better guarantee that the latency requirement is accepted, we should not use those cheaper instances). For latency: should have a decreasing trend as we increase instance size, but not a requirement. In general, all should meet the requirements -For cost: it should increase as the instance size increases. " +For cost: it should increase as the instance size increases. \ No newline at end of file diff --git a/benchmark/experimentation_bench/cloud_infra/ground_truth/q4.txt b/benchmark/experimentation_bench/cloud_infra/ground_truth/q4.txt index c9a0a9c..87d0718 100644 --- a/benchmark/experimentation_bench/cloud_infra/ground_truth/q4.txt +++ b/benchmark/experimentation_bench/cloud_infra/ground_truth/q4.txt @@ -1,6 +1,8 @@ +# Answer: + c5.large, and an answer is also considered acceptable if the LLM chose to ignore cheaper instances that had a tested latency under 150ms, but was at the borderline of 150ms (i.e., the argument could be that for a better guarantee that the latency requirement is accepted, we should not use those cheaper instances). 
-Example output: (note: the exact format or structure does not matter) +Example raw output: (note: the exact format or structure does not matter) - **c5.large**: - Latencies: 142 ms and 141 ms - Cost: $0.0363/hour diff --git a/benchmark/experimentation_bench/cloud_infra/ground_truth/q5.txt b/benchmark/experimentation_bench/cloud_infra/ground_truth/q5.txt index 2cc7fc3..becfa06 100644 --- a/benchmark/experimentation_bench/cloud_infra/ground_truth/q5.txt +++ b/benchmark/experimentation_bench/cloud_infra/ground_truth/q5.txt @@ -1,3 +1,7 @@ -"**Experimental Group (c5.large):** +# Answer: + +c5.large + +Example raw output for c5.large: - **Result 1:** CPU speed: 60.81 events per second -- **Result 2:** CPU speed: 60.68 events per second" +- **Result 2:** CPU speed: 60.68 events per second diff --git a/benchmark/experimentation_bench/cloud_infra/ground_truth/q6.txt b/benchmark/experimentation_bench/cloud_infra/ground_truth/q6.txt index 72f35e1..025523a 100644 --- a/benchmark/experimentation_bench/cloud_infra/ground_truth/q6.txt +++ b/benchmark/experimentation_bench/cloud_infra/ground_truth/q6.txt @@ -1,35 +1,17 @@ -No definitive answer. Depends on when test was taken. Check setup, and if "events per second" is used as the metric. +# Answer: -#### Control Group (t3.medium): -- **Result 1**: - - Events per second: 43.38 - - Total time: 10.0022s - - Average Latency: 23.04 ms -- **Result 2**: - - Events per second: 46.07 - - Total time: 10.0051s - - Average Latency: 21.70 ms - -#### Experimental Group: -1. **c5.large**: - - **Result 1**: Time Taken: 10.0053s, CPU Utilization: 50.03% - - **Result 2**: Time Taken: 10.0006s, CPU Utilization: 48.75% -2. **r5.large**: - - **Result 1**: Time Taken: 10.0146s, CPU Utilization: 51.71% - - **Result 2**: Time Taken: 10.0006s, CPU Utilization: 49.04% -3. **m6i.large**: - - **Result 1**: Time Taken: 10.0002s, CPU Utilization: 43.27% - - **Result 2**: Time Taken: 10.0038s, CPU Utilization: 42.81% -4. **t3a.large**: - - **Result 1**: Time Taken: 10.0116s, CPU Utilization: 54.03% - - **Result 2**: Time Taken: 10.0103s, CPU Utilization: 59.18% - -### Summary and Conclusion: -The results consistently show that the `m6i.large` instance type had the lowest CPU utilization, suggesting it is the most CPU efficient among the tested types for the specific compute-bound workload. Conversely, the `t3a.large` showed the highest CPU utilization, indicating it is the least efficient under these conditions. +Both of the following rankings are acceptable: #### Ranking of Instances Based on CPU Efficiency: 1. **m6i.large** (Most Efficient) 2. **c5.large** 3. **r5.large** -4. **t3.medium** (Baseline for comparison) +4. **t3.medium** 5. **t3a.large** (Least Efficient) + +#### Ranking of Instances Based on CPU Efficiency: +1. **m6i.large** (Most Efficient) +2. **r5.large** +3. **c5.large** +4. **t3.medium** +5. 
**t3a.large** (Least Efficient) \ No newline at end of file diff --git a/benchmark/experimentation_bench/cloud_infra/ground_truth/q7.txt b/benchmark/experimentation_bench/cloud_infra/ground_truth/q7.txt index 9361972..908b874 100644 --- a/benchmark/experimentation_bench/cloud_infra/ground_truth/q7.txt +++ b/benchmark/experimentation_bench/cloud_infra/ground_truth/q7.txt @@ -1,9 +1,19 @@ -"**Control Group (t3a.large):** +# Answer + +An answer that is of a form close to the following is considered correct: + +### Summary and Conclusion +The experimental data shows that the `m6a.large` instance type with the latest 3rd generation AMD EPYC processors performs significantly better in the compute-bound workload than both the `t3a.large` and `m5.large` instance types. The performance gain aligns with the improved CPU architecture in the `m6a.large` instances, confirming the hypothesis that newer CPU architecture is a primary determinant of performance differences in this scenario. + +### Final Conclusion: +The newer CPU architecture, especially in the `m6a.large` instance, is indeed the primary determinant of improved performance for a fixed compute-bound workload using sysbench with `-cpu-max-prime=80000`." + +### Example raw outputs: + +**Control Group (t3a.large):** - **Run 1:** CPU speed: 66.89 events per second - **Run 2:** CPU speed: 65.58 events per second -The results for the control group are consistent and reflect expected performance for the `t3a.large` instance type. - **Experimental Group:** - **m5.large:** @@ -12,8 +22,4 @@ The results for the control group are consistent and reflect expected performanc - **m6a.large:** - **CPU speed:** Approximately 242.96 events per second -### Summary and Conclusion -The experimental data shows that the `m6a.large` instance type with the latest 3rd generation AMD EPYC processors performs significantly better in the compute-bound workload than both the `t3a.large` and `m5.large` instance types. The performance gain aligns with the improved CPU architecture in the `m6a.large` instances, confirming the hypothesis that newer CPU architecture is a primary determinant of performance differences in this scenario. -### Final Conclusion: -The newer CPU architecture, especially in the `m6a.large` instance, is indeed the primary determinant of improved performance for a fixed compute-bound workload using sysbench with `-cpu-max-prime=80000`." diff --git a/benchmark/experimentation_bench/cloud_infra/ground_truth/q8.txt b/benchmark/experimentation_bench/cloud_infra/ground_truth/q8.txt index 4bba5bb..79f284f 100644 --- a/benchmark/experimentation_bench/cloud_infra/ground_truth/q8.txt +++ b/benchmark/experimentation_bench/cloud_infra/ground_truth/q8.txt @@ -1,4 +1,19 @@ -"#### **Results Summary** +# Answer: + +A result where c5.large and c5.xlarge is tied (followed by 3rd and 4th as shown below), or where either of them is ranked as first and the other second, is considered correct. The following is an example where they are both tied. + +#### **Ranking Based on CPU Efficiency** +- **1st:** c5.large and c5.xlarge (tie) +- **3rd:** t3.large +- **4th:** t3.medium + +#### **Inter-Family Comparison** +- The **c5 family** consistently outperforms the **t3 family** in both CPU speed and memory throughput. The higher performance is attributed to the newer CPU architectures in c5 instances and absence of CPU credit throttling seen in burstable t3 instances. 
+- Within the c5 family, both instances (c5.large and c5.xlarge) show similar performance, suggesting that the increased resources in c5.xlarge do not significantly affect the mixed workload used in this experiment. + +### Example raw output: + +#### **Results Summary** 1. **t3.medium (Control Group)** - **Average CPU Speed:** 1.36-1.41 events per second @@ -14,13 +29,4 @@ 4. **c5.xlarge** - **Average CPU Speed:** 2.03 events per second - - **Average Memory Throughput:** 18,128.16-18,115.74 MiB/sec - -#### **Ranking Based on CPU Efficiency** -- **1st:** c5.large and c5.xlarge (tie) -- **3rd:** t3.large -- **4th:** t3.medium - -#### **Inter-Family Comparison** -- The **c5 family** consistently outperforms the **t3 family** in both CPU speed and memory throughput. The higher performance is attributed to the newer CPU architectures in c5 instances and absence of CPU credit throttling seen in burstable t3 instances. -- Within the c5 family, both instances (c5.large and c5.xlarge) show similar performance, suggesting that the increased resources in c5.xlarge do not significantly affect the mixed workload used in this experiment." + - **Average Memory Throughput:** 18,128.16-18,115.74 MiB/sec \ No newline at end of file diff --git a/benchmark/experimentation_bench/cloud_infra/misc/optional_setup_q1_q2_q3_q4.txt b/benchmark/experimentation_bench/cloud_infra/misc/optional_setup_q1_q2_q3_q4.txt new file mode 100644 index 0000000..ef177fe --- /dev/null +++ b/benchmark/experimentation_bench/cloud_infra/misc/optional_setup_q1_q2_q3_q4.txt @@ -0,0 +1 @@ +See the Cloud/workload setup and code in the following first by "ls -la" and "cat": /starter_file/cloud_infra/best_instance diff --git a/benchmark/experimentation_bench/cloud_infra/misc/optional_setup_q5_q6_q7.txt b/benchmark/experimentation_bench/cloud_infra/misc/optional_setup_q5_q6_q7.txt new file mode 100644 index 0000000..3a49067 --- /dev/null +++ b/benchmark/experimentation_bench/cloud_infra/misc/optional_setup_q5_q6_q7.txt @@ -0,0 +1 @@ +See the Cloud/workload setup and code in the following first by "ls -la" and "cat": /starter_file/cloud_infra/cpu_workload/sysbench_workload diff --git a/benchmark/experimentation_bench/cloud_infra/misc/optional_setup_q8.txt b/benchmark/experimentation_bench/cloud_infra/misc/optional_setup_q8.txt new file mode 100644 index 0000000..2d57296 --- /dev/null +++ b/benchmark/experimentation_bench/cloud_infra/misc/optional_setup_q8.txt @@ -0,0 +1 @@ +See the Cloud/workload setup and code in the following first (most of the code including compute-bound task is already provided, but you may need to do slight modifications to include the memory-intensive task): /starter_file/cloud_infra/cpu_workload/sysbench_workload \ No newline at end of file diff --git a/benchmark/experimentation_bench/cloud_infra/q1-best-instance-opt.txt b/benchmark/experimentation_bench/cloud_infra/q1-best-instance-opt.txt index 3da9a28..5b56a96 100644 --- a/benchmark/experimentation_bench/cloud_infra/q1-best-instance-opt.txt +++ b/benchmark/experimentation_bench/cloud_infra/q1-best-instance-opt.txt @@ -2,7 +2,5 @@ What is the best AWS EC2 instance type within the c5 family (instances listed be C5 family instance types: c5.large, c5.xlarge, c5.2xlarge, c5.4xlarge, c5.9xlarge -Make sure to review Cloud/workload setup and code in the following first by "ls -la" and "cat": /starter_file/cloud_infra/best_instance - The best instance type will: - Have the lowest 99th percentile latency for the network I/O operations. 
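For the latency-only criterion above and the latency-plus-cost variants elsewhere in this benchmark (the 150 ms ceiling with hourly cost), the selection logic reduces to two one-liners. The sketch below is illustrative only; the latency values and relative prices are placeholders, not measurements from this benchmark.

```python
# Placeholder data: p99 latencies in ms and relative hourly prices
# (prices assumed to increase with instance size, as the questions state).
p99_ms = {
    "c5.large": 143, "c5.xlarge": 136, "c5.2xlarge": 118,
    "c5.4xlarge": 115, "c5.9xlarge": 121,
}
relative_price = {
    "c5.large": 1, "c5.xlarge": 2, "c5.2xlarge": 4,
    "c5.4xlarge": 8, "c5.9xlarge": 18,
}
LATENCY_BUDGET_MS = 150

# q1-style objective: lowest 99th percentile latency.
fastest = min(p99_ms, key=p99_ms.get)

# q2/q3/q4-style objective: cheapest instance that still meets the latency budget.
eligible = [i for i in p99_ms if p99_ms[i] < LATENCY_BUDGET_MS]
cheapest_ok = min(eligible, key=relative_price.get)

print(fastest, cheapest_ok)
```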
\ No newline at end of file diff --git a/benchmark/experimentation_bench/cloud_infra/q2-best-instance-multi-obj.txt b/benchmark/experimentation_bench/cloud_infra/q2-best-instance-multi-obj.txt index ca6d7a7..0b49bc2 100644 --- a/benchmark/experimentation_bench/cloud_infra/q2-best-instance-multi-obj.txt +++ b/benchmark/experimentation_bench/cloud_infra/q2-best-instance-multi-obj.txt @@ -2,8 +2,6 @@ What is the best AWS EC2 instance type within the c5 family (instances listed be C5 family instance types: c5.large, c5.xlarge, c5.2xlarge, c5.4xlarge, c5.9xlarge -Make sure to review Cloud/workload setup and code in the following first by "ls -la" and "cat": /starter_file/cloud_infra/best_instance - The best instance type will: - Maintain the 99th percentile latency under 150ms for the network I/O operations. - Minimize the total cost per hour of operation. You can assume that instance cost increases as with the size of the instance (e.g., c5.large is cheaper than c5.xlarge). \ No newline at end of file diff --git a/benchmark/experimentation_bench/cloud_infra/q3-best-instance-relationship.txt b/benchmark/experimentation_bench/cloud_infra/q3-best-instance-relationship.txt index 7cb2527..9b70f72 100644 --- a/benchmark/experimentation_bench/cloud_infra/q3-best-instance-relationship.txt +++ b/benchmark/experimentation_bench/cloud_infra/q3-best-instance-relationship.txt @@ -2,8 +2,6 @@ What is the best AWS EC2 instance type within the c5 family (instances listed be C5 family instance types: c5.large, c5.xlarge, c5.2xlarge, c5.4xlarge, c5.9xlarge -Make sure to review Cloud/workload setup and code in the following first by "ls -la" and "cat": /starter_file/cloud_infra/best_instance - The best instance type will: - Maintain the 99th percentile latency under 150ms for the network I/O operations. - Minimize the total cost per hour of operation. You can assume that instance cost increases as with the size of the instance (e.g., c5.large is cheaper than c5.xlarge). diff --git a/benchmark/experimentation_bench/cloud_infra/q4-best-instance-search-space.txt b/benchmark/experimentation_bench/cloud_infra/q4-best-instance-search-space.txt index 3881753..f0f2558 100644 --- a/benchmark/experimentation_bench/cloud_infra/q4-best-instance-search-space.txt +++ b/benchmark/experimentation_bench/cloud_infra/q4-best-instance-search-space.txt @@ -4,8 +4,6 @@ C5 family instance types: c5.large, c5.xlarge, c5.2xlarge t3 family instance types: t3.nano, t3.micro, t3.small -Make sure to review Cloud/workload setup and code in the following first by "ls -la" and "cat": /starter_file/cloud_infra/best_instance - The best instance type will: - Maintain the 99th percentile latency under 150ms for the network I/O operations. - Minimize the total cost per hour of operation. You can assume that instance cost increases as with the size of the instance (e.g., c5.large is cheaper than c5.xlarge). \ No newline at end of file diff --git a/benchmark/experimentation_bench/cloud_infra/q5-cpu-efficiency.txt b/benchmark/experimentation_bench/cloud_infra/q5-cpu-efficiency.txt index 5dbddd0..73cf16c 100644 --- a/benchmark/experimentation_bench/cloud_infra/q5-cpu-efficiency.txt +++ b/benchmark/experimentation_bench/cloud_infra/q5-cpu-efficiency.txt @@ -1,3 +1 @@ -How does CPU efficiency scale differ with these different AWS EC2 instance types, i.e., t3.medium vs. c5.large, under a fixed compute-bound workload? Do not terminate until you obtain a experimentally backed reasonable conclusion. 
- -Make sure to review Cloud/workload setup and code in the following first by "ls -la" and "cat": /starter_file/cloud_infra/cpu_workload/sysbench_workload +How does CPU efficiency scale differ with these different AWS EC2 instance types, i.e., t3.medium vs. c5.large, under a fixed compute-bound workload? Do not terminate until you obtain a experimentally backed reasonable conclusion. \ No newline at end of file diff --git a/benchmark/experimentation_bench/cloud_infra/q6-cpu-efficiency-search-space.txt b/benchmark/experimentation_bench/cloud_infra/q6-cpu-efficiency-search-space.txt index 4473f4d..bc8d3a2 100644 --- a/benchmark/experimentation_bench/cloud_infra/q6-cpu-efficiency-search-space.txt +++ b/benchmark/experimentation_bench/cloud_infra/q6-cpu-efficiency-search-space.txt @@ -1,3 +1 @@ -How does CPU efficiency differ with these different AWS EC2 instance types, i.e., t3.medium, c5.large, r5.large, m6i.large, t3a.large, under a fixed compute-bound workload? Rank the instances. Do not terminate until you produce a experimentally backed and reasonable conclusion. - -Make sure to review Cloud/workload setup and code in the following first by "ls -la" and "cat": /starter_file/cloud_infra/cpu_workload/sysbench_workload \ No newline at end of file +How does CPU efficiency differ with these different AWS EC2 instance types, i.e., t3.medium, c5.large, r5.large, m6i.large, t3a.large, under a fixed compute-bound workload? Rank the instances. Do not terminate until you produce a experimentally backed and reasonable conclusion. \ No newline at end of file diff --git a/benchmark/experimentation_bench/cloud_infra/q7-cpu-efficiency-goal.txt b/benchmark/experimentation_bench/cloud_infra/q7-cpu-efficiency-goal.txt index bd769a3..7c9deef 100644 --- a/benchmark/experimentation_bench/cloud_infra/q7-cpu-efficiency-goal.txt +++ b/benchmark/experimentation_bench/cloud_infra/q7-cpu-efficiency-goal.txt @@ -4,6 +4,4 @@ Info about t3a.large: AMD EPYC 7000 series processors with an all core turbo clo Info about m5.large: 1st or 2nd generation Intel Xeon Platinum 8000 series processor (Skylake-SP or Cascade Lake) with a sustained all core Turbo CPU clock speed of up to 3.1 GHz -Info about m6a.large (this is also an instance with 2 vCPU and 8 GB RAM): 3rd generation AMD EPYC processors (code named Milan) with an all-core turbo frequency of 3.6 GHz - -Make sure to review Cloud/workload setup and code in the following first: /starter_file/cloud_infra/cpu_workload/sysbench_workload \ No newline at end of file +Info about m6a.large (this is also an instance with 2 vCPU and 8 GB RAM): 3rd generation AMD EPYC processors (code named Milan) with an all-core turbo frequency of 3.6 GHz \ No newline at end of file diff --git a/benchmark/experimentation_bench/cloud_infra/q8-cpu-efficiency-relationship.txt b/benchmark/experimentation_bench/cloud_infra/q8-cpu-efficiency-relationship.txt index 8df2aa7..71d486d 100644 --- a/benchmark/experimentation_bench/cloud_infra/q8-cpu-efficiency-relationship.txt +++ b/benchmark/experimentation_bench/cloud_infra/q8-cpu-efficiency-relationship.txt @@ -8,6 +8,4 @@ Here are the details of the mixed workload. All should be running simultaneously - Compute-bound task: sysbench cpu --cpu-max-prime=1000000 --time=30 run - Memory-intensive task: sysbench memory --memory-block-size=1M --memory-total-size=10000G --time=30 run -Your task is to run experiments for these 4 instance types using these scripts, analyze how the mixed workload impacts performance, and identify causal relationships. 
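The mixed workload above requires the two sysbench tasks to run simultaneously. A minimal sketch of one way to launch them together and pull the CPU events-per-second figure from the output; it assumes sysbench 1.0-style output and is not part of the provided starter code.

```python
import re
import subprocess

# The two sysbench tasks from the question, launched together so they contend
# for the same instance's CPU and memory (sysbench is assumed to be installed).
cpu_cmd = ["sysbench", "cpu", "--cpu-max-prime=1000000", "--time=30", "run"]
mem_cmd = ["sysbench", "memory", "--memory-block-size=1M",
           "--memory-total-size=10000G", "--time=30", "run"]

cpu_proc = subprocess.Popen(cpu_cmd, stdout=subprocess.PIPE, text=True)
mem_proc = subprocess.Popen(mem_cmd, stdout=subprocess.PIPE, text=True)
cpu_out, _ = cpu_proc.communicate()
mem_out, _ = mem_proc.communicate()

# sysbench 1.0 prints a line like "events per second:  123.45" for the cpu test.
match = re.search(r"events per second:\s*([\d.]+)", cpu_out)
print("CPU throughput:", float(match.group(1)) if match else "not found")
```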
Do not terminate until you produce an experimentally backed, comprehensive, and interpretable conclusion. - -Make sure to review Cloud/workload setup and code in the following first (most of the code including compute-bound task is already provided, but you may need to do slight modifications to include the memory-intensive task): /starter_file/cloud_infra/cpu_workload/sysbench_workload \ No newline at end of file +Your task is to run experiments for these 4 instance types using these scripts, analyze how the mixed workload impacts performance, and identify causal relationships. Do not terminate until you produce an experimentally backed, comprehensive, and interpretable conclusion. \ No newline at end of file diff --git a/benchmark/experimentation_bench/llm_reasoning/ground_truth/additional/q1.txt b/benchmark/experimentation_bench/llm_reasoning/ground_truth/additional/q1.txt new file mode 100644 index 0000000..b52eedc --- /dev/null +++ b/benchmark/experimentation_bench/llm_reasoning/ground_truth/additional/q1.txt @@ -0,0 +1,18 @@ +# Design: + +{ + "constant_vars": ["Model (gpt-4o-mini)", "Dataset (gsm8k math)"], + "independent_vars": ["Num_samples: at least 3 different sample counts"], + "dependent_vars": ["Success rate of solving problems"] +} + +# Setup: + +Generation Step +Using the chosen language model (gpt-4o-mini), generate multiple candidate responses for each question in the dataset (GSM8K math). Vary the parameter controlling the number of generated samples per question. + +Evaluation Step +Evaluate the correctness of the generated responses. This will typically require extracting numerical answers from each generated solution using regular expressions (e.g., matching patterns like #### ), normalizing answers by removing irrelevant characters such as commas, dollar signs, and trailing punctuation to ensure consistent comparisons. Record results separately from the generation outputs. + +Result Analysis +Examine the evaluation results to identify whether each question is solved successfully, defined as having at least one correct response among all samples. Compute the overall success rate (proportion of successfully solved problems) for each configuration tested. \ No newline at end of file diff --git a/benchmark/experimentation_bench/llm_reasoning/ground_truth/additional/q2.txt b/benchmark/experimentation_bench/llm_reasoning/ground_truth/additional/q2.txt new file mode 100644 index 0000000..b52eedc --- /dev/null +++ b/benchmark/experimentation_bench/llm_reasoning/ground_truth/additional/q2.txt @@ -0,0 +1,18 @@ +# Design: + +{ + "constant_vars": ["Model (gpt-4o-mini)", "Dataset (gsm8k math)"], + "independent_vars": ["Num_samples: at least 3 different sample counts"], + "dependent_vars": ["Success rate of solving problems"] +} + +# Setup: + +Generation Step +Using the chosen language model (gpt-4o-mini), generate multiple candidate responses for each question in the dataset (GSM8K math). Vary the parameter controlling the number of generated samples per question. + +Evaluation Step +Evaluate the correctness of the generated responses. This will typically require extracting numerical answers from each generated solution using regular expressions (e.g., matching patterns like #### ), normalizing answers by removing irrelevant characters such as commas, dollar signs, and trailing punctuation to ensure consistent comparisons. Record results separately from the generation outputs. 
+ +Result Analysis +Examine the evaluation results to identify whether each question is solved successfully, defined as having at least one correct response among all samples. Compute the overall success rate (proportion of successfully solved problems) for each configuration tested. \ No newline at end of file diff --git a/benchmark/experimentation_bench/llm_reasoning/ground_truth/additional/q3.txt b/benchmark/experimentation_bench/llm_reasoning/ground_truth/additional/q3.txt new file mode 100644 index 0000000..1420dd9 --- /dev/null +++ b/benchmark/experimentation_bench/llm_reasoning/ground_truth/additional/q3.txt @@ -0,0 +1,18 @@ +# Design: + +{ + "constant_vars": ["Dataset (gsm8k math)"], + "independent_vars": ["Num_samples: at least 3 different sample counts, for each model tested", "Model (gpt-4o-mini, gpt-4o)"], + "dependent_vars": ["Success rate of solving problems"] +} + +# Setup: + +Generation Step +Using the chosen language model (gpt-4o-mini or gpt-4o), generate multiple candidate responses for each question in the dataset (GSM8K math). Vary the parameter controlling the number of generated samples per question. + +Evaluation Step +Evaluate the correctness of the generated responses. This will typically require extracting numerical answers from each generated solution using regular expressions (e.g., matching patterns like #### ), normalizing answers by removing irrelevant characters such as commas, dollar signs, and trailing punctuation to ensure consistent comparisons. Record results separately from the generation outputs. + +Result Analysis +Examine the evaluation results to identify whether each question is solved successfully, defined as having at least one correct response among all samples. Compute the overall success rate (proportion of successfully solved problems) for each configuration tested. \ No newline at end of file diff --git a/benchmark/experimentation_bench/llm_reasoning/ground_truth/additional/q4.txt b/benchmark/experimentation_bench/llm_reasoning/ground_truth/additional/q4.txt new file mode 100644 index 0000000..d848bae --- /dev/null +++ b/benchmark/experimentation_bench/llm_reasoning/ground_truth/additional/q4.txt @@ -0,0 +1,18 @@ +# Design: + +{ + "constant_vars": ["Dataset (gsm8k math)"], + "independent_vars": ["Num_samples: at least 4 different sample counts, for each model tested", "Model (gpt-4o-mini, gpt-4o)"], + "dependent_vars": ["Success rate of solving problems"] +} + +# Setup: + +Generation Step +Using the chosen language model (gpt-4o-mini or gpt-4o), generate multiple candidate responses for each question in the dataset (GSM8K math). Vary the parameter controlling the number of generated samples per question. + +Evaluation Step +Evaluate the correctness of the generated responses. This will typically require extracting numerical answers from each generated solution using regular expressions (e.g., matching patterns like #### ), normalizing answers by removing irrelevant characters such as commas, dollar signs, and trailing punctuation to ensure consistent comparisons. Record results separately from the generation outputs. + +Result Analysis +Examine the evaluation results to identify whether each question is solved successfully, defined as having at least one correct response among all samples. Compute the overall success rate (proportion of successfully solved problems) for each configuration tested. 
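The Evaluation and Result Analysis steps described above can be summarized in a few lines. This is an illustrative sketch (not the llmonk evaluator itself), assuming each generated sample ends with a `#### <answer>` line as in GSM8K:

```python
import re

def extract_answer(text):
    """Pull the final '#### <answer>' value from a sample and normalize it."""
    match = re.search(r"####\s*(.+)", text)
    if match is None:
        return None
    return match.group(1).strip().replace(",", "").replace("$", "").rstrip(".")

def normalize(answer):
    """Apply the same normalization to a bare ground-truth answer string."""
    return answer.strip().replace(",", "").replace("$", "").rstrip(".")

def question_solved(samples, ground_truth):
    """Solved if at least one generated sample matches the ground truth."""
    return any(extract_answer(s) == normalize(ground_truth) for s in samples)

def success_rate(samples_per_question, ground_truths):
    solved = sum(question_solved(s, gt)
                 for s, gt in zip(samples_per_question, ground_truths))
    return solved / len(ground_truths)
```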
\ No newline at end of file diff --git a/benchmark/experimentation_bench/llm_reasoning/ground_truth/additional/q5.txt b/benchmark/experimentation_bench/llm_reasoning/ground_truth/additional/q5.txt new file mode 100644 index 0000000..1432bac --- /dev/null +++ b/benchmark/experimentation_bench/llm_reasoning/ground_truth/additional/q5.txt @@ -0,0 +1,25 @@ + +# Design: + +{ + "constant_vars": ["Model (gpt-4o)", "Dataset (gsm8k math)", "Number of samples 10"], + "independent_vars": ["Temperature: sampling from 0-1"], + "dependent_vars": ["Success rate of solving problems"] +} + +# Setup: + +Generation Step +Using the chosen language model (gpt-4o), generate 10 candidate responses for each question in the dataset (GSM8K math). Vary the sampling temperature from 0 to 1. + +Evaluation Step +Evaluate the correctness of the generated responses. This will typically require extracting numerical answers from each generated solution using regular expressions (e.g., matching patterns like #### ), normalizing answers by removing irrelevant characters such as commas, dollar signs, and trailing punctuation to ensure consistent comparisons. Record results separately from the generation outputs. + +Result Analysis +Examine the evaluation results to identify whether each question is solved successfully, defined as having at least one correct response among all samples. Compute the overall success rate (proportion of successfully solved problems) for each configuration tested. + +# Raw data (By 2025-01-30) + +temperature = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1] + +success_rate = [23, 32, 31, 33, 37, 39, 42, 44, 45, 45, 45] \ No newline at end of file diff --git a/benchmark/experimentation_bench/llm_reasoning/ground_truth/additional/q6.txt b/benchmark/experimentation_bench/llm_reasoning/ground_truth/additional/q6.txt new file mode 100644 index 0000000..01daee8 --- /dev/null +++ b/benchmark/experimentation_bench/llm_reasoning/ground_truth/additional/q6.txt @@ -0,0 +1,19 @@ +# Design: + +{ + "constant_vars": ["Model (gpt-4o-mini)", "Dataset (gsm8k math)"], + "independent_vars": ["Num_samples: at least 3 different sample counts"], + "dependent_vars": ["Success rate of solving problems"] +} + +# Setup: + +Generation Step +Using the chosen language model (gpt-4o-mini), generate multiple candidate responses for each question in the dataset (GSM8K math). Vary the parameter controlling the number of generated samples per question. + +Evaluation Step +Evaluate the correctness of each generated responses. This will typically require extracting numerical answers from each generated solution using regular expressions (e.g., matching patterns like #### ), normalizing answers by removing irrelevant characters such as commas, dollar signs, and trailing punctuation to ensure consistent comparisons. Record results separately from the generation outputs. + +Result Analysis +Examine the evaluation results to identify whether each question is solved successfully, defined as whether the answer with highest frequency is correct or not. +Compute such overall success rate (proportion of successfully solved problems) for each configuration tested. 
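For the majority-voting variant above, success depends on the single most frequent answer rather than on any correct sample appearing. A minimal sketch, operating on already-extracted answer strings:

```python
from collections import Counter

def majority_vote_correct(extracted_answers, ground_truth):
    """True if the single most frequent extracted answer equals the ground truth."""
    votes = Counter(a for a in extracted_answers if a is not None)
    if not votes:
        return False
    top_answer, _ = votes.most_common(1)[0]
    return top_answer == ground_truth

# Three samples agree on "72" and one says "70", so the vote picks "72".
print(majority_vote_correct(["72", "72", "70", "72"], "72"))  # True
```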
\ No newline at end of file diff --git a/benchmark/experimentation_bench/llm_reasoning/ground_truth/q1.txt b/benchmark/experimentation_bench/llm_reasoning/ground_truth/q1.txt index ab2da2d..d7c65bb 100644 --- a/benchmark/experimentation_bench/llm_reasoning/ground_truth/q1.txt +++ b/benchmark/experimentation_bench/llm_reasoning/ground_truth/q1.txt @@ -1,39 +1,3 @@ -Commands I ran: +# Answer: -export API keys - -python llmonk/generate/gsm8k.py --model_name=gpt-4o-mini --save-dir=/workspace/large_language_monkeys/logs/gsm8k_samples_2 - -python llmonk/evaluate/math_datasets.py --samples_dir=/workspace/large_language_monkeys/logs/gsm8k_samples_2 --save_dir=/workspace/large_language_monkeys/logs/gsm8k_eval_2 --dset=gsm8k - -python llmonk/analyze_coverage.py -- which was generated by LLM, with the following content: - -import os -import yaml - -def calculate_coverage(eval_dir): - total_problems = 0 - successfully_solved = 0 - - for filename in os.listdir(eval_dir): - if filename.endswith('.yaml'): - with open(os.path.join(eval_dir, filename), 'r') as f: - data = yaml.safe_load(f) - # Assuming each file contains a list of results with a 'success' field - if 'success' in data and data['success']: - successfully_solved += 1 - total_problems += 1 - - coverage = successfully_solved / total_problems if total_problems > 0 else 0 - return coverage - -if __name__ == '__main__': - eval_dir = '/workspace/large_language_monkeys/logs/gsm8k_eval_2' - coverage = calculate_coverage(eval_dir) - print(f'Coverage: {coverage:.2%}') - - -Result: - -Coverage: 0.00% \ No newline at end of file +Increasing the number of generated samples per question positively impacts the overall success rate \ No newline at end of file diff --git a/benchmark/experimentation_bench/llm_reasoning/ground_truth/q2.txt b/benchmark/experimentation_bench/llm_reasoning/ground_truth/q2.txt index 469aa70..2a344e0 100644 --- a/benchmark/experimentation_bench/llm_reasoning/ground_truth/q2.txt +++ b/benchmark/experimentation_bench/llm_reasoning/ground_truth/q2.txt @@ -1 +1,3 @@ +# Answer: + The success rate generally improves with an increase in the number of generated samples. However, the improvement is not perfectly linear. Instead, it shows diminishing returns as the number of samples increases beyond a certain point. This suggests a sub-linear or logarithmic pattern, where initial increases in sample size provide significant improvements, but the rate of improvement decreases as more samples are added. \ No newline at end of file diff --git a/benchmark/experimentation_bench/llm_reasoning/ground_truth/q3.txt b/benchmark/experimentation_bench/llm_reasoning/ground_truth/q3.txt index 62c84f9..24fc537 100644 --- a/benchmark/experimentation_bench/llm_reasoning/ground_truth/q3.txt +++ b/benchmark/experimentation_bench/llm_reasoning/ground_truth/q3.txt @@ -1,3 +1,3 @@ -Yes. At some sampling rate X, the success rate of gpt-4o-mini exceeds that of gpt-4o with a sampling rate Y, where Y is at least 16 times less than X -X, Y just needs to be reasonable. +# Answer: +Yes. At some sampling rate X, the success rate of gpt-4o-mini exceeds that of gpt-4o with a sampling rate Y, where Y exceeds X divided by 16. 
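The cost condition in this answer follows from the stated 16x price ratio: gpt-4o-mini at X samples costs roughly X price units per question, while gpt-4o at Y samples costs roughly 16*Y, so the smaller model wins on cost whenever X < 16*Y, i.e. Y > X/16. A tiny sketch of that comparison, assuming similar token counts per sample across models (a simplification):

```python
def mini_is_cheaper(x_samples_mini, y_samples_4o, price_ratio=16):
    """Rough per-question cost comparison; gpt-4o costs `price_ratio`
    times more per token than gpt-4o-mini."""
    return x_samples_mini < price_ratio * y_samples_4o

print(mini_is_cheaper(64, 5))    # True: 64 < 80, equivalently Y=5 > X/16=4
print(mini_is_cheaper(200, 10))  # False: 200 >= 160
```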
\ No newline at end of file diff --git a/benchmark/experimentation_bench/llm_reasoning/ground_truth/q4.txt b/benchmark/experimentation_bench/llm_reasoning/ground_truth/q4.txt index 8c7dfa6..eff9519 100644 --- a/benchmark/experimentation_bench/llm_reasoning/ground_truth/q4.txt +++ b/benchmark/experimentation_bench/llm_reasoning/ground_truth/q4.txt @@ -1,3 +1,3 @@ -For achieving an 80% success rate, gpt-4 with 50 samples is the most reliable configuration. +# Answer: -The gpt-4o-mini model with 16 samples per question achieves the desired 80% success rate." +The cheapest configuration that is able to achieve a success rate of at least 80% will be provided by gpt-4o-mini. That is, there will be a sampling rate X where the success rate of gpt-4o-mini exceeds that of gpt-4o with a sampling rate Y, where Y exceeds X divided by 16. \ No newline at end of file diff --git a/benchmark/experimentation_bench/llm_reasoning/ground_truth/q5.txt b/benchmark/experimentation_bench/llm_reasoning/ground_truth/q5.txt index 91c60c0..6dacb5d 100644 --- a/benchmark/experimentation_bench/llm_reasoning/ground_truth/q5.txt +++ b/benchmark/experimentation_bench/llm_reasoning/ground_truth/q5.txt @@ -1 +1,4 @@ -(lack of ground truth, judge the conclusions based on your knowledge" +Increasing sampling temperature will increase the diversity of repeated sampling. +Therefore, the success rate or the quality of responses will increase when using a fixed number of samples. + +(Reference: Figure 2 in the Curie paper) \ No newline at end of file diff --git a/benchmark/experimentation_bench/llm_reasoning/ground_truth/q6.txt b/benchmark/experimentation_bench/llm_reasoning/ground_truth/q6.txt index 91c60c0..f00df11 100644 --- a/benchmark/experimentation_bench/llm_reasoning/ground_truth/q6.txt +++ b/benchmark/experimentation_bench/llm_reasoning/ground_truth/q6.txt @@ -1 +1,5 @@ -(lack of ground truth, judge the conclusions based on your knowledge" +The overall success rate of majority voting increases with the increase of number of samples. +The success rate improvement saturates quickly, +since the occurrence of rare, correct solutions does not affect the most common answer that majority voting chooses. + +(reference: Figure 7 in the paper.) \ No newline at end of file diff --git a/benchmark/experimentation_bench/llm_reasoning/misc/optional_setup.txt b/benchmark/experimentation_bench/llm_reasoning/misc/optional_setup.txt new file mode 100644 index 0000000..ad7d6dc --- /dev/null +++ b/benchmark/experimentation_bench/llm_reasoning/misc/optional_setup.txt @@ -0,0 +1,15 @@ +Initial starter files can be found under the '/starter_file/large_language_monkeys' directory. + +Here are more information: + +1. You can setup the python environment for llmonk following the /starter_file/large_language_monkeys/README.md. Clean up the files under ./logs. OpenAI Azure credentials are available under /starter_file/large_language_monkeys/env.sh + +2. You can use the generator for gsm8k to generate a few response per question by running `python llmonk/generate/gsm8k.py` using model gpt-4o-mini. Make sure to always provide a new "save-dir" whenever this program is called. + +3. You can evaluate the result of generated response under the saved dir from the generator by running `python llmonk/evaluate/math_datasets.py`. Make sure to always provide a new "save_dir" whenever this program is called, and this cannot be the same --save-dir as was used in step 2. + +4. 
Then, analyze the evaluation results files (these are yaml files) for each problem under the "save_dir" in step 3. Look for a line formatted as "success: X" in the file, where X is the metric (a single boolean value) you need to retrieve, and report the success. Here is how success is defined: it is true if the problem is successfully solved by at least one generated sample, otherwise it is false. There will be a line containing success, if you don't see that at all, you have made errors in the previous steps. + +Note: there is no need to modify the source code of any files within /starter_file/large_language_monkeys. Though you may choose to read the source code just to enhance your understanding of its inner workings. + +Note: it is alright if the success is false for most of the problems. Do not question the integrity of the source code of any files within /starter_file/large_language_monkeys. \ No newline at end of file diff --git a/benchmark/experimentation_bench/llm_reasoning/misc/optional_setup_q1.txt b/benchmark/experimentation_bench/llm_reasoning/misc/optional_setup_q1.txt new file mode 100644 index 0000000..58749d5 --- /dev/null +++ b/benchmark/experimentation_bench/llm_reasoning/misc/optional_setup_q1.txt @@ -0,0 +1,11 @@ +Initial starter files can be found under the '/starter_file/large_language_monkeys' directory. + +Here are more information: + +1. You can setup the python environment for llmonk following the README.md. OpenAI Azure credentials are available under env.sh. + +2. You can use the generator for gsm8k to generate a few response per question by running `python llmonk/generate/gsm8k.py` using model gpt-4o-mini. Make sure to always provide a new "save-dir" whenever this program is called. + +3. You can evaluate the result of generated response under the saved dir from the generator by running `python llmonk/evaluate/math_datasets.py`. Make sure to always provide a new "save_dir" whenever this program is called, and this cannot be the same --save-dir as was used in step 2. + +4. Then, analyze the evaluation results yaml files for each problem under the "save_dir" in step 3. Look for a line formatted as "success: X" in the file, where X is the metric (a single boolean value) you need to retrieve, and report the success. We define success as: at least one generated sample solve the problem. \ No newline at end of file diff --git a/benchmark/experimentation_bench/llm_reasoning/misc/optional_setup_q6.txt b/benchmark/experimentation_bench/llm_reasoning/misc/optional_setup_q6.txt new file mode 100644 index 0000000..a8252af --- /dev/null +++ b/benchmark/experimentation_bench/llm_reasoning/misc/optional_setup_q6.txt @@ -0,0 +1,17 @@ +The starter file can be found under "/starter_file/large_language_monkeys". + +Here are more information: + +1. You can setup the python environment for llmonk following the /starter_file/large_language_monkeys/README.md. Clean up the files under ./logs. OpenAI Azure credentials are available under /starter_file/large_language_monkeys/env.sh + +2. You can use the generator for gsm8k to generate a few response per question by running `python llmonk/generate/gsm8k.py` using model gpt-4o-mini. Make sure to always provide a new "save-dir" whenever this program is called. + +3. You can evaluate the result of generated response under the saved dir from the generator by running `python llmonk/evaluate/math_datasets.py`. 
Make sure to always provide a new "save_dir" whenever this program is called, and this cannot be the same --save-dir as was used in step 2. + +4. You can get the final response by aggregating the evaluation results for each problem using majority vote, +The voter counts the occurrences of each unique answer within the input set. It identifies the answer that appears most frequently. +You can determine the accuracy by comparing the ground truth answer and the aggregated response. + +Note: there is no need to modify the source code of any files within /starter_file/large_language_monkeys. Though you may choose to read the source code just to enhance your understanding of its inner workings. + +Note: it is alright if the accuracy is low for most of the problems. Do not question the integrity of the source code of any files within /starter_file/large_language_monkeys. \ No newline at end of file diff --git a/benchmark/experimentation_bench/llm_reasoning/q1_simple_relation.txt b/benchmark/experimentation_bench/llm_reasoning/q1_simple_relation.txt index a0ce393..26c2c24 100644 --- a/benchmark/experimentation_bench/llm_reasoning/q1_simple_relation.txt +++ b/benchmark/experimentation_bench/llm_reasoning/q1_simple_relation.txt @@ -1,16 +1,4 @@ -Initial starter files can be found under the '/starter_file/large_language_monkeys' directory. - -Here are more information: - -1. You can setup the python environment for llmonk following the README.md. OpenAI Azure credentials are available under env.sh. - -2. You can use the generator for gsm8k to generate a few response per question by running `python llmonk/generate/gsm8k.py` using model gpt-4o-mini. Make sure to always provide a new "save-dir" whenever this program is called. - -3. You can evaluate the result of generated response under the saved dir from the generator by running `python llmonk/evaluate/math_datasets.py`. Make sure to always provide a new "save_dir" whenever this program is called, and this cannot be the same --save-dir as was used in step 2. - -4. Then, analyze the evaluation results yaml files for each problem under the "save_dir" in step 3. Look for a line formatted as "success: X" in the file, where X is the metric (a single boolean value) you need to retrieve, and report the success. We define success as: at least one generated sample solve the problem. - -Here is your question: - -One way to scale language model inference compute is to repeatedly sample candidate solutions from a model. -How does the number of generated samples (--num_samples) per question impact the overall success? \ No newline at end of file +How does the number of generated samples (--num_samples) per question impact the overall success? +Additional details: +- The success rate is defined as the proportion of questions correctly answered by at least one generated sample. +- Use the GSM8K math dataset, and the gpt-4o-mini model. \ No newline at end of file diff --git a/benchmark/experimentation_bench/llm_reasoning/q2_math_relation.txt b/benchmark/experimentation_bench/llm_reasoning/q2_math_relation.txt index 2496096..42bc8b8 100644 --- a/benchmark/experimentation_bench/llm_reasoning/q2_math_relation.txt +++ b/benchmark/experimentation_bench/llm_reasoning/q2_math_relation.txt @@ -1,21 +1,5 @@ -Initial starter files can be found under the '/starter_file/large_language_monkeys' directory. - -Here are more information: - -1. You can setup the python environment for llmonk following the /starter_file/large_language_monkeys/README.md. 
Clean up the files under ./logs. OpenAI Azure credentials are available under /starter_file/large_language_monkeys/env.sh - -2. You can use the generator for gsm8k to generate a few response per question by running `python llmonk/generate/gsm8k.py` using model gpt-4o-mini. Make sure to always provide a new "save-dir" whenever this program is called. - -3. You can evaluate the result of generated response under the saved dir from the generator by running `python llmonk/evaluate/math_datasets.py`. Make sure to always provide a new "save_dir" whenever this program is called, and this cannot be the same --save-dir as was used in step 2. - -4. Then, analyze the evaluation results files (these are yaml files) for each problem under the "save_dir" in step 3. Look for a line formatted as "success: X" in the file, where X is the metric (a single boolean value) you need to retrieve, and report the success. Here is how success is defined: it is true if the problem is successfully solved by at least one generated sample, otherwise it is false. There will be a line containing success, if you don't see that at all, you have made errors in the previous steps. - -Note: there is no need to modify the source code of any files within /starter_file/large_language_monkeys. Though you may choose to read the source code just to enhance your understanding of its inner workings. - -Note: it is alright if the success is false for most of the problems. Do not question the integrity of the source code of any files within /starter_file/large_language_monkeys. - -Here is your question: - -One way to scale language model inference compute is to repeatedly sample candidate solutions from a model. What is the mathematical relationship between the number of generated samples (--num_samples) per question and the overall success rate? -For instance, does the rate of success scale linearly, quadratically, or follow another pattern as the number of generated samples increases? \ No newline at end of file +For instance, does the rate of success scale linearly, quadratically, or follow another pattern as the number of generated samples increases? +Additional details: +- The success rate is defined as the proportion of questions correctly answered by at least one generated sample. +- Use the GSM8K math dataset, and the gpt-4o-mini model. \ No newline at end of file diff --git a/benchmark/experimentation_bench/llm_reasoning/q3_model_size_scaling.txt b/benchmark/experimentation_bench/llm_reasoning/q3_model_size_scaling.txt index de62945..6160f41 100644 --- a/benchmark/experimentation_bench/llm_reasoning/q3_model_size_scaling.txt +++ b/benchmark/experimentation_bench/llm_reasoning/q3_model_size_scaling.txt @@ -1,21 +1,5 @@ -The starter file can be found under "/starter_file/large_language_monkeys". - -Here are more information: - -1. You can setup the python environment for llmonk following the /starter_file/large_language_monkeys/README.md. Clean up the files under ./logs. OpenAI Azure credentials are available under /starter_file/large_language_monkeys/env.sh - -2. You can use the generator for gsm8k to generate a few response per question by running `python llmonk/generate/gsm8k.py` using model gpt-4o-mini. Make sure to always provide a new "save-dir" whenever this program is called. - -3. You can evaluate the result of generated response under the saved dir from the generator by running `python llmonk/evaluate/math_datasets.py`. 
Make sure to always provide a new "save_dir" whenever this program is called, and this cannot be the same --save-dir as was used in step 2. - -4. Then, analyze the evaluation results files (these are yaml files) for each problem under the "save_dir" in step 3. Look for a line formatted as "success: X" in the file, where X is the metric (a single boolean value) you need to retrieve, and report the success. Here is how success is defined: it is true if the problem is successfully solved by at least one generated sample, otherwise it is false. There will be a line containing success, if you don't see that at all, you have made errors in the previous steps. - -Note: there is no need to modify the source code of any files within /starter_file/large_language_monkeys. Though you may choose to read the source code just to enhance your understanding of its inner workings. - -Note: it is alright if the success is false for most of the problems. Do not question the integrity of the source code of any files within /starter_file/large_language_monkeys. - -Here is your question: - -One way to scale language model inference compute is to repeatedly sample candidate solutions from a model. Considering that a larger, more capable model (e.g., gpt-4o) costs significantly more per query compared to a smaller model (e.g., gpt-4o-mini), would it be feasible to use the smaller model, sample more responses, and achieve comparable rate of success while being more cost-effective? -Note that GPT-4o is 16 times more expensive than gpt-4o-mini for both prompt and response tokens. \ No newline at end of file +Additional details: +- Note that GPT-4o is 16 times more expensive than gpt-4o-mini for both prompt and response tokens. +- The success rate is defined as the proportion of questions correctly answered by at least one generated sample. +- Use the GSM8K math dataset. \ No newline at end of file diff --git a/benchmark/experimentation_bench/llm_reasoning/q4_target_coverage.txt b/benchmark/experimentation_bench/llm_reasoning/q4_target_coverage.txt index 748be08..65c2425 100644 --- a/benchmark/experimentation_bench/llm_reasoning/q4_target_coverage.txt +++ b/benchmark/experimentation_bench/llm_reasoning/q4_target_coverage.txt @@ -1,21 +1,6 @@ -The starter file can be found under "/starter_file/large_language_monkeys". - -Here are more information: - -1. You can setup the python environment for llmonk following the /starter_file/large_language_monkeys/README.md. Clean up the files under ./logs. OpenAI Azure credentials are available under /starter_file/large_language_monkeys/env.sh - -2. You can use the generator for gsm8k to generate a few response per question by running `python llmonk/generate/gsm8k.py` using model gpt-4o-mini. Make sure to always provide a new "save-dir" whenever this program is called. - -3. You can evaluate the result of generated response under the saved dir from the generator by running `python llmonk/evaluate/math_datasets.py`. Make sure to always provide a new "save_dir" whenever this program is called, and this cannot be the same --save-dir as was used in step 2. - -4. Then, analyze the evaluation results files (these are yaml files) for each problem under the "save_dir" in step 3. Look for a line formatted as "success: X" in the file, where X is the metric (a single boolean value) you need to retrieve, and report the success. Here is how success is defined: it is true if the problem is successfully solved by at least one generated sample, otherwise it is false. 
There will be a line containing success, if you don't see that at all, you have made errors in the previous steps. - -Note: there is no need to modify the source code of any files within /starter_file/large_language_monkeys. Though you may choose to read the source code just to enhance your understanding of its inner workings. - -Note: it is alright if the success is false for most of the problems. Do not question the integrity of the source code of any files within /starter_file/large_language_monkeys. - -Here is your question: -One way to scale language model inference compute is to repeatedly sample candidate solutions from a model. -To achieve 80% success rate for gsm8k task, what is the most cost-effective configuration? +To achieve an 80% success rate on the GSM8K task, what is the most cost-effective configuration? Specifically, which model (gpt-4o-mini or gpt-4o) should be used, and how many samples per question should be generated to minimize cost? You will need to test at least 4 sample sizes, and make sure to test each of the chosen sample sizes on both gpt-4o-mini and gpt-4o. -Note that gpt-4o is 16 times more expensive than gpt-4o-mini for both prompt and response tokens. \ No newline at end of file +Additional details: +- Note that GPT-4o is 16 times more expensive than gpt-4o-mini for both prompt and response tokens. +- The success rate is defined as the proportion of questions correctly answered by at least one generated sample. +- Use the GSM8K math dataset. \ No newline at end of file diff --git a/benchmark/experimentation_bench/llm_reasoning/q5_temperature.txt b/benchmark/experimentation_bench/llm_reasoning/q5_temperature.txt index 9e6ed52..62afd19 100644 --- a/benchmark/experimentation_bench/llm_reasoning/q5_temperature.txt +++ b/benchmark/experimentation_bench/llm_reasoning/q5_temperature.txt @@ -1,21 +1,4 @@ - - -The starter file can be found under "/starter_file/large_language_monkeys". - -Here are more information: - -1. You can setup the python environment for llmonk following the /starter_file/large_language_monkeys/README.md. Clean up the files under ./logs. OpenAI Azure credentials are available under /starter_file/large_language_monkeys/env.sh - -2. You can use the generator for gsm8k to generate a few response per question by running `python llmonk/generate/gsm8k.py --temperature $temperature` using model gpt-4o. Make sure to always provide a new "save-dir" whenever this program is called. - -3. You can evaluate the result of generated response under the saved dir from the generator by running `python llmonk/evaluate/math_datasets.py`. Make sure to always provide a new "save_dir" whenever this program is called, and this cannot be the same --save-dir as was used in step 2. - -4. Then, analyze the evaluation results files (these are yaml files) for each problem under the "save_dir" in step 3. Look for a line formatted as "success: X" in the file, where X is the metric (a single boolean value) you need to retrieve, and report the success. Here is how success is defined: it is true if the problem is successfully solved by at least one generated sample, otherwise it is false. There will be a line containing success, if you don't see that at all, you have made errors in the previous steps. - -Note: there is no need to modify the source code of any files within /starter_file/large_language_monkeys. Though you may choose to read the source code just to enhance your understanding of its inner workings.
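For illustration, the per-problem success flags described above can be rolled up into the overall success rate with a short script. This is only a sketch: it assumes each evaluation result is a YAML file with a top-level `success` field and that the files sit directly under the evaluator's save directory (the directory name below is a placeholder).

```python
# Minimal sketch: compute the overall success rate from the evaluator's YAML files.
# Assumes each per-problem file under EVAL_DIR contains a "success: true/false" entry,
# as described in the instructions; the path and the "*.yaml" pattern are assumptions.
from pathlib import Path

import yaml  # PyYAML

EVAL_DIR = Path("eval_save_dir")  # hypothetical --save_dir passed to the evaluator

successes = 0
total = 0
for path in sorted(EVAL_DIR.glob("*.yaml")):
    with path.open() as f:
        record = yaml.safe_load(f)
    if not isinstance(record, dict) or "success" not in record:
        raise ValueError(f"No 'success' field in {path}; earlier steps likely went wrong")
    total += 1
    successes += bool(record["success"])  # True if >=1 sample solved the problem

if total:
    rate = successes / total
    print(f"success rate: {rate:.3f} ({successes}/{total} problems solved by at least one sample)")
```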
- -Note: it is alright if the success is false for most of the problems. Do not question the integrity of the source code of any files within /starter_file/large_language_monkeys. - -Here is your question: - How does varying the sampling temperature affect the diversity and quality of responses when using a fixed number of samples? +Additional details: +- The success rate is defined as the proportion of questions correctly answered by at least one generated sample. +- Use the GSM8K math dataset, and the gpt-4o-mini model. \ No newline at end of file diff --git a/benchmark/experimentation_bench/llm_reasoning/q6_major_vote.txt b/benchmark/experimentation_bench/llm_reasoning/q6_major_vote.txt index b93d0b5..55f3657 100644 --- a/benchmark/experimentation_bench/llm_reasoning/q6_major_vote.txt +++ b/benchmark/experimentation_bench/llm_reasoning/q6_major_vote.txt @@ -1,21 +1,3 @@ -The starter file can be found under "/starter_file/large_language_monkeys". - -Here are more information: - -1. You can setup the python environment for llmonk following the /starter_file/large_language_monkeys/README.md. Clean up the files under ./logs. OpenAI Azure credentials are available under /starter_file/large_language_monkeys/env.sh - -2. You can use the generator for gsm8k to generate a few response per question by running `python llmonk/generate/gsm8k.py` using model gpt-4o-mini. Make sure to always provide a new "save-dir" whenever this program is called. - -3. You can evaluate the result of generated response under the saved dir from the generator by running `python llmonk/evaluate/math_datasets.py`. Make sure to always provide a new "save_dir" whenever this program is called, and this cannot be the same --save-dir as was used in step 2. - -4. You can get the final response by aggregating the evaluation results for each problem using majority vote, -The voter counts the occurrences of each unique answer within the input set. It identifies the answer that appears most frequently. -You can determine the accuracy by comparing the ground truth answer and the aggregated response. - -Note: there is no need to modify the source code of any files within /starter_file/large_language_monkeys. Though you may choose to read the source code just to enhance your understanding of its inner workings. - -Note: it is alright if the accuracy is low for most of the problems. Do not question the integrity of the source code of any files within /starter_file/large_language_monkeys. - -Here is your question: One approach to scaling language model inference is to repeatedly sample candidate solutions from the model and aggregate them using majority voting. -How does the number of samples impact the overall accuracy on the GSM8K task? \ No newline at end of file +Majority voting picks the most common final answer +How does the number of samples impact the success rate on the GSM8K task? 
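The majority-vote aggregation referred to above amounts to picking the most frequent final answer among the generated samples. A small illustrative sketch follows; it is not the repository's actual voter, and the sample answers shown are made-up placeholders.

```python
# Illustrative sketch of majority voting over sampled final answers.
from collections import Counter

def majority_vote(answers):
    """Return the most frequent answer among the generated samples."""
    counts = Counter(a.strip() for a in answers if a is not None)
    if not counts:
        return None
    answer, _ = counts.most_common(1)[0]
    return answer

# Hypothetical example: five sampled final answers for one GSM8K question.
samples = ["42", "41", "42", "42", "7"]
print(majority_vote(samples))  # -> "42"
```

Note the difference between the two metrics used in these questions: the success rate counts a question as solved if any single sample is correct, while majority-vote accuracy requires the aggregated (most common) answer to match the ground truth.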
\ No newline at end of file diff --git a/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/additional/q1.txt b/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/additional/q1.txt new file mode 100644 index 0000000..555871a --- /dev/null +++ b/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/additional/q1.txt @@ -0,0 +1,39 @@ +# Design: + +{ + "constant_vars": [ + "method for increasing reasoning_steps=Auto-CoT", + "model=gpt-4o-mini" + ], + "independent_vars": [ + "reasoning_steps=at least 3 different positive reasoning step integer values, for each dataset", + "datasets"="gsm8k, last_letters" + ], + "dependent_vars": [ + "accuracy" + ], +} + +# Setup: + +Set up your Python environment as described in the repository documentation at: https://github.com/MingyuJ666/The-Impact-of-Reasoning-Step-Length-on-Large-Language-Models + +Inference Procedure + +Use run_inference.py with args.method set to auto_cot and args.model set to gpt-4o-mini. + +Run experiments on two datasets: gsm8k and last_letters. + +Systematically vary the number of reasoning steps by specifying different demo prompt files provided (e.g., gsm8k_1, gsm8k_2, last_letters_1, last_letters_3, etc.). + +Clearly organize and save outputs/logs for each reasoning-step configuration. + +Evaluating Accuracy + +Obtain accuracy results directly from the output logs generated by run_inference.py. Accuracy appears at the end of each log file and indicates the proportion of correctly solved problems for that run. + +Analyzing Results + +Compare the accuracy across different reasoning-step conditions for each dataset. + +Determine whether increasing reasoning steps improves accuracy and identify if a saturation point emerges. \ No newline at end of file diff --git a/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/additional/q10.txt b/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/additional/q10.txt new file mode 100644 index 0000000..40776a4 --- /dev/null +++ b/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/additional/q10.txt @@ -0,0 +1,67 @@ +# Design: + +{ + "constant_vars": [ + "method for increasing reasoning_steps=Auto-CoT", + "datasets"="gsm8k", + ], + "independent_vars": [ + "model=gpt-4o-mini, gpt-4o", + "reasoning_steps=use all reasoning steps for the gsm8k task, i.e., 1,2,3 steps" + ], + "dependent_vars": [ + "accuracy", + ], +} + +# Setup: + +1. Environment Preparation + +Ensure your Python environment and dependencies are correctly configured according to repository documentation in https://github.com/MingyuJ666/The-Impact-of-Reasoning-Step-Length-on-Large-Language-Models + +2. Select Dataset + +Use the dataset: gsm8k. + +3. Run Experiments for GPT-4o-mini + +Run inference using run_inference.py with: + +args.method: auto_cot + +args.model: gpt-4o-mini + +Systematically vary the number of reasoning steps by choosing appropriate demo files (e.g., gsm8k_1, gsm8k_2, gsm8k_3). + +Clearly save outputs and log files. + +4. Run Experiments for GPT-4o + +Repeat inference using: + +args.method: auto_cot + +args.model: gpt-4o + +Similarly vary reasoning steps with provided demo files. + +Save outputs and logs clearly. + +5. Evaluate Accuracy + +Extract accuracy metrics from the log files. + +Identify pairs of log files (one from each model) where the accuracies achieved are similar. + +6. 
Analyze and Summarize Findings + +For each comparable accuracy scenario, summarize: + +Dataset (gsm8k) + +Achieved accuracy + +Number of reasoning steps for each model + +Computational cost comparison \ No newline at end of file diff --git a/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/additional/q2.txt b/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/additional/q2.txt new file mode 100644 index 0000000..3824260 --- /dev/null +++ b/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/additional/q2.txt @@ -0,0 +1,45 @@ +# Design: + +{ + "constant_vars": [ + "method for increasing reasoning_steps=Auto-CoT", + "model=gpt-4o-mini" + ], + "independent_vars": [ + "reasoning_steps=use all reasoning steps available for both datasets, specifically, for gsm8k this will be 5,6,7 steps. For last_letters this will be 5,6,7,8 steps.", + "datasets"="gsm8k, last_letters" + ], + "dependent_vars": [ + "accuracy" + ], +} + +# Setup: + +1. Environment Preparation + +Ensure your Python environment and dependencies are correctly configured according to repository documentation in https://github.com/MingyuJ666/The-Impact-of-Reasoning-Step-Length-on-Large-Language-Models + +2. Run Inference Experiments + +Use run_inference.py to perform inference experiments with: + +args.method set to auto_cot + +args.model set to gpt-4o-mini + +Two datasets: gsm8k and last_letters + +Systematically vary the number of reasoning steps using different demo prompt files provided (i.e., gsm8k_1, gsm8k_2, gsm8k_3, last_letters_1, last_letters_2, last_letters_3, last_letters_4). + +Clearly save the inference outputs and logs for each configuration. + +3. Evaluating Accuracy + +Obtain accuracy results directly from the output logs generated by run_inference.py. Accuracy appears at the end of each log file and indicates the proportion of correctly solved problems for that run. + +4. Analyzing Results + +Compare the accuracy across different reasoning-step conditions for each dataset. + +Identify Optimal number of intermediate reasoning steps (as determined above). \ No newline at end of file diff --git a/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/additional/q3.txt b/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/additional/q3.txt new file mode 100644 index 0000000..94719a1 --- /dev/null +++ b/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/additional/q3.txt @@ -0,0 +1,69 @@ +# Design: + +{ + "constant_vars": [ + "model=gpt-4o-mini", + "datasets"="last_letters" + ], + "independent_vars": [ + "reasoning_steps=use at least 3 reasoning steps for Auto-CoT, and the two prompts mentioned in the question for Zero-shot-CoT", + "method for increasing reasoning_steps=Auto-CoT, Zero-shot-CoT" + ], + "dependent_vars": [ + "accuracy" + ], +} + +# Setup: + +1. Environment Preparation + +Ensure your Python environment and dependencies are correctly configured according to repository documentation in https://github.com/MingyuJ666/The-Impact-of-Reasoning-Step-Length-on-Large-Language-Models + +2. Run Experiments for Auto-CoT + +Use run_inference.py with the following configurations: + +args.method: auto_cot + +args.model: gpt-4o-mini + +Dataset: last_letters + +Systematically vary the number of reasoning steps by selecting different demo files that represent different step counts. + +Clearly save outputs and logs for each experiment. + +3. 
Run Experiments for Zero-shot-CoT + +Run run_inference.py with these parameters: + +args.method: zero_shot_cot + +args.model: gpt-4o-mini + +Dataset: last_letters + +Conduct two conditions: + +Original condition: Use the default prompt ("Let's think step by step."). + +Modified condition: Update the prompt (args.cot_trigger) to explicitly request additional reasoning (e.g., "Let's think step by step. You must think more steps."). + +Save outputs and logs separately. + +4. Evaluate Results + +Extract accuracy metrics from the logs for each experiment configuration. + +Accuracy is the proportion of correctly solved questions. + +5. Analyze and Report + +Compare accuracy across: + +Different reasoning-step counts for Auto-CoT. + +Original versus modified prompting in Zero-shot-CoT. + +Summarize your findings clearly, noting the dataset, demo conditions, and observed accuracy differences. \ No newline at end of file diff --git a/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/additional/q4.txt b/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/additional/q4.txt new file mode 100644 index 0000000..99e9b48 --- /dev/null +++ b/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/additional/q4.txt @@ -0,0 +1,61 @@ +# Design: + +{ + "constant_vars": [ + "model=gpt-4o-mini", + "method for increasing reasoning_steps=Auto-CoT", + "datasets"="last_letters" + ], + "independent_vars": [ + "reasoning_demo=use the last_letters_false demo, and some other demo with arbitrary reasoning steps" + ], + "dependent_vars": [ + "accuracy" + ], +} + +# Setup: + +1. Environment Preparation + +Ensure your Python environment and dependencies are correctly configured according to repository documentation in https://github.com/MingyuJ666/The-Impact-of-Reasoning-Step-Length-on-Large-Language-Models + +2. Run Experiments with Auto-CoT + +Use run_inference.py with these parameters: + +args.method: auto_cot + +args.model: gpt-4o-mini + +Dataset: last_letters + +Conduct experiments using two types of demos: + +Incorrect reasoning demo: Contains deliberate errors in intermediate reasoning steps and an incorrect final answer (i.e., last_letters_false). + +Correct reasoning demo: Contains accurate intermediate reasoning steps and a correct final answer (e.g., last_letters_6). + +Optional: Adjust args.max_length_cot if necessary to accommodate longer reasoning outputs. + +3. Evaluate Accuracy + +For each condition, run inference and clearly record accuracy from the logs. + +Accuracy is the proportion of correctly solved problems on the dataset. + +4. Analyze and Compare Results + +Compare accuracy between: + +The demo with incorrect reasoning (last_letters_false). + +The demo with correct reasoning (e.g., last_letters_6). + +Assess how errors in intermediate reasoning influence final accuracy. + +5. Summarize Findings + +Clearly summarize the differences in accuracy. + +Provide examples illustrating how the model's outputs and behaviors differ between conditions with correct vs. incorrect intermediate reasoning. 
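The correct-versus-incorrect demo comparison above can be automated with a small driver along the following lines. This is a sketch under assumptions: it reuses the `--dataset`, `--demo_path`, and `--output_dir` flags shown in the example commands elsewhere in these setup files, it assumes the method/model are already configured to auto_cot and gpt-4o-mini as instructed, and the regex used to pull the accuracy out of each log is a guess about the log format (the instructions only say the accuracy appears at the end of the log).

```python
# Sketch of a driver for the incorrect-vs-correct demo comparison described above.
import re
import subprocess
from pathlib import Path

REPO = Path("/starter_file/cot_reasoning_step")
DEMOS = ["last_letters_false", "last_letters_6"]  # incorrect vs. correct reasoning demos

for demo in DEMOS:
    log_path = REPO / "log" / "gpt-4o-mini" / f"{demo}.log"
    log_path.parent.mkdir(parents=True, exist_ok=True)
    cmd = [
        "python", "run_inference.py",
        "--dataset", "last_letters",
        "--demo_path", f"demo/{demo}",
        "--output_dir", f"experiment/gpt-4o-mini/{demo}",
    ]
    with log_path.open("w") as log_file:
        subprocess.run(cmd, cwd=REPO, stdout=log_file, stderr=subprocess.STDOUT, check=True)

    # The accuracy is reported near the end of the log; the exact line format is assumed.
    matches = re.findall(r"accuracy[^0-9]*([0-9]+(?:\.[0-9]+)?)",
                         log_path.read_text(errors="ignore"), re.IGNORECASE)
    print(demo, "->", matches[-1] if matches else "accuracy line not found")
```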
\ No newline at end of file diff --git a/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/additional/q5.txt b/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/additional/q5.txt new file mode 100644 index 0000000..3501d90 --- /dev/null +++ b/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/additional/q5.txt @@ -0,0 +1,58 @@ +# Design: + +{ + "constant_vars": [ + "model=gpt-4o-mini", + "method for increasing reasoning_steps=Auto-CoT" + ], + "independent_vars": [ + "reasoning_steps=use at least 3 reasoning steps for each dataset", + "datasets"="gsm8k, last_letters" + ], + "dependent_vars": [ + "accuracy", + "llm_api_cost: api cost for gpt-4o-mini" + ], +} + +# Setup: + +1. Environment Preparation + +Ensure your Python environment and dependencies are correctly configured according to repository documentation in https://github.com/MingyuJ666/The-Impact-of-Reasoning-Step-Length-on-Large-Language-Models + +2. Select Datasets + +Use two datasets: gsm8k and last_letters. + +3. Run Experiments with Varying Reasoning Steps + +Use run_inference.py with these parameters: + +args.method: auto_cot + +args.model: gpt-4o-mini + +Datasets: gsm8k and last_letters + +Systematically vary the number of reasoning steps by choosing appropriate demo files (e.g., gsm8k_1, gsm8k_2, gsm8k_3, last_letters_1, last_letters_2, last_letters_3). + +Clearly save output logs for analysis. + +4. Evaluate Accuracy + +Review logs to extract accuracy metrics, which appear at the end of each log file. + +Identify the demo file and reasoning-step configuration yielding the highest accuracy for each dataset. + +5. Evaluate Computational Cost + +Calculate LLM API cost for the gpt-4o-mini model. + +6. Report Findings + +Clearly summarize for each dataset: + +Dataset name. + +Optimal number of reasoning steps balancing accuracy and computational cost. \ No newline at end of file diff --git a/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/additional/q6.txt b/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/additional/q6.txt new file mode 100644 index 0000000..7d42dc4 --- /dev/null +++ b/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/additional/q6.txt @@ -0,0 +1,67 @@ +# Design: + +{ + "constant_vars": [ + "method for increasing reasoning_steps=Auto-CoT", + "datasets"="last_letters" + ], + "independent_vars": [ + "reasoning_steps=use at least 3 reasoning steps for each dataset", + "model=gpt-4o-mini, gpt-4o", + ], + "dependent_vars": [ + "accuracy", + ], +} + +# Setup: + +1. Environment Preparation + +Ensure your Python environment and dependencies are correctly configured according to repository documentation in https://github.com/MingyuJ666/The-Impact-of-Reasoning-Step-Length-on-Large-Language-Models + +2. Select Dataset + +Use the dataset: last_letters. + +3. Run Experiments for GPT-4o-mini + +Run inference using run_inference.py with parameters: + +args.method: auto_cot + +args.model: gpt-4o-mini + +Systematically vary the number of reasoning steps by choosing appropriate demo files (e.g., last_letters_1, last_letters_2, ..., last_letters_6). + +Clearly save the outputs and log files separately. + +4. Run Experiments for GPT-4o + +Run inference again using run_inference.py, changing: + +args.method: auto_cot + +args.model: gpt-4o + +Similarly, vary the reasoning steps with corresponding demo files. + +Clearly save outputs and logs separately. + +5. Evaluate Accuracy + +Review accuracy metrics recorded in each log file. 
+ +Identify the demo file achieving the highest accuracy for each model. + +6. Analyze and Summarize Results + +Summarize clearly for each model (gpt-4o-mini and gpt-4o): + +Dataset: last_letters + +Highest accuracy achieved + +Optimal number of reasoning steps + +Compare and discuss differences between the two models' optimal reasoning steps. \ No newline at end of file diff --git a/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/additional/q7.txt b/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/additional/q7.txt new file mode 100644 index 0000000..ea6ed4e --- /dev/null +++ b/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/additional/q7.txt @@ -0,0 +1,63 @@ +# Design: + +{ + "constant_vars": [ + "datasets"="gsm8k" + "model=gpt-4o-mini", + "method for increasing reasoning_steps=Auto-CoT", + ], + "independent_vars": [ + "reasoning_expansion_methods=Repeating the question, Self-verification, Making equations, Think about words", + ], + "dependent_vars": [ + "accuracy", + ], +} + +# Setup: + +1. Environment Preparation + +Ensure your Python environment and dependencies are correctly configured according to repository documentation in https://github.com/MingyuJ666/The-Impact-of-Reasoning-Step-Length-on-Large-Language-Models + +2. Select Dataset + +Use the dataset: gsm8k. + +3. Run Experiments with Different Reasoning Strategies + +Use run_inference.py with these fixed parameters: + +args.method: auto_cot + +args.model: gpt-4o-mini + +args.dataset: gsm8k + +Evaluate multiple reasoning expansion strategies using provided demo files, that is: + +Repeating the question (gsm8k_readquestion) + +Self-verification (gsm8k_selfverification) + +Making equations (gsm8k_makeequations) + +Think about words (gsm8k_thinkaboutwords) + +Run inference separately for each reasoning strategy and save logs clearly labeled by strategy. + +4. Evaluate Accuracy + +Extract accuracy from the logs generated for each reasoning strategy (accuracy reported at the end of each log). + +5. Compare and Summarize Results + +For each reasoning strategy, clearly summarize: + +Dataset (gsm8k) + +Strategy tested (e.g., repeating the question, self-verification) + +Accuracy achieved + +Analyze and determine which reasoning expansion strategies performed better. \ No newline at end of file diff --git a/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/additional/q8.txt b/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/additional/q8.txt new file mode 100644 index 0000000..97a3bd7 --- /dev/null +++ b/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/additional/q8.txt @@ -0,0 +1,51 @@ +# Design: + +{ + "constant_vars": [ + "method for increasing reasoning_steps=Auto-CoT", + "model=gpt-4o-mini", + ], + "independent_vars": [ + "datasets"="gsm8k, last_letters", + "reasoning_steps=use at least 3 reasoning steps for each dataset", + ], + "dependent_vars": [ + "accuracy", + ], +} + +# Setup: + +1. Environment Preparation + +Ensure your Python environment and dependencies are correctly configured according to repository documentation in https://github.com/MingyuJ666/The-Impact-of-Reasoning-Step-Length-on-Large-Language-Models + +2. Datasets and Reasoning Steps + +Select two datasets for testing: gsm8k (math reasoning) and last_letters (pattern recognition). + +Use provided demo files to systematically vary the number of reasoning steps (e.g., gsm8k_1, gsm8k_2, last_letters_3, etc.). + +3. 
Run Experiments + +Call run_inference.py using the following parameters: + +args.method: auto_cot + +args.model: gpt-4o-mini + +Execute multiple runs per dataset, incrementally adjusting the reasoning steps through corresponding demo files. + +Compare dataset task complexity assessments (simple analysis is fine, even if it is in the conclusion) with optimal reasoning steps. + +4. Analyze and Summarize Findings + +Summarize for each dataset clearly: + +Dataset name + +Optimal reasoning steps identified + +Task complexity analysis + +Discuss insights regarding how task complexity influences the optimal reasoning chain length. \ No newline at end of file diff --git a/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/additional/q9.txt b/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/additional/q9.txt new file mode 100644 index 0000000..307c748 --- /dev/null +++ b/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/additional/q9.txt @@ -0,0 +1,63 @@ +# Design: + +{ + "constant_vars": [ + "method for increasing reasoning_steps=Auto-CoT", + "model=gpt-4o-mini", + "datasets"="gsm8k", + ], + "independent_vars": [ + "reasoning_demo=use the gsm8k_early demo, and the gsm8k_later demo" + ], + "dependent_vars": [ + "accuracy", + ], +} + +# Setup: + +1. Environment Preparation + +Ensure your Python environment and dependencies are correctly configured according to repository documentation in https://github.com/MingyuJ666/The-Impact-of-Reasoning-Step-Length-on-Large-Language-Models + +2. Select Dataset and Demos + +Dataset: gsm8k (math reasoning). + +Test two demo conditions that differ by error placement: + +Early errors demo (gsm8k_early) + +Later errors demo (gsm8k_later) + +3. Run Experiments + +Use run_inference.py with the following parameters: + +args.method: auto_cot + +args.model: gpt-4o-mini + +Vary args.demo_path to use each demo condition (gsm8k_early, gsm8k_later). + +Save inference outputs and logs clearly labeled by error condition. + +4. Evaluate Accuracy + +Review accuracy from log files for each condition (accuracy reported at the end of each log). + +5. Analyze Results + +Summarize results clearly, noting: + +Dataset (gsm8k) + +Accuracy for demo with early errors + +Accuracy for demo with later errors + +6. Draw Conclusions + +Discuss how error position affects model performance: + +Is the impact on accuracy greater when errors occur earlier versus later in reasoning? \ No newline at end of file diff --git a/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/q1.txt b/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/q1.txt index 0b2916f..3152bb2 100644 --- a/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/q1.txt +++ b/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/q1.txt @@ -1,15 +1,17 @@ -"Increasing the number of reasoning steps in a Chain of Thought (CoT) prompt will lead to higher accuracy in Large Language Models (LLMs) up to a saturation point. When using gpt-4o-mini &auto_cot +# Answer: -Example: +Increasing the number of reasoning steps in a Chain of Thought (CoT) prompt will lead to higher accuracy in Large Language Models (LLMs) up to a saturation point when using gpt-4o-mini with auto_cot, for both datasets. 
-Accuracy for dataset +Example from an actual run: + +Accuracy for dataset: + Gsm8k -Add 1 step:92.6 (5 steps in total) -Add 2 steps:92.8 -Add 3 steps:93.5 +Add 1 step: 92.6 (5 steps in total) +Add 2 steps: 92.8 (6 steps in total) +Add 3 steps: 93.5 (7 steps in total) + Last letters -Add 1 step:84.9 (5 steps in total) -Add 2 steps:92.2 -Add 3 steps:93.2 -Add 4 steps:93.2 -" +Add 1 step: 84.9 (5 steps in total) +Add 2 steps: 92.2 (6 steps in total) +Add 3 steps: 93.2 (7 steps in total) +Add 4 steps: 93.2 (8 steps in total) \ No newline at end of file diff --git a/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/q10.txt b/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/q10.txt index e6de955..163980c 100644 --- a/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/q10.txt +++ b/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/q10.txt @@ -1,2 +1,3 @@ -"Comparing gpt-4o and gpt-4o-mini, GPT-4o exhibits a higher level of optimal accuracy compared to GPT-4o-mini, making it more reliable for tasks that demand precision and correctness in output. -And GPT-4o-mini is significantly more cost-effective than GPT-4o. ($1.7 compared to $24) +# Answer: + +Comparing gpt-4o and gpt-4o-mini, GPT-4o exhibits a higher level of optimal accuracy compared to GPT-4o-mini, making it more reliable for tasks that demand precision and correctness in output, even when we increase the number of reasoning steps for gpt-4o to the maximum provided for this task. \ No newline at end of file diff --git a/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/q2.txt b/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/q2.txt index 7f47019..6ecc5c6 100644 --- a/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/q2.txt +++ b/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/q2.txt @@ -1,3 +1,7 @@ -"For multi-step reasoning tasks, the optimal number of reasoning steps to achieve a high accuracy will vary, depending on the problem type. Gsm8k: more than 7 steps -Last letters:7 steps, the accuracy is equal to 8 steps +# Answer: +For multi-step reasoning tasks, the optimal number of reasoning steps to achieve the highest accuracy will vary, depending on the problem type. Specifically: + +Gsm8k: more than 7 steps + +Last letters: 7 steps; the accuracy is equal to that of 8 steps \ No newline at end of file diff --git a/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/q3.txt b/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/q3.txt index 89b11d3..1f63376 100644 --- a/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/q3.txt +++ b/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/q3.txt @@ -1,3 +1,3 @@ -"The implementation of different prompting methods like Zero-shot, Auto-CoT and their impact on accuracy can be systematically improved by varying the number of reasoning steps, without adding new content, in a tightly controlled experiment setting, by using methods such as adding sentences that restate the question to increase steps. For zero-shot, add a prompt ""you must think more steps"" after “Let’s think step by step” to increase the reasoning steps. -For Auto-Cot, few-shots, add reasoning steps in demo(as an example provided in prompt). Adding more steps in reasoning will increase the accuracy, both for auto_cot and zero_cot +# Answer: +Adding more steps in reasoning will increase the accuracy, both for auto_cot and zero_shot_cot.
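Once the per-step accuracies have been collected from the logs, the saturation behaviour described in these answers can be checked mechanically. The sketch below reuses the example numbers quoted above; the 0.5-point tolerance is an arbitrary illustrative choice, not part of the benchmark.

```python
# Sketch: find the point where adding more reasoning steps stops paying off,
# using the example accuracies quoted in the answer above (tolerance is arbitrary).
def saturation_point(acc_by_added_steps, tolerance=0.5):
    """Return the smallest number of added steps whose accuracy is within
    `tolerance` points of the best accuracy observed."""
    best = max(acc_by_added_steps.values())
    for steps in sorted(acc_by_added_steps):
        if best - acc_by_added_steps[steps] <= tolerance:
            return steps

gsm8k = {1: 92.6, 2: 92.8, 3: 93.5}
last_letters = {1: 84.9, 2: 92.2, 3: 93.2, 4: 93.2}

print("gsm8k saturates around", saturation_point(gsm8k), "added steps")
print("last_letters saturates around", saturation_point(last_letters), "added steps")
```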
\ No newline at end of file diff --git a/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/q4.txt b/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/q4.txt index d7fea14..2d9810f 100644 --- a/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/q4.txt +++ b/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/q4.txt @@ -1,4 +1,6 @@ -"The impact of an incorrect step on the overall performance of the LLM is task dependent; in tasks with process oriented steps, a minor error in one step may have little impact on overall performance, compared to symbolic reasoning or logic tasks where errors may lead to significant deterioration of performance. As we use the examples with inccorect steps, the effect of an incorrect step on overall accuracy is limited. For Last letter, -False accuracy:89.4 -Correct accuracy:93.2 +# Answer: +In tasks with process-oriented steps, a minor error in one step may have little impact on overall performance. When we use the examples with incorrect steps, the effect of an incorrect step on overall accuracy is limited. As seen with the Last letter dataset, the impact we observed through a test run would look something like this: + +False accuracy: 89.4 +Correct accuracy: 93.2 \ No newline at end of file diff --git a/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/q5.txt b/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/q5.txt index a10934c..3c97731 100644 --- a/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/q5.txt +++ b/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/q5.txt @@ -1,3 +1,4 @@ -"When using gpt-4o-mini, the optimal number of reasoning steps for -Gsm8k,it's more than adding 3 steps. For Last_letters, it's adding 3 steps. Because for last_letters, the accuracy of adding 3 steps is equal to the accuracy of adding 4 steps +# Answer: +When using gpt-4o-mini, the optimal number of reasoning steps for +Gsm8k is more than adding 3 steps. For Last_letters, it's adding 3 steps, because the accuracy of adding 3 steps is equal to the accuracy of adding 4 steps. \ No newline at end of file diff --git a/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/q6.txt b/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/q6.txt index 0639cc5..7df1c65 100644 --- a/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/q6.txt +++ b/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/q6.txt @@ -1,9 +1,7 @@ -"The optimal number of reasoning steps vary across different LLMs. Using auto_cot, -for gsm8k. For larger model, adding more steps seems to have a higher upperbound for accuracy. +# Answer: - -for instance: -Gpt-4o-mini: add 3 steps -Gpt-4o:add 3 steps -for Last_letters +The optimal number of reasoning steps varies across different LLMs. For a larger model, adding more steps seems to have a higher upper bound for accuracy.
+ +For Last_letters: Gpt-4o-mini: add 3 steps -Gpt-4o: add 4 steps" +Gpt-4o: add 4 steps \ No newline at end of file diff --git a/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/q7.txt b/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/q7.txt index 5c5e79b..840db8c 100644 --- a/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/q7.txt +++ b/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/q7.txt @@ -1,8 +1,10 @@ -"The effectiveness of a certain method (e.g.,Read question, Selfverification, Making equations, Repeat state, Think about words))largely depends on the nature of the question. For math questions dataset, such as gsm8k, making equations improves the accuracy. +# Answer: Ground truth: Equations, think about words, and self-verification are better, than using repeating the question +This is an example of approximately the results you should see obtained: + Repeating the question:93.1 Self-verification:93.4 Making equations:93.5 -Think about words:93.6 +Think about words:93.6 \ No newline at end of file diff --git a/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/q8.txt b/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/q8.txt index 316e988..da87463 100644 --- a/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/q8.txt +++ b/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/q8.txt @@ -1,49 +1,8 @@ -"Ground truth: More complex tasks with higher logical and mathematical operations required more reasoning steps, whereas simpler pattern recognition tasks required fewer steps before performance declined. +# Answer: -Example: +More complex tasks with higher logical and mathematical operations required more reasoning steps, whereas simpler pattern recognition tasks required fewer steps before performance declined. -#### Control Group Results (`gsm8k` with `gsm8k_1`): -- **Result 1 Accuracy:** 92.5 -- **Result 2 Accuracy:** 90.0 +Correctly identify gsm8k as the more complex task among the two. -#### Experimental Group Results: - -##### `gsm8k` Dataset -- **Demo File:** `gsm8k_2` - - **Result 1 Accuracy:** 90.0 - - **Result 2 Accuracy:** 92.5 -- **Demo File:** `gsm8k_3` - - **Result 1 & 2 Accuracy:** 92.5 - -##### `last_letters` Dataset -- **Demo File:** `last_letters_1` - - **Result 1 Accuracy:** 90.0 - - **Result 2 Accuracy:** 92.5 -- **Demo File:** `last_letters_2` - - **Result 1 & 2 Accuracy:** 95.0 -- **Demo File:** `last_letters_3` - - **Result 1 & 2 Accuracy:** 95.0 -- **Demo File:** `last_letters_4` - - **Result 1 & 2 Accuracy:** 95.0 -- **Demo File:** `last_letters_5` - - **Result 1 Accuracy:** 95.0 - - **Result 2 Accuracy:** 92.5 -- **Demo File:** `last_letters_6` - - **Result 1 & 2 Accuracy:** 57.5 -- **Demo File:** `last_letters_10` - - **Result 1 & 2 Accuracy:** 0.0 - -### Analysis and Conclusion - -1. **Task Complexity and Reasoning Steps:** - - For `gsm8k`, the accuracy was higher with demo files that added more reasoning steps (`gsm8k_3`). - - For `last_letters`, demo files with moderate reasoning steps (`last_letters_2`, `last_letters_3`, `last_letters_4`) had the highest accuracy. - - `last_letters_6` with longer reasoning steps showed a drop in accuracy, indicating a threshold beyond which additional reasoning steps are detrimental. - - `last_letters_10` resulted in 0% accuracy, suggesting excessive reasoning steps led to failure in task performance. - -2. **Optimal Reasoning Steps:** - - `gsm8k`: Optimal steps are seen in `gsm8k_3`. 
- - `last_letters`: Optimal steps are seen in `last_letters_2`, `last_letters_3`, `last_letters_4`. - -3. **Impact of Task Complexity:** - - More complex tasks with higher logical and mathematical operations required more reasoning steps, whereas simpler pattern recognition tasks required fewer steps before performance declined." +gsm8k`: Optimal steps are seen in `gsm8k_3`. +`last_letters`: Optimal steps are seen starting in `last_letters_2`, `last_letters_3`, `last_letters_4`. \ No newline at end of file diff --git a/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/q9.txt b/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/q9.txt index fa06f66..c7fcb90 100644 --- a/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/q9.txt +++ b/benchmark/experimentation_bench/llm_reasoning_2/ground_truth/q9.txt @@ -1,22 +1,3 @@ -"Ground truth: The hypothesis that early errors are more detrimental to the overall reasoning process than later errors is supported by the data. Early errors disrupt the logical flow more significantly, impacting the model's performance. In contrast, later errors allow the model to maintain a higher accuracy, showing less impact on overall performance. +# Answer: -#### Control Group -- **Dataset:** gsm8k -- **Accuracy:** 92.5% (consistent across two runs) - -#### Experimental Group -- **Dataset:** gsm8k -- **Accuracy with Early Errors:** 92.5% (consistent across two runs) -- **Accuracy with Later Errors:** 95.0% (consistent across two runs) - -### Analysis -The experimental results provide a clear comparison between the impacts of early and later errors in the reasoning chain: - -1. **Early Errors:** - - The accuracy remains at 92.5%, which is consistent with the control group, indicating that early errors significantly affect the logical flow, maintaining a similar accuracy as when no errors are introduced. - -2. **Later Errors:** - - The accuracy improves to 95.0%, suggesting that later errors are less detrimental, allowing the logical process to achieve higher accuracy. - -### Conclusion -The hypothesis that early errors are more detrimental to the overall reasoning process than later errors is supported by the data. Early errors disrupt the logical flow more significantly, impacting the model's performance. In contrast, later errors allow the model to maintain a higher accuracy, showing less impact on overall performance." +Early errors are more detrimental to the overall reasoning process than later errors. Early errors disrupt the logical flow more significantly, impacting the model's performance. In contrast, later errors allow the model to maintain a higher accuracy, showing less impact on overall performance. \ No newline at end of file diff --git a/benchmark/experimentation_bench/llm_reasoning_2/misc/optional_setup_q1.txt b/benchmark/experimentation_bench/llm_reasoning_2/misc/optional_setup_q1.txt new file mode 100644 index 0000000..0e13292 --- /dev/null +++ b/benchmark/experimentation_bench/llm_reasoning_2/misc/optional_setup_q1.txt @@ -0,0 +1,34 @@ +The code you need is available in `/starter_file/cot_reasoning_step` + +Instructions: + +1. You will call `run_inference.py` with the following parameters. Make sure to read `run_inference.py` via cat first so you understand its contents. 
+ +- Set the args.method to auto_cot +- Set the args.model to gpt-4o-mini +- Set the args.dataset to test 2 datasets: gsm8k and last_letters +- Example command: + +``` +python run_inference.py --dataset last_letters --demo_path demo/last_letters_3 --output_dir experiment/gpt-4o-mini/last_letters_3 > log/gpt-4o-mini/last_letters_3.log #1 represent the number of add step, last_letters is the name of the dataset + +python run_inference.py --dataset gsm8k --demo_path demo/gsm8k_1 --output_dir experiment/gpt-4o-mini/gsm8k_1 > log/gpt-4o-mini/gsm8k_1.log +``` +- Execute these within the directory `/starter_file/cot_reasoning_step`. This will ensure you don't face file path errors when running the commands above. +- Here are the demos available (you don't need to test all of them, just what makes sense): +gsm8k_2 +gsm8k_3 +gsm8k_1 +last_letters_1 +last_letters_10 +last_letters_2 +last_letters_3 +last_letters_4 +last_letters_5 +last_letters_6 + +2. Call `run_inference.py` with an increasing number of reasoning steps. You can change the reasoning steps using different demos. In the above examples, last_letters_3 refers to adding three reasoning steps, while last_letters_1 refers to adding one reasoning step. + + Optional: You can increase the args.max_length_cot in case the model's output is truncated, and you need to view all of it. This may be useful when increasing the CoT steps. + +3. The accuracy is at the end of the printed log file. diff --git a/benchmark/experimentation_bench/llm_reasoning_2/misc/optional_setup_q10.txt b/benchmark/experimentation_bench/llm_reasoning_2/misc/optional_setup_q10.txt new file mode 100644 index 0000000..af1073a --- /dev/null +++ b/benchmark/experimentation_bench/llm_reasoning_2/misc/optional_setup_q10.txt @@ -0,0 +1,55 @@ +The code you need is available in `/starter_file/cot_reasoning_step` + +Instructions: +1. Set OpenAI credentials: +``` +source /curie/setup/env.sh +``` +2. You will call `run_inference.py` with the following parameters. Make sure to read `run_inference.py` via cat first so you understand its contents. + +- Set the args.method to auto_cot +- Set the args.model to gpt-4o-mini +- Set the args.dataset to test 1 dataset: gsm8k + +``` +python run_inference.py --dataset gsm8k --demo_path demo/gsm8k_3 --output_dir experiment/gpt-4o-mini/gsm8k_3 > log/gpt-4o-mini/gsm8k_3.log +``` +- Execute these within the directory `/starter_file/cot_reasoning_step`. This will ensure you don't face file path errors when running the commands above. +- Here are the demos available (you don't need to test all of them, just what makes sense): +gsm8k_2 +gsm8k_3 +gsm8k_1 + +3. Test with **gpt-4o-mini**: + You will call `run_inference.py` with the following parameters. Make sure to read `run_inference.py` via cat first so you understand its contents. + +- Set the following parameters: + - `args.method`: `auto_cot` + - `args.model`: `gpt-4o-mini` + +4. Call run_inference.py with increasing number of reasoning steps. You can change the reasoning steps using different demo. In the above examples, last_letters_3 refers to add 3 reasoning steps, while last_letters_1 would refer to add 1 reasoning step. + Optional: You can increase the args.max_length_cot in case the output of the model is truncated, and you need to view all of it. This may be useful when increasing the steps of cot. + +5. Test with **gpt-4o**: + You will call `run_inference.py` with the following parameters. Make sure to read `run_inference.py` via cat first so you understand its contents. 
+ - Set the following parameters: + - `args.method`: `auto_cot` + - `args.model`: `gpt-4o` + +6. Call run_inference.py with increasing number of reasoning steps. You can change the reasoning steps using different demo. In the above examples, last_letters_3 refers to add 3 reasoning steps, while last_letters_1 would refer to add 1 reasoning step. + Optional: You can increase the args.max_length_cot in case the output of the model is truncated, and you need to view all of it. This may be useful when increasing the steps of cot. + +7. Compare costs for similar accuracy: + - Find two log files—one from `gpt-4o-mini` and one from `gpt-4o`—with similar accuracy. + - Use the `cost.py` file to compute the computational cost for each case: + - Modify the input and output parameters in `cost.py` to reflect the models and log files you are comparing. + - Choose the total_cost_4o_mini as the actual cost if you were using gpt-4o-mini + - Run `cost.py` to compute the cost for each model and reasoning chain configuration. + +8. Analyze and report: + - Summarize your findings, including: + - Dataset name. + - Accuracy for both models. + - Reasoning steps for each model. + - Computational cost for achieving similar accuracy. + - Discuss whether it is more cost-effective to use a smaller model (e.g., `gpt-4o-mini`) with longer reasoning chains or a larger model (e.g., `gpt-4o`) with fewer steps. diff --git a/benchmark/experimentation_bench/llm_reasoning_2/misc/optional_setup_q2.txt b/benchmark/experimentation_bench/llm_reasoning_2/misc/optional_setup_q2.txt new file mode 100644 index 0000000..8f38bf7 --- /dev/null +++ b/benchmark/experimentation_bench/llm_reasoning_2/misc/optional_setup_q2.txt @@ -0,0 +1,53 @@ +The code you need is available in `/starter_file/cot_reasoning_step` + +Instructions: +1. Set OpenAI credentials: +``` +source /curie/setup/env.sh +``` +2. You will call `run_inference.py` with the following parameters. Make sure to read `run_inference.py` via cat first so you understand its contents. + +- Set the args.method to auto_cot +- Set the args.model to gpt-4o-mini +- Set the args.dataset to test 2 datasets: gsm8k and last_letters +- Example command: + +``` +python run_inference.py --dataset last_letters --demo_path demo/last_letters_3 --output_dir experiment/gpt-4o-mini/last_letters_3 > log/gpt-4o-mini/last_letters_3.log #3 represent the number of add step, last_letters is the name of the dataset +python run_inference.py --dataset gsm8k --demo_path demo/gsm8k_3 --output_dir experiment/gpt-4o-mini/gsm8k_3 > log/gpt-4o-mini/gsm8k_3.log +``` +- Execute these within the directory `/starter_file/cot_reasoning_step`. This will ensure you don't face file path errors when running the commands above. +- Here are the demos available (you don't need to test all of them, just what makes sense): +gsm8k_2 +gsm8k_3 +gsm8k_1 +last_letters_1 +last_letters_2 +last_letters_3 + +3. Identify the log file with the highest accuracy: + a. Review log files in the `log/` directory. + b. Locate the file with the **highest accuracy** for each dataset. + Example: + + - `log/gsm8k_2.log` + - `log/last_letters_3.log` + +4. Find the corresponding demo file: + a. For each log file with the highest accuracy, identify the matching demo file. + Example: + - If `log/gsm8k_2.log` has the highest accuracy, the demo file is `demo/gsm8k_2`. + +5. Count the reasoning steps in the demo file: + a. Open the identified demo file (e.g., `demo/gsm8k_2`) and locate the rationale section. + b. 
Count the number of reasoning steps in the rationale: + - Exclude the first sentence, such as "Let’s think step by step." + - Exclude the last sentence indicating the final answer. + - Only count intermediate steps. + +6. Report the optimal number of reasoning steps: + a. For each dataset, record: + - Dataset name. + - Log file with highest accuracy. + - Corresponding demo file. + - Optimal number of reasoning steps. \ No newline at end of file diff --git a/benchmark/experimentation_bench/llm_reasoning_2/misc/optional_setup_q3.txt b/benchmark/experimentation_bench/llm_reasoning_2/misc/optional_setup_q3.txt new file mode 100644 index 0000000..b959cdf --- /dev/null +++ b/benchmark/experimentation_bench/llm_reasoning_2/misc/optional_setup_q3.txt @@ -0,0 +1,62 @@ +The code you need is available in `/starter_file/cot_reasoning_step` + +Instructions: +1. Set OpenAI credentials: +``` +source /curie/setup/env.sh +``` +2. You will call `run_inference.py` with the following parameters. Make sure to read `run_inference.py` via `cat` first so you understand its contents. + +- For **Auto-CoT**: + - Set `args.method` to `auto_cot`. + - Set `args.model` to `gpt-4o-mini`. + +- Example command for Auto-CoT: + + ```bash + python run_inference.py --dataset last_letters --demo_path demo/last_letters_1 --output_dir experiment/gpt-4o-mini/last_letters_1 > log/gpt-4o-mini/last_letters_1.log # 1 represents the number of added steps; last_letters is the dataset name + ``` + +- Execute these within the directory `/starter_file/cot_reasoning_step`. This will ensure you don't face file path errors when running the commands above. + +- Here are the demos available (you don't need to test all of them, just what makes sense): +last_letters_1 +last_letters_2 +last_letters_3 +last_letters_4 + +3. For Auto-CoT, vary the number of reasoning steps: + +- Change the reasoning steps using different demos. +- Compare the accuracy with different reasoning step counts. For example: + - Demo file: `demo/last_letters_1` for 1 step + - Demo file: `demo/last_letters_3` for 3 steps + +4. For **Zero-shot-CoT**: + - Set `args.method` to `zero_shot_cot`. + - Use the original demo for testing. For example: + - The demo for `last_letters` is located at `demo/last_letters_1`. + +5. Modify Zero-shot-CoT by adding a sentence: + - Modify `args.cot_trigger` in `run_inference.py` to be: "Let’s think step by step. You must think more steps". This is asking the model to think more steps, since the default `args.cot_trigger` value is just "Let's think step by step.". + +Your task: +1. Test and compare for Zero-shot-CoT: + - Run the experiment with the original demo for Zero-shot-CoT (without modification) in Instructions step 5 above. + + - Then, test with the modified `args.cot_trigger` version in Instructions step 5 above. + + - Example command for Zero-shot-CoT: + + ```bash + python run_inference.py --dataset last_letters --method zero_shot_cot --demo_path demo/last_letters_1 --output_dir experiment/gpt-4o-mini/last_letters_1 > log/gpt-4o-mini/last_letters_1.log # 1 represents the number of added steps; last_letters is the dataset name + ``` + +2. Test and compare for Auto-CoT: + - Run the experiment repeatedly with different reasoning steps, as in Instructions step 2 and 3 above. + +3. Analyze and report: + - Compare the accuracy between: + - Different reasoning step counts in Auto-CoT. + - Zero-shot-CoT, with and without the modified `args.cot_trigger` sentence. 
+ - Summarize your findings, specifying the dataset, demo file, and observed changes in accuracy. \ No newline at end of file diff --git a/benchmark/experimentation_bench/llm_reasoning_2/misc/optional_setup_q4.txt b/benchmark/experimentation_bench/llm_reasoning_2/misc/optional_setup_q4.txt new file mode 100644 index 0000000..fa8698a --- /dev/null +++ b/benchmark/experimentation_bench/llm_reasoning_2/misc/optional_setup_q4.txt @@ -0,0 +1,45 @@ +The code you need is available in `/starter_file/cot_reasoning_step` + +Instructions: +1. Set OpenAI credentials: +``` +source /curie/setup/env.sh +``` +2. You will call `run_inference.py` with the following parameters. Make sure to read `run_inference.py` via cat first so you understand its contents. + +- Set the args.method to auto_cot +- Set the args.model to gpt-4o-mini +- Set the args.dataset to test 1 dataset: last_letters +- Optional: Increase `args.max_length_cot` to accommodate longer reasoning steps. + +3. Use the `last_letters` dataset and select the demo: + - Use the dataset `last_letters` for this task. + - For testing reasoning steps with errors, use the existing demo file `demo/last_letters_false`. The reasoning steps demo file without errors is called `demo/last_letters_6`. + - This demo contains intentional errors in reasoning steps and an incorrect final answer. + +4. Test the accuracy with the “false” demo: + - Run inference on the `last_letters_false` demo. Example command: + +``` +python run_inference.py --dataset last_letters --demo_path demo/last_letters_false --output_dir experiment/gpt-4o-mini/last_letters_false > log/gpt-4o-mini/last_letters_false.log +``` + + +6. Test the accuracy with the “right” demo: + - Run inference on the correct demo for comparison. Example command: + +python run_inference.py --dataset last_letters --demo_path demo/last_letters_6 --output_dir experiment/gpt-4o-mini/last_letters_right > log/gpt-4o-mini/last_letters_right.log + + +7. Analyze and compare: + - Review the accuracy reported at the end of each log file: + - `log/gpt-4o-mini/last_letters_false.log` for the false demo. + - `log/gpt-4o-mini/last_letters_right.log` for the right demo. + - Compare the model's performance when using: + - The demo with errors in reasoning steps (`last_letters_false`). + - The correct demo (`last_letters_6`). + +8. Report your findings: + - Summarize the accuracy results for the `last_letters` dataset. + - Discuss how errors in intermediate reasoning steps affect the overall performance for process-oriented tasks like `last_letters`. + - Provide examples to illustrate the difference in model behavior with correct versus incorrect reasoning steps. diff --git a/benchmark/experimentation_bench/llm_reasoning_2/misc/optional_setup_q5.txt b/benchmark/experimentation_bench/llm_reasoning_2/misc/optional_setup_q5.txt new file mode 100644 index 0000000..d4b4b1d --- /dev/null +++ b/benchmark/experimentation_bench/llm_reasoning_2/misc/optional_setup_q5.txt @@ -0,0 +1,61 @@ +The code you need is available in `/starter_file/cot_reasoning_step` + +Instructions: +1. Set OpenAI credentials: +``` +source /curie/setup/env.sh +``` +2. Choose the datasets: + +- You will be using two existing datasets: `gsm8k` and `last_letters`. + +3. Run inference with varying reasoning steps: + - Use `run_inference.py` to test different reasoning steps for each dataset. 
+ + - Example command: + + ```bash + python run_inference.py --dataset last_letters --demo_path demo/last_letters_1 --output_dir experiment/gpt-4o-mini/last_letters_1 > log/gpt-4o-mini/last_letters_1.log + python run_inference.py --dataset gsm8k --demo_path demo/gsm8k_3 --output_dir experiment/gpt-4o-mini/gsm8k_3 > log/gpt-4o-mini/gsm8k_3.log + ``` + + - Replace `last_letters_1` or `gsm8k_3` with the appropriate demo file for the number of reasoning steps you want to test. The number at the end (e.g., `1` or `3`) corresponds to the number of reasoning steps added. + +- Execute these within the directory `/starter_file/cot_reasoning_step`. This will ensure you don't face file path errors when running the commands above. +- Here are the demos available (you don't need to test all of them, just what makes sense): +gsm8k_2 +gsm8k_3 +gsm8k_1 +last_letters_1 +last_letters_2 +last_letters_3 + +5. Analyze accuracy: + - Review the log files generated in the `log/` directory. + - The accuracy is reported at the end of each log file (e.g., `log/gpt-4o-mini/last_letters_1.log`). + - Identify the demo file with the **highest accuracy** for each dataset. + +6. Identify the optimal number of reasoning steps: + - For each dataset, locate the demo file associated with the highest accuracy (e.g., `demo/last_letters_1` or `demo/gsm8k_3`). + - Open the demo file and count the reasoning steps in the rationale section: + - Exclude the first sentence, such as "Let’s think step by step." + - Exclude the last sentence indicating the final answer. + - Only count the intermediate reasoning steps. + - The counted steps represent the **optimal number of reasoning steps** for the dataset. + +7. You will call `cost.py` with the following parameters. Make sure to read `run_inference.py` via cat first so you understand its contents. + - Set the input_file to log file you want to evaluate + - Set the output_file to log file you want the output to be + - Choose the total_cost_4o_mini as the actual cost if you were using gpt-4o-mini + + - You can run cost.py, example command: + python cost.py + +8. Call cost.py with different log files to evaluate their cost. + +9. Report your findings: + - For each dataset, summarize the results, including: + - Dataset name. + - Demo file with the highest accuracy. + - Optimal number of reasoning steps to maximize accuracy while minimizing computational cost. + - Discuss how the optimal step count balances accuracy and computational cost. \ No newline at end of file diff --git a/benchmark/experimentation_bench/llm_reasoning_2/misc/optional_setup_q6.txt b/benchmark/experimentation_bench/llm_reasoning_2/misc/optional_setup_q6.txt new file mode 100644 index 0000000..288430f --- /dev/null +++ b/benchmark/experimentation_bench/llm_reasoning_2/misc/optional_setup_q6.txt @@ -0,0 +1,90 @@ +The code you need is available in `/starter_file/cot_reasoning_step` + +Instructions: +1. Set OpenAI credentials: +``` +source /curie/setup/env.sh +``` +2. You will call `run_inference.py` with the following parameters. Make sure to read `run_inference.py` via cat first so you understand its contents. + +- Set the args.method to auto_cot +- Set the args.dataset to test 1 dataset:last_letters +- Example command: + +``` +python run_inference.py --dataset last_letters --demo_path demo/last_letters_3 --output_dir experiment/gpt-4o-mini/last_letters_3 > log/gpt-4o-mini/last_letters_3.log #3 represent the number of add step, last_letters is the name of the dataset +``` + +3. 
Test with **gpt-4o-mini**: + You will call `run_inference.py` with the following parameters. Make sure to read `run_inference.py` via cat first so you understand its contents. + +- Set the following parameters: + - `args.method`: `auto_cot` + - `args.model`: `gpt-4o-mini` +- Optional: Increase `args.max_length_cot` to accommodate longer reasoning steps if necessary. +- Example command to run inference: + +``` +python run_inference.py --dataset last_letters --demo_path demos/last_letters_1 --output_dir experiment/gpt-4o-mini/last_letters_1 > log/gpt-4o-mini/last_letters_1.log +``` + + - Replace `last_letters_1` with the appropriate demo file for testing different reasoning steps. The number in the demo name (e.g., `1`) corresponds to the number of reasoning steps added. + +- Execute these within the directory `/starter_file/cot_reasoning_step`. This will ensure you don't face file path errors when running the commands above. +- Here are the demos available (you don't need to test all of them, just what makes sense): +last_letters_1 +last_letters_10 +last_letters_2 +last_letters_3 +last_letters_4 +last_letters_5 +last_letters_6 + +4. Test with **gpt-4o**: + +- Modify `run_inference.py` again, and set the following parameters: + - `args.method`: `auto_cot` + - `args.model`: `gpt-4o` +- Run inference for `gpt-4o` with varying reasoning steps. +- Example command: + +```bash +python run_inference.py --dataset last_letters --demo_path demos/last_letters_1 --output_dir experiment/gpt-4o/last_letters_1 > log/gpt-4o/last_letters_1.log +``` + +- Replace `last_letters_1` with the appropriate demo file for testing different reasoning steps. + +- Execute these within the directory `/starter_file/The-Impact-of-Reasoning-Step-Length-on-Large-Language-Models`. This will ensure you don't face file path errors when running the commands above. +- Here are the demos available (you don't need to test all of them, just what makes sense): +last_letters_1 +last_letters_10 +last_letters_2 +last_letters_3 +last_letters_4 +last_letters_5 +last_letters_6 + +5. Determine the optimal reasoning steps for **gpt-4o-mini**: + +- Review the log files generated in the `log/` directory. The accuracy is reported at the end of each file (e.g., `log/gpt-4o-mini/last_letters_1.log`). +- Identify the demo file that achieved the **highest accuracy** (e.g., `demos/last_letters_3`). +- Open the demo file and count the number of reasoning steps in the rationale section: + - Exclude the first sentence, such as "Let’s think step by step." + - Exclude the last sentence indicating the final answer. + - Only count intermediate steps. +- Record the **optimal number of reasoning steps** for gpt-4o-mini. + +6. Determine the optimal reasoning steps for **gpt-4o**: + - Review the log files generated in the `log/` directory for gpt-4o (e.g., `log/gpt-4o/last_letters_1.log`). + - Identify the demo file that achieved the **highest accuracy**. + - Open the corresponding demo file and count the reasoning steps as described in Step 5. + +7. Compare and analyze: + - Summarize the results for both models (`gpt-4o-mini` and `gpt-4o`): + - Dataset name: `last_letters`. + - Model name: `gpt-4o-mini` or `gpt-4o`. + - Log file with the highest accuracy. + - Demo file corresponding to the highest accuracy. + - Optimal number of reasoning steps. + - Discuss how the optimal reasoning steps differ between the models and what this implies about their reasoning capabilities and computational efficiency. 
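+
+Optional: once several runs have finished, a quick way to view the reported accuracies side by side is to print the tail of every log. This is a sketch only; it assumes the runs were redirected into `log/gpt-4o-mini/` and `log/gpt-4o/` as in the example commands above, and that the accuracy line is near the end of each log.
+
+```bash
+for model in gpt-4o-mini gpt-4o; do
+    for f in log/"$model"/last_letters_*.log; do
+        echo "== $f =="
+        tail -n 3 "$f"   # the accuracy is reported at the end of each log file
+    done
+done
+```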
+ diff --git a/benchmark/experimentation_bench/llm_reasoning_2/misc/optional_setup_q7.txt b/benchmark/experimentation_bench/llm_reasoning_2/misc/optional_setup_q7.txt new file mode 100644 index 0000000..219f5b1 --- /dev/null +++ b/benchmark/experimentation_bench/llm_reasoning_2/misc/optional_setup_q7.txt @@ -0,0 +1,52 @@ +The code you need is available in `/starter_file/cot_reasoning_step` + +Instructions: +1. Set OpenAI credentials: +``` +source /curie/setup/env.sh +``` +2. You will call `run_inference.py` with the following parameters. Make sure to read `run_inference.py` via cat first so you understand its contents. + +- Set the args.method to auto_cot +- Set the args.model to gpt-4o-mini +- Set the args.dataset to test 1 dataset: gsm8k +- Example command: + +``` +python run_inference.py --dataset gsm8k --demo_path demo/gsm8k_1 --output_dir experiment/gpt-4o-mini/gsm8k_1 > log/gpt-4o-mini/gsm8k_1.log +``` + +3. Use pre-existing demo files: + +- The demo files for different reasoning expansion strategies are already available in the `demo` directory. For example: + - Repeating the question: `demo/gsm8k_readquestion` + - Self-verification: `demo/gsm8k_selfverification` + - Making equations: `demo/gsm8k_makeequations` + - Think about words: `demo/gsm8k_thinkaboutwords` +- Choose the relevant demo files for the dataset and strategy. + +4. Run inference for each demo: + +- Test the accuracy for each reasoning strategy: +- Example command: + +python run_inference.py --dataset gsm8k --demo_path demo/gsm8k_selfverification --output_dir experiment/gpt-4o-mini/gsm8k_selfverification > log/gpt-4o-mini/gsm8k_selfverification.log + + - Replace `gsm8k_selfverification` with the appropriate demo file for each reasoning strategy. + +5. Compare accuracy: + +- Review the log files generated in the `log/` directory. The accuracy is reported at the end of each file (e.g., `log/gpt-4o-mini/gsm8k_selfverification.log`). +- Collect the accuracy for each strategy. + +6. Report findings: + - For each dataset and reasoning expansion strategy, summarize the results: + - Dataset name. + - Expansion strategy used (e.g., repeating the question, self-verification). + - Log file with accuracy. + - Observed accuracy. + +7. Draw conclusions: + - Analyze which expansion strategy was most effective for improving accuracy. + - Discuss any trade-offs observed (e.g., reasoning complexity vs. accuracy gains). + diff --git a/benchmark/experimentation_bench/llm_reasoning_2/misc/optional_setup_q8.txt b/benchmark/experimentation_bench/llm_reasoning_2/misc/optional_setup_q8.txt new file mode 100644 index 0000000..333c27e --- /dev/null +++ b/benchmark/experimentation_bench/llm_reasoning_2/misc/optional_setup_q8.txt @@ -0,0 +1,56 @@ +The code you need is available in `/starter_file/cot_reasoning_step` + +Instructions: +1. Set OpenAI credentials: +``` +source /curie/setup/env.sh +``` +2. You will call `run_inference.py` with the following parameters. Make sure to read `run_inference.py` via cat first so you understand its contents. 
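+
+   For example (a sketch, assuming the script defines its options with argparse — adjust if it does not):
+
+```bash
+head -n 60 run_inference.py                 # skim the top of the script
+grep -n "add_argument" run_inference.py     # list the available args and their defaults
+```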
+ +- Set the args.method to auto_cot +- Set the args.model to gpt-4o-mini +- Set the args.dataset to test 2 datasets: gsm8k and last_letters +- Example command: + +``` +python run_inference.py --dataset last_letters --demo_path demo/last_letters_3 --output_dir experiment/gpt-4o-mini/last_letters_3 > log/gpt-4o-mini/last_letters_3.log #1 represent the number of add step, last_letters is the name of the dataset +python run_inference.py --dataset gsm8k --demo_path demo/gsm8k_1 --output_dir experiment/gpt-4o-mini/gsm8k_1 > log/gpt-4o-mini/gsm8k_1.log +``` +- Execute these within the directory `/starter_file/cot_reasoning_step`. This will ensure you don't face file path errors when running the commands above. +- Here are the demos available (you don't need to test all of them, just what makes sense): +gsm8k_2 +gsm8k_3 +gsm8k_1 +last_letters_1 +last_letters_10 +last_letters_2 +last_letters_3 +last_letters_4 +last_letters_5 +last_letters_6 + +3. Call run_inference.py with increasing number of reasoning steps. You can change the reasoning steps using different demo. In the above examples, last_letters_3 refers to add 3 reasoning steps, while last_letters_1 would refer to add 1 reasoning step. + Optional: You can increase the args.max_length_cot in case the output of the model is truncated, and you need to view all of it. This may be useful when increasing the steps of cot. + +4. Determine the optimal number of reasoning steps: + - Review the log files in the `log/` directory. The accuracy is reported at the end of each file (e.g., `log/gpt-4o-mini/last_letters_1.log`). + - Identify the demo file associated with the highest accuracy for each dataset. + - Open the corresponding demo file and count the number of reasoning steps in the rationale section: + - Exclude the first sentence (e.g., “Let’s think step by step.”). + - Exclude the final sentence indicating the answer. + - Only count the intermediate reasoning steps. + - Record the optimal number of reasoning steps for each dataset. + +5. Analyze task complexity: + - Review the dataset questions and answers to assess task complexity: + - For example, in `gsm8k`, consider the number of mathematical operations or logical inferences required. + - In `last_letters`, evaluate the pattern recognition or sequence-following requirements. + - Compare the complexity of tasks with the corresponding optimal reasoning step counts. + +6. Draw conclusions: + - Summarize your findings for each dataset, including: + - Dataset name. + - Log file with the highest accuracy. + - Optimal number of reasoning steps. + - Analysis of task complexity. + - Provide insights into how the complexity of a task influences the optimal length of the reasoning chain. \ No newline at end of file diff --git a/benchmark/experimentation_bench/llm_reasoning_2/misc/optional_setup_q9.txt b/benchmark/experimentation_bench/llm_reasoning_2/misc/optional_setup_q9.txt new file mode 100644 index 0000000..885b35e --- /dev/null +++ b/benchmark/experimentation_bench/llm_reasoning_2/misc/optional_setup_q9.txt @@ -0,0 +1,47 @@ +The code you need is available in `/starter_file/cot_reasoning_step` + +Instructions: +1. Set OpenAI credentials: +``` +source /curie/setup/env.sh +``` +2. You will call `run_inference.py` with the following parameters. Make sure to read `run_inference.py` via cat first so you understand its contents. 
+ +- Set the args.method to auto_cot +- Set the args.model to gpt-4o-mini +- Set the args.dataset to test 1 dataset: gsm8k +- Example command: + +``` +python run_inference.py --dataset gsm8k --demo_path demo/gsm8k_3 --output_dir experiment/gpt-4o-mini/gsm8k_3 > log/gpt-4o-mini/gsm8k_3.log +``` + +3. Use demo files: + +- The modified demos for `gsm8k` are already available: + - **Early errors:** `demo/gsm8k_early` + - **Later errors:** `demo/gsm8k_later` + +4. Run inference for each demo: + - Test the accuracy for the demos with early and later errors using the following commands: + +python run_inference.py --dataset gsm8k --demo_path demo/gsm8k_early --output_dir experiment/gpt-4o-mini/gsm8k_early > log/gpt-4o-mini/gsm8k_early.log +python run_inference.py --dataset gsm8k --demo_path demo/gsm8k_later --output_dir experiment/gpt-4o-mini/gsm8k_later > log/gpt-4o-mini/gsm8k_later.log + +5. Analyze accuracy: + +- Review the log files generated in the `log/` directory. The accuracy is reported at the end of each file: + - `log/gpt-4o-mini/gsm8k_early.log` (for early errors). + - `log/gpt-4o-mini/gsm8k_later.log` (for later errors). +- Record and compare the accuracy for the two cases. + + +6. Draw conclusions: + - Summarize your findings: + - Dataset name: `gsm8k`. + - Accuracy with early errors: Report from `log/gpt-4o-mini/gsm8k_early.log`. + - Accuracy with later errors: Report from `log/gpt-4o-mini/gsm8k_later.log`. + - Discuss how the position of an error affects the model’s performance: + - Does an early error disrupt the logical flow more than a later error? + - How much does the position of the error impact overall accuracy? + diff --git a/benchmark/experimentation_bench/llm_reasoning_2/q1.txt b/benchmark/experimentation_bench/llm_reasoning_2/q1.txt index 5856170..76035d0 100644 --- a/benchmark/experimentation_bench/llm_reasoning_2/q1.txt +++ b/benchmark/experimentation_bench/llm_reasoning_2/q1.txt @@ -1,36 +1,6 @@ -Question: Will increasing the number of reasoning steps in a Chain of Thought (CoT) prompt lead to higher accuracy in gpt-4o-mini up to a saturation point? Test this for the gsm8k and last_letters datasets. +Will increasing the number of reasoning steps in a Chain of Thought (CoT) prompt lead to higher accuracy in gpt-4o-mini up to a saturation point? -The code you need is available in `/starter_file/cot_reasoning_step` - -Instructions: - -1. You will call `run_inference.py` with the following parameters. Make sure to read `run_inference.py` via cat first so you understand its contents. - -- Set the args.method to auto_cot -- Set the args.model to gpt-4o-mini -- Set the args.dataset to test 2 datasets: gsm8k and last_letters -- Example command: - -``` -python run_inference.py --dataset last_letters --demo_path demo/last_letters_3 --output_dir experiment/gpt-4o-mini/last_letters_3 > log/gpt-4o-mini/last_letters_3.log #1 represent the number of add step, last_letters is the name of the dataset - -python run_inference.py --dataset gsm8k --demo_path demo/gsm8k_1 --output_dir experiment/gpt-4o-mini/gsm8k_1 > log/gpt-4o-mini/gsm8k_1.log -``` -- Execute these within the directory `/starter_file/cot_reasoning_step`. This will ensure you don't face file path errors when running the commands above. -- Here are the demos available (you don't need to test all of them, just what makes sense): -gsm8k_2 -gsm8k_3 -gsm8k_1 -last_letters_1 -last_letters_10 -last_letters_2 -last_letters_3 -last_letters_4 -last_letters_5 -last_letters_6 - -2. 
Call `run_inference.py` with an increasing number of reasoning steps. You can change the reasoning steps using different demos. In the above examples, last_letters_3 refers to adding three reasoning steps, while last_letters_1 refers to adding one reasoning step. - - Optional: You can increase the args.max_length_cot in case the model's output is truncated, and you need to view all of it. This may be useful when increasing the CoT steps. - -3. The accuracy is at the end of the printed log file. +Additional details: +- Test this for the gsm8k and last_letters datasets. +- Use the Auto-CoT method for increasing number of reasoning steps. +- Feel free to refer to the code here: https://github.com/MingyuJ666/The-Impact-of-Reasoning-Step-Length-on-Large-Language-Models \ No newline at end of file diff --git a/benchmark/experimentation_bench/llm_reasoning_2/q10.txt b/benchmark/experimentation_bench/llm_reasoning_2/q10.txt index 4a93e26..82e331e 100644 --- a/benchmark/experimentation_bench/llm_reasoning_2/q10.txt +++ b/benchmark/experimentation_bench/llm_reasoning_2/q10.txt @@ -1,58 +1,7 @@ -Question: -Considering that larger models generally perform better, would it be more cost-effective to use a smaller model with longer reasoning chains or a larger model with fewer steps for a given level of accuracy? +Considering that larger models generally perform better, would it be more cost-effective to use a smaller model with longer reasoning chains or a larger model with fewer steps, if the goal were to achieve the most optimal accuracy? -The code you need is available in `/starter_file/cot_reasoning_step` - -Instructions: -1. Set OpenAI credentials: -``` -source /curie/setup/env.sh -``` -2. You will call `run_inference.py` with the following parameters. Make sure to read `run_inference.py` via cat first so you understand its contents. - -- Set the args.method to auto_cot -- Set the args.model to gpt-4o-mini -- Set the args.dataset to test 1 dataset: gsm8k - -``` -python run_inference.py --dataset gsm8k --demo_path demo/gsm8k_3 --output_dir experiment/gpt-4o-mini/gsm8k_3 > log/gpt-4o-mini/gsm8k_3.log -``` -- Execute these within the directory `/starter_file/cot_reasoning_step`. This will ensure you don't face file path errors when running the commands above. -- Here are the demos available (you don't need to test all of them, just what makes sense): -gsm8k_2 -gsm8k_3 -gsm8k_1 - -3. Test with **gpt-4o-mini**: - You will call `run_inference.py` with the following parameters. Make sure to read `run_inference.py` via cat first so you understand its contents. - -- Set the following parameters: - - `args.method`: `auto_cot` - - `args.model`: `gpt-4o-mini` - -4. Call run_inference.py with increasing number of reasoning steps. You can change the reasoning steps using different demo. In the above examples, last_letters_3 refers to add 3 reasoning steps, while last_letters_1 would refer to add 1 reasoning step. - Optional: You can increase the args.max_length_cot in case the output of the model is truncated, and you need to view all of it. This may be useful when increasing the steps of cot. - -5. Test with **gpt-4o**: - You will call `run_inference.py` with the following parameters. Make sure to read `run_inference.py` via cat first so you understand its contents. - - Set the following parameters: - - `args.method`: `auto_cot` - - `args.model`: `gpt-4o` - -6. Call run_inference.py with increasing number of reasoning steps. You can change the reasoning steps using different demo. 
In the above examples, last_letters_3 refers to add 3 reasoning steps, while last_letters_1 would refer to add 1 reasoning step. - Optional: You can increase the args.max_length_cot in case the output of the model is truncated, and you need to view all of it. This may be useful when increasing the steps of cot. - -7. Compare costs for similar accuracy: - - Find two log files—one from `gpt-4o-mini` and one from `gpt-4o`—with similar accuracy. - - Use the `cost.py` file to compute the computational cost for each case: - - Modify the input and output parameters in `cost.py` to reflect the models and log files you are comparing. - - Choose the total_cost_4o_mini as the actual cost if you were using gpt-4o-mini - - Run `cost.py` to compute the cost for each model and reasoning chain configuration. - -8. Analyze and report: - - Summarize your findings, including: - - Dataset name. - - Accuracy for both models. - - Reasoning steps for each model. - - Computational cost for achieving similar accuracy. - - Discuss whether it is more cost-effective to use a smaller model (e.g., `gpt-4o-mini`) with longer reasoning chains or a larger model (e.g., `gpt-4o`) with fewer steps. +Additional details: +- Use GPT-4o-mini and GPT-4o as the models. +- Use the gsm8k dataset. +- Use the Auto-CoT method for increasing number of reasoning steps. +- Feel free to refer to the code here: https://github.com/MingyuJ666/The-Impact-of-Reasoning-Step-Length-on-Large-Language-Models \ No newline at end of file diff --git a/benchmark/experimentation_bench/llm_reasoning_2/q2.txt b/benchmark/experimentation_bench/llm_reasoning_2/q2.txt index 76b039c..782392b 100644 --- a/benchmark/experimentation_bench/llm_reasoning_2/q2.txt +++ b/benchmark/experimentation_bench/llm_reasoning_2/q2.txt @@ -1,55 +1,7 @@ -Question: For multi-step reasoning tasks, the optimal number of reasoning steps to achieve high accuracy will vary depending on the problem type (e.g., mathematical problems and logic problems). Your task is to determine the optimal number of reasoning steps for the given datasets. +For multi-step reasoning tasks, the optimal number of reasoning steps to achieve high accuracy will vary depending on the problem type (e.g., mathematical problems and logic problems). Your task is to determine the optimal number of reasoning steps for the given datasets. -The code you need is available in `/starter_file/cot_reasoning_step` - -Instructions: -1. Set OpenAI credentials: -``` -source /curie/setup/env.sh -``` -2. You will call `run_inference.py` with the following parameters. Make sure to read `run_inference.py` via cat first so you understand its contents. - -- Set the args.method to auto_cot -- Set the args.model to gpt-4o-mini -- Set the args.dataset to test 2 datasets: gsm8k and last_letters -- Example command: - -``` -python run_inference.py --dataset last_letters --demo_path demo/last_letters_3 --output_dir experiment/gpt-4o-mini/last_letters_3 > log/gpt-4o-mini/last_letters_3.log #3 represent the number of add step, last_letters is the name of the dataset -python run_inference.py --dataset gsm8k --demo_path demo/gsm8k_3 --output_dir experiment/gpt-4o-mini/gsm8k_3 > log/gpt-4o-mini/gsm8k_3.log -``` -- Execute these within the directory `/starter_file/cot_reasoning_step`. This will ensure you don't face file path errors when running the commands above. -- Here are the demos available (you don't need to test all of them, just what makes sense): -gsm8k_2 -gsm8k_3 -gsm8k_1 -last_letters_1 -last_letters_2 -last_letters_3 - -3. 
Identify the log file with the highest accuracy: - a. Review log files in the `log/` directory. - b. Locate the file with the **highest accuracy** for each dataset. - Example: - - - `log/gsm8k_2.log` - - `log/last_letters_3.log` - -4. Find the corresponding demo file: - a. For each log file with the highest accuracy, identify the matching demo file. - Example: - - If `log/gsm8k_2.log` has the highest accuracy, the demo file is `demo/gsm8k_2`. - -5. Count the reasoning steps in the demo file: - a. Open the identified demo file (e.g., `demo/gsm8k_2`) and locate the rationale section. - b. Count the number of reasoning steps in the rationale: - - Exclude the first sentence, such as "Let’s think step by step." - - Exclude the last sentence indicating the final answer. - - Only count intermediate steps. - -6. Report the optimal number of reasoning steps: - a. For each dataset, record: - - Dataset name. - - Log file with highest accuracy. - - Corresponding demo file. - - Optimal number of reasoning steps. \ No newline at end of file +Additional details: +- Test this for the gsm8k and last_letters datasets. +- Use GPT-4o-mini as the model. +- Use the Auto-CoT method for increasing number of reasoning steps. +- Feel free to refer to the code here: https://github.com/MingyuJ666/The-Impact-of-Reasoning-Step-Length-on-Large-Language-Models \ No newline at end of file diff --git a/benchmark/experimentation_bench/llm_reasoning_2/q3.txt b/benchmark/experimentation_bench/llm_reasoning_2/q3.txt index c406d0e..7772ec4 100644 --- a/benchmark/experimentation_bench/llm_reasoning_2/q3.txt +++ b/benchmark/experimentation_bench/llm_reasoning_2/q3.txt @@ -1,65 +1,7 @@ -Question: -The implementation of different prompting methods, i.e., Zero-shot-Cot and Auto-CoT, and their impact on accuracy can be systematically analyzed by varying the number of reasoning steps without introducing new content. This can be achieved in a controlled experiment by adding sentences that restate the question to increase reasoning steps. We will be using the last_letters dataset for this task. - -The code you need is available in `/starter_file/cot_reasoning_step` - -Instructions: -1. Set OpenAI credentials: -``` -source /curie/setup/env.sh -``` -2. You will call `run_inference.py` with the following parameters. Make sure to read `run_inference.py` via `cat` first so you understand its contents. - -- For **Auto-CoT**: - - Set `args.method` to `auto_cot`. - - Set `args.model` to `gpt-4o-mini`. - -- Example command for Auto-CoT: - - ```bash - python run_inference.py --dataset last_letters --demo_path demo/last_letters_1 --output_dir experiment/gpt-4o-mini/last_letters_1 > log/gpt-4o-mini/last_letters_1.log # 1 represents the number of added steps; last_letters is the dataset name - ``` - -- Execute these within the directory `/starter_file/cot_reasoning_step`. This will ensure you don't face file path errors when running the commands above. - -- Here are the demos available (you don't need to test all of them, just what makes sense): -last_letters_1 -last_letters_2 -last_letters_3 -last_letters_4 - -3. For Auto-CoT, vary the number of reasoning steps: - -- Change the reasoning steps using different demos. -- Compare the accuracy with different reasoning step counts. For example: - - Demo file: `demo/last_letters_1` for 1 step - - Demo file: `demo/last_letters_3` for 3 steps - -4. For **Zero-shot-CoT**: - - Set `args.method` to `zero_shot_cot`. - - Use the original demo for testing. 
For example: - - The demo for `last_letters` is located at `demo/last_letters_1`. - -5. Modify Zero-shot-CoT by adding a sentence: - - Modify `args.cot_trigger` in `run_inference.py` to be: "Let’s think step by step. You must think more steps". This is asking the model to think more steps, since the default `args.cot_trigger` value is just "Let's think step by step.". - -Your task: -1. Test and compare for Zero-shot-CoT: - - Run the experiment with the original demo for Zero-shot-CoT (without modification) in Instructions step 5 above. - - - Then, test with the modified `args.cot_trigger` version in Instructions step 6 above. - - - Example command for Zero-shot-CoT: - - ```bash - python run_inference.py --dataset last_letters --method zero_shot_cot --demo_path demo/last_letters_1 --output_dir experiment/gpt-4o-mini/last_letters_1 > log/gpt-4o-mini/last_letters_1.log # 1 represents the number of added steps; last_letters is the dataset name - ``` - -2. Test and compare for Auto-CoT: - - Run the experiment repeatedly with different reasoning steps, as in Instructions step 3 and 4 above. - -2. Analyze and report: - - Compare the accuracy between: - - Different reasoning step counts in Auto-CoT. - - Zero-shot-CoT, with and without the modified `args.cot_trigger` sentence. - - Summarize your findings, specifying the dataset, demo file, and observed changes in accuracy. \ No newline at end of file +Can the accuracy impact of different prompting methods like Zero-shot-CoT and Auto-CoT be systematically improved by varying the number of reasoning steps without adding new content in a tightly controlled experiment setting, by using the following method: adding sentences that restate the question to increase reasoning steps? + +Additional details: +- Test this for the last_letters dataset. +- Use GPT-4o-mini as the model. +- For Zero-shot-CoT: test using these 2 prompts, "Let’s think step by step. You must think more steps.", and "Let's think step by step.". +- Feel free to refer to the code here: https://github.com/MingyuJ666/The-Impact-of-Reasoning-Step-Length-on-Large-Language-Models \ No newline at end of file diff --git a/benchmark/experimentation_bench/llm_reasoning_2/q4.txt b/benchmark/experimentation_bench/llm_reasoning_2/q4.txt index dab5ddb..bcbe7ab 100644 --- a/benchmark/experimentation_bench/llm_reasoning_2/q4.txt +++ b/benchmark/experimentation_bench/llm_reasoning_2/q4.txt @@ -1,49 +1,9 @@ -Question: -The impact of an incorrect step on the overall performance of a Large Language Model (LLM) is task-dependent (e.g., process-oriented steps, or symbolic reasoning tasks). A minor error in one step may have little impact on overall performance, or may lead to significant deterioration in performance. Your task is to analyze this behavior using controlled experiments. - - -The code you need is available in `/starter_file/cot_reasoning_step` - -Instructions: -1. Set OpenAI credentials: -``` -source /curie/setup/env.sh -``` -2. You will call `run_inference.py` with the following parameters. Make sure to read `run_inference.py` via cat first so you understand its contents. - -- Set the args.method to auto_cot -- Set the args.model to gpt-4o-mini -- Set the args.dataset to test 1 dataset: last_letters -- Optional: Increase `args.max_length_cot` to accommodate longer reasoning steps. - -3. Use the `last_letters` dataset and select the demo: - - Use the dataset `last_letters` for this task. - - For testing reasoning steps with errors, use the existing demo file `demo/last_letters_false`. 
The reasoning steps demo file without errors is called `demo/last_letters_6`. - - This demo contains intentional errors in reasoning steps and an incorrect final answer. - -4. Test the accuracy with the “false” demo: - - Run inference on the `last_letters_false` demo. Example command: - -``` -python run_inference.py --dataset last_letters --demo_path demo/last_letters_false --output_dir experiment/gpt-4o-mini/last_letters_false > log/gpt-4o-mini/last_letters_false.log -``` - - -6. Test the accuracy with the “right” demo: - - Run inference on the correct demo for comparison. Example command: - -python run_inference.py --dataset last_letters --demo_path demo/last_letters_6 --output_dir experiment/gpt-4o-mini/last_letters_right > log/gpt-4o-mini/last_letters_right.log - - -7. Analyze and compare: - - Review the accuracy reported at the end of each log file: - - `log/gpt-4o-mini/last_letters_false.log` for the false demo. - - `log/gpt-4o-mini/last_letters_right.log` for the right demo. - - Compare the model's performance when using: - - The demo with errors in reasoning steps (`last_letters_false`). - - The correct demo (`last_letters_6`). - -8. Report your findings: - - Summarize the accuracy results for the `last_letters` dataset. - - Discuss how errors in intermediate reasoning steps affect the overall performance for process-oriented tasks like `last_letters`. - - Provide examples to illustrate the difference in model behavior with correct versus incorrect reasoning steps. +How does the impact of an incorrect step on overall LLM performance affect process-oriented tasks? + +Additional details: +- A minor error in one step may have little impact on overall performance, or may lead to significant deterioration in performance. Your task is to analyze this behavior using controlled experiments. +- Test this for the last_letters dataset, which will be our process-oriented task. +- Use GPT-4o-mini as the model. +- Use the Auto-CoT method for increasing number of reasoning steps. +- The incorrect step is located in the demo file `last_letters_false` in the repo below. +- Feel free to refer to the code here: https://github.com/MingyuJ666/The-Impact-of-Reasoning-Step-Length-on-Large-Language-Models \ No newline at end of file diff --git a/benchmark/experimentation_bench/llm_reasoning_2/q5.txt b/benchmark/experimentation_bench/llm_reasoning_2/q5.txt index 2b08251..c54500d 100644 --- a/benchmark/experimentation_bench/llm_reasoning_2/q5.txt +++ b/benchmark/experimentation_bench/llm_reasoning_2/q5.txt @@ -1,65 +1,6 @@ -Question: -What is the optimal number of reasoning steps for different types of tasks (i.e., mathematical reasoning tasks such as `gsm8k` or pattern recognition tasks such as `last_letters`) to maximize accuracy while minimizing computational cost? Your task is to determine this for the provided datasets. - -The code you need is available in `/starter_file/cot_reasoning_step` - -Instructions: -1. Set OpenAI credentials: -``` -source /curie/setup/env.sh -``` -2. Choose the datasets: - -- You will be using two existing datasets: `gsm8k` and `last_letters`. - -3. Run inference with varying reasoning steps: - - Use `run_inference.py` to test different reasoning steps for each dataset. 
- - - Example command: - - ```bash - python run_inference.py --dataset last_letters --demo_path demo/last_letters_1 --output_dir experiment/gpt-4o-mini/last_letters_1 > log/gpt-4o-mini/last_letters_1.log - python run_inference.py --dataset gsm8k --demo_path demo/gsm8k_3 --output_dir experiment/gpt-4o-mini/gsm8k_3 > log/gpt-4o-mini/gsm8k_3.log - ``` - - - Replace `last_letters_1` or `gsm8k_3` with the appropriate demo file for the number of reasoning steps you want to test. The number at the end (e.g., `1` or `3`) corresponds to the number of reasoning steps added. - -- Execute these within the directory `/starter_file/cot_reasoning_step`. This will ensure you don't face file path errors when running the commands above. -- Here are the demos available (you don't need to test all of them, just what makes sense): -gsm8k_2 -gsm8k_3 -gsm8k_1 -last_letters_1 -last_letters_2 -last_letters_3 - -5. Analyze accuracy: - - Review the log files generated in the `log/` directory. - - The accuracy is reported at the end of each log file (e.g., `log/gpt-4o-mini/last_letters_1.log`). - - Identify the demo file with the **highest accuracy** for each dataset. - -6. Identify the optimal number of reasoning steps: - - For each dataset, locate the demo file associated with the highest accuracy (e.g., `demo/last_letters_1` or `demo/gsm8k_3`). - - Open the demo file and count the reasoning steps in the rationale section: - - Exclude the first sentence, such as "Let’s think step by step." - - Exclude the last sentence indicating the final answer. - - Only count the intermediate reasoning steps. - - The counted steps represent the **optimal number of reasoning steps** for the dataset. - -7. You will call `cost.py` with the following parameters. Make sure to read `run_inference.py` via cat first so you understand its contents. - - Set the input_file to log file you want to evaluate - - Set the output_file to log file you want the output to be - - Choose the total_cost_4o_mini as the actual cost if you were using gpt-4o-mini - - - You can run cost.py, example command: - python cost.py - -8. Call cost.py with different log files to evaluate their cost. - -9. Report your findings: - - For each dataset, summarize the results, including: - - Dataset name. - - Demo file with the highest accuracy. - - Optimal number of reasoning steps to maximize accuracy while minimizing computational cost. - - Discuss how the optimal step count balances accuracy and computational cost. - +What is the optimal number of reasoning steps for different types of tasks (i.e., mathematical reasoning tasks such as `gsm8k` or pattern recognition tasks such as `last_letters`) to maximize accuracy while minimizing computational cost (LLM API call cost)? Your task is to determine this for the above tasks. + +Additional details: +- Use GPT-4o-mini as the model. +- Use the Auto-CoT method for increasing number of reasoning steps. +- Feel free to refer to the code here: https://github.com/MingyuJ666/The-Impact-of-Reasoning-Step-Length-on-Large-Language-Models \ No newline at end of file diff --git a/benchmark/experimentation_bench/llm_reasoning_2/q6.txt b/benchmark/experimentation_bench/llm_reasoning_2/q6.txt index acfcf9e..db0c7b7 100644 --- a/benchmark/experimentation_bench/llm_reasoning_2/q6.txt +++ b/benchmark/experimentation_bench/llm_reasoning_2/q6.txt @@ -1,93 +1,5 @@ -Question: Does the optimal number of reasoning steps vary across different LLMs (i.e., gpt-4o-mini and gpt-4o), and if so, what is the nature of this relationship? 
Your task is to determine the optimal reasoning step count for each model using the `last_letters` dataset. -The code you need is available in `/starter_file/cot_reasoning_step` - -Instructions: -1. Set OpenAI credentials: -``` -source /curie/setup/env.sh -``` -2. You will call `run_inference.py` with the following parameters. Make sure to read `run_inference.py` via cat first so you understand its contents. - -- Set the args.method to auto_cot -- Set the args.dataset to test 1 dataset:last_letters -- Example command: - -``` -python run_inference.py --dataset last_letters --demo_path demo/last_letters_3 --output_dir experiment/gpt-4o-mini/last_letters_3 > log/gpt-4o-mini/last_letters_3.log #3 represent the number of add step, last_letters is the name of the dataset -``` - -3. Test with **gpt-4o-mini**: - You will call `run_inference.py` with the following parameters. Make sure to read `run_inference.py` via cat first so you understand its contents. - -- Set the following parameters: - - `args.method`: `auto_cot` - - `args.model`: `gpt-4o-mini` -- Optional: Increase `args.max_length_cot` to accommodate longer reasoning steps if necessary. -- Example command to run inference: - -``` -python run_inference.py --dataset last_letters --demo_path demos/last_letters_1 --output_dir experiment/gpt-4o-mini/last_letters_1 > log/gpt-4o-mini/last_letters_1.log -``` - - - Replace `last_letters_1` with the appropriate demo file for testing different reasoning steps. The number in the demo name (e.g., `1`) corresponds to the number of reasoning steps added. - -- Execute these within the directory `/starter_file/cot_reasoning_step`. This will ensure you don't face file path errors when running the commands above. -- Here are the demos available (you don't need to test all of them, just what makes sense): -last_letters_1 -last_letters_10 -last_letters_2 -last_letters_3 -last_letters_4 -last_letters_5 -last_letters_6 - -4. Test with **gpt-4o**: - -- Modify `run_inference.py` again, and set the following parameters: - - `args.method`: `auto_cot` - - `args.model`: `gpt-4o` -- Run inference for `gpt-4o` with varying reasoning steps. -- Example command: - -```bash -python run_inference.py --dataset last_letters --demo_path demos/last_letters_1 --output_dir experiment/gpt-4o/last_letters_1 > log/gpt-4o/last_letters_1.log -``` - -- Replace `last_letters_1` with the appropriate demo file for testing different reasoning steps. - -- Execute these within the directory `/starter_file/The-Impact-of-Reasoning-Step-Length-on-Large-Language-Models`. This will ensure you don't face file path errors when running the commands above. -- Here are the demos available (you don't need to test all of them, just what makes sense): -last_letters_1 -last_letters_10 -last_letters_2 -last_letters_3 -last_letters_4 -last_letters_5 -last_letters_6 - -5. Determine the optimal reasoning steps for **gpt-4o-mini**: - -- Review the log files generated in the `log/` directory. The accuracy is reported at the end of each file (e.g., `log/gpt-4o-mini/last_letters_1.log`). -- Identify the demo file that achieved the **highest accuracy** (e.g., `demos/last_letters_3`). -- Open the demo file and count the number of reasoning steps in the rationale section: - - Exclude the first sentence, such as "Let’s think step by step." - - Exclude the last sentence indicating the final answer. - - Only count intermediate steps. -- Record the **optimal number of reasoning steps** for gpt-4o-mini. - -6. 
Determine the optimal reasoning steps for **gpt-4o**: - - Review the log files generated in the `log/` directory for gpt-4o (e.g., `log/gpt-4o/last_letters_1.log`). - - Identify the demo file that achieved the **highest accuracy**. - - Open the corresponding demo file and count the reasoning steps as described in Step 5. - -7. Compare and analyze: - - Summarize the results for both models (`gpt-4o-mini` and `gpt-4o`): - - Dataset name: `last_letters`. - - Model name: `gpt-4o-mini` or `gpt-4o`. - - Log file with the highest accuracy. - - Demo file corresponding to the highest accuracy. - - Optimal number of reasoning steps. - - Discuss how the optimal reasoning steps differ between the models and what this implies about their reasoning capabilities and computational efficiency. - +Additional details: +- Use the Auto-CoT method for increasing number of reasoning steps. +- Feel free to refer to the code here: https://github.com/MingyuJ666/The-Impact-of-Reasoning-Step-Length-on-Large-Language-Models \ No newline at end of file diff --git a/benchmark/experimentation_bench/llm_reasoning_2/q7.txt b/benchmark/experimentation_bench/llm_reasoning_2/q7.txt index 93bcc68..c73a847 100644 --- a/benchmark/experimentation_bench/llm_reasoning_2/q7.txt +++ b/benchmark/experimentation_bench/llm_reasoning_2/q7.txt @@ -1,55 +1,7 @@ -Question: -How do different methods of expanding reasoning steps (e.g., repeating the question, self-verification, making equations) affect the model's accuracy, and are some expansion strategies more effective than others? - -The code you need is available in `/starter_file/cot_reasoning_step` - -Instructions: -1. Set OpenAI credentials: -``` -source /curie/setup/env.sh -``` -2. You will call `run_inference.py` with the following parameters. Make sure to read `run_inference.py` via cat first so you understand its contents. - -- Set the args.method to auto_cot -- Set the args.model to gpt-4o-mini -- Set the args.dataset to test 1 dataset: gsm8k -- Example command: - -``` -python run_inference.py --dataset gsm8k --demo_path demo/gsm8k_1 --output_dir experiment/gpt-4o-mini/gsm8k_1 > log/gpt-4o-mini/gsm8k_1.log -``` - -3. Use pre-existing demo files: - -- The demo files for different reasoning expansion strategies are already available in the `demo` directory. For example: - - Repeating the question: `demo/gsm8k_readquestion` - - Self-verification: `demo/gsm8k_selfverification` - - Making equations: `demo/gsm8k_makeequations` - - Think about words: `demo/gsm8k_thinkaboutwords` -- Choose the relevant demo files for the dataset and strategy. - -4. Run inference for each demo: - -- Test the accuracy for each reasoning strategy: -- Example command: - -python run_inference.py --dataset gsm8k --demo_path demo/gsm8k_selfverification --output_dir experiment/gpt-4o-mini/gsm8k_selfverification > log/gpt-4o-mini/gsm8k_selfverification.log - - - Replace `gsm8k_selfverification` with the appropriate demo file for each reasoning strategy. - -5. Compare accuracy: - -- Review the log files generated in the `log/` directory. The accuracy is reported at the end of each file (e.g., `log/gpt-4o-mini/gsm8k_selfverification.log`). -- Collect the accuracy for each strategy. - -6. Report findings: - - For each dataset and reasoning expansion strategy, summarize the results: - - Dataset name. - - Expansion strategy used (e.g., repeating the question, self-verification). - - Log file with accuracy. - - Observed accuracy. - -7. 
Draw conclusions: - - Analyze which expansion strategy was most effective for improving accuracy. - - Discuss any trade-offs observed (e.g., reasoning complexity vs. accuracy gains). +How do different methods of expanding reasoning steps (i.e., repeating the question, self-verification, making equations, Think about word) affect the model's accuracy, and are some expansion strategies more effective than others? +Additional details: +- Test this for the gsm8k dataset. +- Use GPT-4o-mini as the model. +- The demo files needed to utilize these strategies/methods is already available in the repo below. +- Feel free to refer to the code here: https://github.com/MingyuJ666/The-Impact-of-Reasoning-Step-Length-on-Large-Language-Models \ No newline at end of file diff --git a/benchmark/experimentation_bench/llm_reasoning_2/q8.txt b/benchmark/experimentation_bench/llm_reasoning_2/q8.txt index f98562c..e1b3c22 100644 --- a/benchmark/experimentation_bench/llm_reasoning_2/q8.txt +++ b/benchmark/experimentation_bench/llm_reasoning_2/q8.txt @@ -1,59 +1,7 @@ -Question: -What is the relationship between the complexity of a task (e.g., as measured by the number of logical inferences or mathematical operations needed) and the optimal length of the reasoning chain? +What is the relationship between the complexity of a task (i.e., as measured by the number of logical inferences or mathematical operations needed) and the optimal length of the reasoning chain? -The code you need is available in `/starter_file/cot_reasoning_step` - -Instructions: -1. Set OpenAI credentials: -``` -source /curie/setup/env.sh -``` -2. You will call `run_inference.py` with the following parameters. Make sure to read `run_inference.py` via cat first so you understand its contents. - -- Set the args.method to auto_cot -- Set the args.model to gpt-4o-mini -- Set the args.dataset to test 2 datasets: gsm8k and last_letters -- Example command: - -``` -python run_inference.py --dataset last_letters --demo_path demo/last_letters_3 --output_dir experiment/gpt-4o-mini/last_letters_3 > log/gpt-4o-mini/last_letters_3.log #1 represent the number of add step, last_letters is the name of the dataset -python run_inference.py --dataset gsm8k --demo_path demo/gsm8k_1 --output_dir experiment/gpt-4o-mini/gsm8k_1 > log/gpt-4o-mini/gsm8k_1.log -``` -- Execute these within the directory `/starter_file/cot_reasoning_step`. This will ensure you don't face file path errors when running the commands above. -- Here are the demos available (you don't need to test all of them, just what makes sense): -gsm8k_2 -gsm8k_3 -gsm8k_1 -last_letters_1 -last_letters_10 -last_letters_2 -last_letters_3 -last_letters_4 -last_letters_5 -last_letters_6 - -3. Call run_inference.py with increasing number of reasoning steps. You can change the reasoning steps using different demo. In the above examples, last_letters_3 refers to add 3 reasoning steps, while last_letters_1 would refer to add 1 reasoning step. - Optional: You can increase the args.max_length_cot in case the output of the model is truncated, and you need to view all of it. This may be useful when increasing the steps of cot. - -4. Determine the optimal number of reasoning steps: - - Review the log files in the `log/` directory. The accuracy is reported at the end of each file (e.g., `log/gpt-4o-mini/last_letters_1.log`). - - Identify the demo file associated with the highest accuracy for each dataset. 
- - Open the corresponding demo file and count the number of reasoning steps in the rationale section: - - Exclude the first sentence (e.g., “Let’s think step by step.”). - - Exclude the final sentence indicating the answer. - - Only count the intermediate reasoning steps. - - Record the optimal number of reasoning steps for each dataset. - -5. Analyze task complexity: - - Review the dataset questions and answers to assess task complexity: - - For example, in `gsm8k`, consider the number of mathematical operations or logical inferences required. - - In `last_letters`, evaluate the pattern recognition or sequence-following requirements. - - Compare the complexity of tasks with the corresponding optimal reasoning step counts. - -6. Draw conclusions: - - Summarize your findings for each dataset, including: - - Dataset name. - - Log file with the highest accuracy. - - Optimal number of reasoning steps. - - Analysis of task complexity. - - Provide insights into how the complexity of a task influences the optimal length of the reasoning chain. +Additional details: +- Use GPT-4o-mini as the model. +- Use the gsm8k and last_letters datasets. +- Use the Auto-CoT method for increasing number of reasoning steps. +- Feel free to refer to the code here: https://github.com/MingyuJ666/The-Impact-of-Reasoning-Step-Length-on-Large-Language-Models \ No newline at end of file diff --git a/benchmark/experimentation_bench/llm_reasoning_2/q9.txt b/benchmark/experimentation_bench/llm_reasoning_2/q9.txt index 58241fe..1e267ef 100644 --- a/benchmark/experimentation_bench/llm_reasoning_2/q9.txt +++ b/benchmark/experimentation_bench/llm_reasoning_2/q9.txt @@ -1,50 +1,8 @@ -Question: How does the position of an incorrect step within the reasoning chain affect the overall outcome? Is an early error more detrimental than a later one? -The code you need is available in `/starter_file/cot_reasoning_step` - -Instructions: -1. Set OpenAI credentials: -``` -source /curie/setup/env.sh -``` -2. You will call `run_inference.py` with the following parameters. Make sure to read `run_inference.py` via cat first so you understand its contents. - -- Set the args.method to auto_cot -- Set the args.model to gpt-4o-mini -- Set the args.dataset to test 1 dataset: gsm8k -- Example command: - -``` -python run_inference.py --dataset gsm8k --demo_path demo/gsm8k_3 --output_dir experiment/gpt-4o-mini/gsm8k_3 > log/gpt-4o-mini/gsm8k_3.log -``` - -3. Use demo files: - -- The modified demos for `gsm8k` are already available: - - **Early errors:** `demo/gsm8k_early` - - **Later errors:** `demo/gsm8k_later` - -4. Run inference for each demo: - - Test the accuracy for the demos with early and later errors using the following commands: - -python run_inference.py --dataset gsm8k --demo_path demo/gsm8k_early --output_dir experiment/gpt-4o-mini/gsm8k_early > log/gpt-4o-mini/gsm8k_early.log -python run_inference.py --dataset gsm8k --demo_path demo/gsm8k_later --output_dir experiment/gpt-4o-mini/gsm8k_later > log/gpt-4o-mini/gsm8k_later.log - -5. Analyze accuracy: - -- Review the log files generated in the `log/` directory. The accuracy is reported at the end of each file: - - `log/gpt-4o-mini/gsm8k_early.log` (for early errors). - - `log/gpt-4o-mini/gsm8k_later.log` (for later errors). -- Record and compare the accuracy for the two cases. - - -6. Draw conclusions: - - Summarize your findings: - - Dataset name: `gsm8k`. - - Accuracy with early errors: Report from `log/gpt-4o-mini/gsm8k_early.log`. 
- - Accuracy with later errors: Report from `log/gpt-4o-mini/gsm8k_later.log`. - - Discuss how the position of an error affects the model’s performance: - - Does an early error disrupt the logical flow more than a later error? - - How much does the position of the error impact overall accuracy? - +Additional details: +- Use GPT-4o-mini as the model. +- Use the gsm8k dataset. +- Use the Auto-CoT method for increasing number of reasoning steps. +- The early-error demo file is located in `gsm8k_early`, and the later-error demo file is in `gsm8k_later`; both are in the repo below. +- Feel free to refer to the code here: https://github.com/MingyuJ666/The-Impact-of-Reasoning-Step-Length-on-Large-Language-Models \ No newline at end of file diff --git a/benchmark/experimentation_bench/ml_training/ground_truth/additional/q1.txt b/benchmark/experimentation_bench/ml_training/ground_truth/additional/q1.txt new file mode 100644 index 0000000..92fa0d6 --- /dev/null +++ b/benchmark/experimentation_bench/ml_training/ground_truth/additional/q1.txt @@ -0,0 +1,18 @@ +# Design: + +{ + "constant_vars": [ + "dataset=House price prediction dataset (train.csv, test.csv as described)", + "hardware=CPU-only", + "evaluation_method=provided eval.py script" + ], + "independent_vars": [ + "models=any valid ML model for the task specified in the question", + "hyperparameters=any valid hyperparameters for the specified models" + ], + "dependent_vars": [ + "Mean Absolute Error (MAE)" + ] +} + +# Setup: refer to instructions in question file. \ No newline at end of file diff --git a/benchmark/experimentation_bench/ml_training/ground_truth/additional/q2.txt b/benchmark/experimentation_bench/ml_training/ground_truth/additional/q2.txt new file mode 100644 index 0000000..a374a26 --- /dev/null +++ b/benchmark/experimentation_bench/ml_training/ground_truth/additional/q2.txt @@ -0,0 +1,20 @@ +# Design: + +{ + "constant_vars": [ + "dataset=IMDB movie reviews dataset from HuggingFace", + "hardware=CPU-only", + "evaluation_method=provided eval.py script" + ], + "independent_vars": [ + "model=any valid ML model suitable for sentiment analysis (e.g., Logistic Regression, CNN, Transformer, DistilBERT)", + "feature_representation=any valid method (e.g., TF-IDF, embeddings, tokenization)", + "hyperparameters=any valid hyperparameters, e.g., learning rate, epochs, optimizer, batch size" + ], + "dependent_vars": [ + "accuracy", + "class-wise probabilities" + ] +} + +# Setup: refer to instructions in question file.
\ No newline at end of file diff --git a/benchmark/experimentation_bench/ml_training/ground_truth/additional/q3.txt b/benchmark/experimentation_bench/ml_training/ground_truth/additional/q3.txt new file mode 100644 index 0000000..a0ddc4a --- /dev/null +++ b/benchmark/experimentation_bench/ml_training/ground_truth/additional/q3.txt @@ -0,0 +1,20 @@ +# Design: + +{ + "constant_vars": [ + "dataset=Feedback analysis dataset (prepared via provided scripts)", + "hardware=CPU-only", + "evaluation_method=provided evaluation setup measuring MCRMSE and per-category RMSE", + "Docker_environment=qhwang123/researchassistant:latest image with specified dependencies" + ], + "independent_vars": [ + "model=choice of language models or architectures suitable for text analysis (e.g., Transformers, DistilBERT, RoBERTa, etc.)", + "hyperparameters=any valid hyperparameters for the chosen model, e.g., learning rate, batch size, epochs, optimizer, dropout rates" + ], + "dependent_vars": [ + "Mean Column-wise Root Mean Squared Error (MCRMSE)", + "Per-category RMSE" + ] +} + +# Setup: refer to instructions in question file. \ No newline at end of file diff --git a/benchmark/experimentation_bench/ml_training/ground_truth/additional/q4.txt b/benchmark/experimentation_bench/ml_training/ground_truth/additional/q4.txt new file mode 100644 index 0000000..2852430 --- /dev/null +++ b/benchmark/experimentation_bench/ml_training/ground_truth/additional/q4.txt @@ -0,0 +1,19 @@ +# Design: + +{ + "constant_vars": [ + "dataset=Spaceship Titanic dataset as described in env/task_descriptor.txt", + "hardware=CPU-only", + "evaluation_method=provided eval.py script and submission.csv formatting" + ], + "independent_vars": [ + "model=any valid classification model suitable for tabular data (e.g., Logistic Regression, Random Forest, XGBoost, LightGBM, Neural Network)", + "feature_selection=choice of input features from provided dataset", + "hyperparameters=any valid hyperparameters for the chosen model, e.g., learning rate, number of estimators, max depth, batch size" + ], + "dependent_vars": [ + "classification accuracy" + ] +} + +# Setup: refer to instructions in question file. \ No newline at end of file diff --git a/benchmark/experimentation_bench/ml_training/ground_truth/additional/q5.txt b/benchmark/experimentation_bench/ml_training/ground_truth/additional/q5.txt new file mode 100644 index 0000000..21db31c --- /dev/null +++ b/benchmark/experimentation_bench/ml_training/ground_truth/additional/q5.txt @@ -0,0 +1,20 @@ +# Design: + +{ + "constant_vars": [ + "dataset=Parkinson’s disease progression dataset (time-series data as described in env/data_description.txt)", + "hardware=specified Docker environment with CPU or GPU setup", + "evaluation_metric=Symmetric Mean Absolute Percentage Error (SMAPE) computed per UPDRS subscore and averaged" + ], + "independent_vars": [ + "model_architecture=any valid regression model or neural network architecture suitable for time-series prediction (e.g., LSTM, GRU, Transformer, fully-connected neural networks)", + "feature_extraction=methods such as time windowing, feature selection, normalization strategies", + "hyperparameters=any valid hyperparameters for the chosen model, e.g., learning rate, optimizer, batch size, epochs, dropout, regularization" + ], + "dependent_vars": [ + "SMAPE per UPDRS subscore (updrs_0, updrs_1, updrs_2, updrs_3)", + "overall average SMAPE across all four UPDRS subscores" + ] +} + +# Setup: refer to instructions in question file. 
\ No newline at end of file diff --git a/benchmark/experimentation_bench/ml_training/ground_truth/additional/q6.txt b/benchmark/experimentation_bench/ml_training/ground_truth/additional/q6.txt new file mode 100644 index 0000000..374a4e4 --- /dev/null +++ b/benchmark/experimentation_bench/ml_training/ground_truth/additional/q6.txt @@ -0,0 +1,15 @@ +# Design: + +{ + "constant_vars": [ + "evaluation_method=provided eval.py script measuring execution speed" + ], + "independent_vars": [ + "implementation_strategy=use of numpy-based vectorization techniques for code segments originally non-vectorized", + ], + "dependent_vars": [ + "execution speed (runtime)" + ] +} + +# Setup: refer to instructions in question file. \ No newline at end of file diff --git a/benchmark/experimentation_bench/ml_training/ground_truth/additional/q7.txt b/benchmark/experimentation_bench/ml_training/ground_truth/additional/q7.txt new file mode 100644 index 0000000..cfa1048 --- /dev/null +++ b/benchmark/experimentation_bench/ml_training/ground_truth/additional/q7.txt @@ -0,0 +1,17 @@ +# Design: + +{ + "constant_vars": [ + "dataset=BabyLM child-directed linguistic dataset", + "implementation_framework=Huggingface Transformers library" + ], + "independent_vars": [ + "model_architecture=any valid language modeling architecture (e.g., GPT, Transformer variants)", + "hyperparameters=any valid hyperparameters for the chosen architecture, e.g., learning rate, optimizer, batch size, epochs", + ], + "dependent_vars": [ + "validation perplexity" + ] +} + +# Setup: refer to instructions in question file. \ No newline at end of file diff --git a/benchmark/experimentation_bench/ml_training/ground_truth/q1.txt b/benchmark/experimentation_bench/ml_training/ground_truth/q1.txt index c9be2c0..32d5c04 100644 --- a/benchmark/experimentation_bench/ml_training/ground_truth/q1.txt +++ b/benchmark/experimentation_bench/ml_training/ground_truth/q1.txt @@ -1 +1,3 @@ -A MAE value covering around 20000, and above a few hundred, should make sense. +# Answer: + +A MAE value below 20000, and above a few hundred. \ No newline at end of file diff --git a/benchmark/experimentation_bench/ml_training/ground_truth/q2.txt b/benchmark/experimentation_bench/ml_training/ground_truth/q2.txt index 6969e40..cd2b592 100644 --- a/benchmark/experimentation_bench/ml_training/ground_truth/q2.txt +++ b/benchmark/experimentation_bench/ml_training/ground_truth/q2.txt @@ -1,13 +1,3 @@ -Ground truth: a reasonable accuracy similar or higher to the following. -#### Control Group -- **Model**: Logistic Regression with TF-IDF -- **Accuracy**: 88.292% -- **Reproducibility**: Results were consistent across two runs. +# Answer: -#### Experimental Group (Partition 1) -- **Model**: Logistic Regression with TF-IDF and Synonym Augmentation -- **Accuracy**: 85.768% -- **Reproducibility**: Results were consistent across two runs. - -### Conclusion -The control group, using Logistic Regression with TF-IDF, achieved higher accuracy compared to the experimental group with synonym augmentation. This suggests that in this context, adding synonym augmentation did not enhance model performance, potentially due to noise introduced by synonyms in sentiment contexts. 
\ No newline at end of file +A reasonable accuracy similar to or higher than 88.2% \ No newline at end of file diff --git a/benchmark/experimentation_bench/ml_training/ground_truth/q3.txt b/benchmark/experimentation_bench/ml_training/ground_truth/q3.txt index ddc58cf..b76fae8 100644 --- a/benchmark/experimentation_bench/ml_training/ground_truth/q3.txt +++ b/benchmark/experimentation_bench/ml_training/ground_truth/q3.txt @@ -1,4 +1,5 @@ -Ground truth: +# Answer: + A reasonable MCRMSE should be similar to or lower than the following baseline performance: Baseline MCRMSE: 2.7286 diff --git a/benchmark/experimentation_bench/ml_training/ground_truth/q4.txt b/benchmark/experimentation_bench/ml_training/ground_truth/q4.txt index 57aa941..182f592 100644 --- a/benchmark/experimentation_bench/ml_training/ground_truth/q4.txt +++ b/benchmark/experimentation_bench/ml_training/ground_truth/q4.txt @@ -1 +1,3 @@ -Ground truth: reasonable accuracy (not extreme at 0 or 100). \ No newline at end of file +# Answer: + +Reasonable classification accuracy (not extreme at 0 or 100). \ No newline at end of file diff --git a/benchmark/experimentation_bench/ml_training/ground_truth/q5.txt b/benchmark/experimentation_bench/ml_training/ground_truth/q5.txt index cd76e5a..169d4b3 100644 --- a/benchmark/experimentation_bench/ml_training/ground_truth/q5.txt +++ b/benchmark/experimentation_bench/ml_training/ground_truth/q5.txt @@ -1,4 +1,5 @@ -Ground truth: +# Answer: + A reasonable model should achieve an overall SMAPE score lower than the baseline while maintaining stable performance across all four UPDRS subscores. Baseline (from current train.py run): diff --git a/benchmark/experimentation_bench/ml_training/ground_truth/q6.txt b/benchmark/experimentation_bench/ml_training/ground_truth/q6.txt index 1fecc60..f4f8d3b 100644 --- a/benchmark/experimentation_bench/ml_training/ground_truth/q6.txt +++ b/benchmark/experimentation_bench/ml_training/ground_truth/q6.txt @@ -1,19 +1,3 @@ -#### Control Group Results -- **Execution Times**: - - Result 1: 1.62 seconds - - Result 2: 1.64 seconds +# Answer: -These results represent the baseline execution speed of the `env/train.py` script without any vectorization enhancements. - -#### Experimental Group Results -- **Execution Times**: - - Result 1: - - 0.042 seconds - - 0.0035 seconds - - 0.0035 seconds - - Result 2: - - 0.011 seconds - - 0.012 seconds - - 0.051 seconds - -The experimental results demonstrate a significant reduction in execution time compared to the control group, confirming the effectiveness of the vectorization techniques applied. The results are consistent across multiple test runs. \ No newline at end of file +An execution time that is less than 1.64 seconds. \ No newline at end of file diff --git a/benchmark/experimentation_bench/ml_training/ground_truth/q7.txt b/benchmark/experimentation_bench/ml_training/ground_truth/q7.txt index 7da7578..9e3657f 100644 --- a/benchmark/experimentation_bench/ml_training/ground_truth/q7.txt +++ b/benchmark/experimentation_bench/ml_training/ground_truth/q7.txt @@ -1,5 +1,6 @@ -Ground truth: -A reasonable model should achieve a perplexity lower than the baseline, indicating improved language modeling performance on child-directed text data. +# Answer: + +A reasonable model should achieve a validation perplexity lower than the baseline, indicating improved language modeling performance on child-directed text data. 
Baseline (from current train.py run): diff --git a/benchmark/experimentation_bench/ml_training/q1_housing_price.txt b/benchmark/experimentation_bench/ml_training/q1_housing_price.txt index c552106..048fce8 100644 --- a/benchmark/experimentation_bench/ml_training/q1_housing_price.txt +++ b/benchmark/experimentation_bench/ml_training/q1_housing_price.txt @@ -10,6 +10,10 @@ conda install pytorch torchvision torchaudio pytorch-cuda=11.7 -c pytorch -c nvi pip install -q -r requirements.txt ``` +- Note: if /starter_file/MLAgentBench does not exist: + 1. If /starter_file does not exist, feel free to replace /starter_file with some other directory that you have access to. Then, follow the next step. + 2. Download the repo from: https://github.com/snap-stanford/MLAgentBench + - Install Kaggle ``` export KAGGLE_CONFIG_DIR=/starter_file/MLAgentBench/.kaggle diff --git a/benchmark/experimentation_bench/ml_training/q2_imdb.txt b/benchmark/experimentation_bench/ml_training/q2_imdb.txt index a552aa9..6a2f2a3 100644 --- a/benchmark/experimentation_bench/ml_training/q2_imdb.txt +++ b/benchmark/experimentation_bench/ml_training/q2_imdb.txt @@ -10,6 +10,10 @@ conda install pytorch torchvision torchaudio pytorch-cuda=11.7 -c pytorch -c nvi pip install -q -r requirements.txt ``` +- Note: if /starter_file/MLAgentBench does not exist: + 1. If /starter_file does not exist, feel free to replace /starter_file with some other directory that you have access to. Then, follow the next step. + 2. Download the repo from: https://github.com/snap-stanford/MLAgentBench + - Install Kaggle ``` export KAGGLE_CONFIG_DIR=/starter_file/MLAgentBench/.kaggle diff --git a/benchmark/experimentation_bench/ml_training/q4_spaceship.txt b/benchmark/experimentation_bench/ml_training/q4_spaceship.txt index 6db1393..81eb6e5 100644 --- a/benchmark/experimentation_bench/ml_training/q4_spaceship.txt +++ b/benchmark/experimentation_bench/ml_training/q4_spaceship.txt @@ -10,6 +10,10 @@ conda install pytorch torchvision torchaudio pytorch-cuda=11.7 -c pytorch -c nvi pip install -q -r requirements.txt ``` +- Note: if /starter_file/MLAgentBench does not exist: + 1. If /starter_file does not exist, feel free to replace /starter_file with some other directory that you have access to. Then, follow the next step. + 2. Download the repo from: https://github.com/snap-stanford/MLAgentBench + - Install Kaggle ``` export KAGGLE_CONFIG_DIR=/starter_file/MLAgentBench/.kaggle diff --git a/benchmark/experimentation_bench/ml_training/q6_vector.txt b/benchmark/experimentation_bench/ml_training/q6_vector.txt index 4c26cd4..2344f3c 100644 --- a/benchmark/experimentation_bench/ml_training/q6_vector.txt +++ b/benchmark/experimentation_bench/ml_training/q6_vector.txt @@ -10,6 +10,10 @@ conda install pytorch torchvision torchaudio pytorch-cuda=11.7 -c pytorch -c nvi pip install -q -r requirements.txt ``` +- Note: if /starter_file/MLAgentBench does not exist: + 1. If /starter_file does not exist, feel free to replace /starter_file with some other directory that you have access to. Then, follow the next step. + 2. 
Download the repo from: https://github.com/snap-stanford/MLAgentBench + - Install Kaggle ``` export KAGGLE_CONFIG_DIR=/starter_file/MLAgentBench/.kaggle diff --git a/benchmark/experimentation_bench/vector_index/ground_truth/additional/q1.txt b/benchmark/experimentation_bench/vector_index/ground_truth/additional/q1.txt new file mode 100644 index 0000000..c415afb --- /dev/null +++ b/benchmark/experimentation_bench/vector_index/ground_truth/additional/q1.txt @@ -0,0 +1,20 @@ +# Design: + +{ + "constant_vars": [ + "dataset=SIFT1M", + "index=HNSW", + "k=10", + "M=32", + "efConstruction=40" + ], + "independent_vars": [ + "efSearch=at least 3 distinct integer values (e.g., 10, 50, 100)" + ], + "dependent_vars": [ + "query latency", + "recall" + ] +} + +# Setup: refer to question. \ No newline at end of file diff --git a/benchmark/experimentation_bench/vector_index/ground_truth/additional/q10.txt b/benchmark/experimentation_bench/vector_index/ground_truth/additional/q10.txt new file mode 100644 index 0000000..5c05153 --- /dev/null +++ b/benchmark/experimentation_bench/vector_index/ground_truth/additional/q10.txt @@ -0,0 +1,21 @@ +# Design: + +{ + "constant_vars": [ + "index_type=PQ", + "script=adapted bench_hnsw.py for PQ evaluation", + ], + "independent_vars": [ + "d=vector dimensionality (e.g., 64, 128, 256)", + "nt=number of training vectors (e.g., 1K, 10K, 100K)", + "nb=number of base vectors (e.g., 10K, 100K, 1M)", + "nq=number of query vectors (e.g., 100, 1K, 10K)" + ], + "dependent_vars": [ + "recall", + "memory usage", + "latency" + ] +} + +# Setup: refer to question. \ No newline at end of file diff --git a/benchmark/experimentation_bench/vector_index/ground_truth/additional/q11.txt b/benchmark/experimentation_bench/vector_index/ground_truth/additional/q11.txt new file mode 100644 index 0000000..f71f768 --- /dev/null +++ b/benchmark/experimentation_bench/vector_index/ground_truth/additional/q11.txt @@ -0,0 +1,19 @@ +# Design: + +{ + "constant_vars": [ + "index_type=HNSW", + "script=adapted bench_hnsw.py to accept synthetic dataset", + "dataset_type=synthetic data generated with np.random.normal", + ], + "independent_vars": [ + "data_size=number of vectors in the dataset (e.g., 10K, 100K, 1M)", + "mean=mean of the normal distribution used to generate data (e.g., 0, 100, 1000)", + "variance=variance of the normal distribution used to generate data (e.g., 1, 10, 100)" + ], + "dependent_vars": [ + "recall" + ] +} + +# Setup: refer to question. \ No newline at end of file diff --git a/benchmark/experimentation_bench/vector_index/ground_truth/additional/q12.txt b/benchmark/experimentation_bench/vector_index/ground_truth/additional/q12.txt new file mode 100644 index 0000000..be08a59 --- /dev/null +++ b/benchmark/experimentation_bench/vector_index/ground_truth/additional/q12.txt @@ -0,0 +1,20 @@ +# Design: + +{ + "constant_vars": [ + "index_type=HNSW", + "dataset_type=SyntheticDataset using faiss.contrib.datasets.SyntheticDataset", + ], + "independent_vars": [ + "d=[32, 64]", + "nb=[100000, 200000]", + "nq=[500, 1000]" + ], + "dependent_vars": [ + "M=minimum M value needed to reach ≥90% recall", + "efConstruction=minimum efConstruction value needed to reach ≥90% recall", + "efSearch=minimum efSearch value needed to reach ≥90% recall" + ] +} + +# Setup: refer to question. 
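The vector_index designs above (q1, and similarly q9 and q12) come down to sweeping HNSW parameters and recording recall and per-query latency. Below is a minimal sketch of that loop; it assumes `faiss-cpu` is installed and substitutes a small random dataset for SIFT1M, with exact results from a flat index standing in for the dataset's ground truth, so absolute numbers will not match bench_hnsw.py.

```python
import time
import numpy as np
import faiss  # assumes `pip install faiss-cpu`

d, nb, nq, k = 64, 20_000, 500, 10                 # small stand-in for SIFT1M (d=128, nb=1M)
rng = np.random.default_rng(0)
xb = rng.standard_normal((nb, d)).astype("float32")
xq = rng.standard_normal((nq, d)).astype("float32")

flat = faiss.IndexFlatL2(d)                        # exact search provides the ground truth
flat.add(xb)
_, gt = flat.search(xq, 1)

index = faiss.IndexHNSWFlat(d, 32)                 # M=32
index.hnsw.efConstruction = 40
index.add(xb)

for ef in (10, 50, 100):                           # efSearch values from the design
    index.hnsw.efSearch = ef
    t0 = time.perf_counter()
    _, labels = index.search(xq, k)
    ms_per_query = (time.perf_counter() - t0) * 1000 / nq
    # recall here = fraction of queries whose true nearest neighbor appears in the top-k
    recall = float((labels == gt[:, :1]).any(axis=1).mean())
    print(f"efSearch={ef:4d}  recall={recall:.4f}  latency={ms_per_query:.4f} ms/query")
```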
\ No newline at end of file diff --git a/benchmark/experimentation_bench/vector_index/ground_truth/additional/q13.txt b/benchmark/experimentation_bench/vector_index/ground_truth/additional/q13.txt new file mode 100644 index 0000000..acd3419 --- /dev/null +++ b/benchmark/experimentation_bench/vector_index/ground_truth/additional/q13.txt @@ -0,0 +1,22 @@ +# Design: + +{ + "constant_vars": [ + "dataset=SIFT1M", + "index_type=HNSW", + "target_recall=0.95", + "script=bench_hnsw.py", + ], + "independent_vars": [ + "latency_targets=[5ms, 1ms, 0.1ms, 0.05ms]", + "M=number of neighbors per node (to be tuned)", + "efConstruction=HNSW graph construction effort (to be tuned)", + "k=number of nearest neighbors returned (to be tuned)" + ], + "dependent_vars": [ + "query latency", + "recall" + ] +} + +# Setup: refer to question. Also, check if it is actually producing and checking for per-query latency. \ No newline at end of file diff --git a/benchmark/experimentation_bench/vector_index/ground_truth/additional/q14.txt b/benchmark/experimentation_bench/vector_index/ground_truth/additional/q14.txt new file mode 100644 index 0000000..28a927c --- /dev/null +++ b/benchmark/experimentation_bench/vector_index/ground_truth/additional/q14.txt @@ -0,0 +1,20 @@ +# Design: + +{ + "constant_vars": [ + "dataset=SIFT1M", + "index_type=IVFPQ", + "initial_training_partition=train_data[:partition1]", + "target_metric_trend=analyze over incremental additions" + ], + "independent_vars": [ + "incremental_additions=train_data added in batches (e.g., [10000, 50000, 100000])" + ], + "dependent_vars": [ + "recall", + "query latency", + "memory usage" + ] +} + +# Setup: refer to question \ No newline at end of file diff --git a/benchmark/experimentation_bench/vector_index/ground_truth/additional/q15.txt b/benchmark/experimentation_bench/vector_index/ground_truth/additional/q15.txt new file mode 100644 index 0000000..67c5a2d --- /dev/null +++ b/benchmark/experimentation_bench/vector_index/ground_truth/additional/q15.txt @@ -0,0 +1,21 @@ +# Design: + +{ + "constant_vars": [ + "dataset=SIFT1M", + "index_type=HNSW", + "efSearch=100", + "k=10", + "M=32", + "efConstruction=40" + ], + "independent_vars": [ + "number of runs= 5 individual runs" + ], + "dependent_vars": [ + "recall", + "query latency" + ] +} + +# Setup: refer to question. Also, check that 5 runs were actually done, with required parameters passed to the script correctly. diff --git a/benchmark/experimentation_bench/vector_index/ground_truth/additional/q2.txt b/benchmark/experimentation_bench/vector_index/ground_truth/additional/q2.txt new file mode 100644 index 0000000..0195afb --- /dev/null +++ b/benchmark/experimentation_bench/vector_index/ground_truth/additional/q2.txt @@ -0,0 +1,20 @@ +# Design: + +{ + "constant_vars": [ + "dataset=SIFT1M", + "index=HNSW", + "k=10", + "efConstruction=40" + ], + "independent_vars": [ + "M=values [16, 24, 32]" + ], + "dependent_vars": [ + "memory usage", + "recall", + "query latency" + ] +} + +# Setup: refer to question. 
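The q14 design above (an IVFPQ index trained only on the first partition, then grown by incremental additions) can be prototyped along these lines. This is a hedged sketch: the nlist, m, and nprobe values and the random data are arbitrary stand-ins, memory tracking is omitted, and the actual experiment is expected to go through the provided benchmark scripts.

```python
import time
import numpy as np
import faiss  # assumes `pip install faiss-cpu`

d, k, nlist, m = 64, 10, 256, 8                        # m PQ sub-quantizers; d must be divisible by m
rng = np.random.default_rng(0)
xq = rng.standard_normal((500, d)).astype("float32")
batches = [rng.standard_normal((n, d)).astype("float32") for n in (10_000, 50_000, 100_000)]

quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFPQ(quantizer, d, nlist, m, 8)    # 8 bits per PQ code
index.train(batches[0])                                # train on the first partition only
index.nprobe = 16

flat = faiss.IndexFlatL2(d)                            # exact baseline, grown in lock-step
for i, batch in enumerate(batches, 1):
    index.add(batch)
    flat.add(batch)
    _, gt = flat.search(xq, 1)
    t0 = time.perf_counter()
    _, labels = index.search(xq, k)
    ms = (time.perf_counter() - t0) * 1000 / len(xq)
    recall = float((labels == gt[:, :1]).any(axis=1).mean())
    print(f"after batch {i}: ntotal={index.ntotal:7d}  recall={recall:.3f}  {ms:.3f} ms/query")
```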
\ No newline at end of file diff --git a/benchmark/experimentation_bench/vector_index/ground_truth/additional/q3.txt b/benchmark/experimentation_bench/vector_index/ground_truth/additional/q3.txt new file mode 100644 index 0000000..ac4247a --- /dev/null +++ b/benchmark/experimentation_bench/vector_index/ground_truth/additional/q3.txt @@ -0,0 +1,19 @@ +# Design: + +{ + "constant_vars": [ + "dataset=SIFT1M", + "index=HNSW", + "k=10", + "efConstruction=40", + ], + "independent_vars": [ + "M=values [16, 24, 32]", + ], + "dependent_vars": [ + "memory usage", + "recall" + ] +} + +# Setup: refer to question. \ No newline at end of file diff --git a/benchmark/experimentation_bench/vector_index/ground_truth/additional/q4.txt b/benchmark/experimentation_bench/vector_index/ground_truth/additional/q4.txt new file mode 100644 index 0000000..20d08fe --- /dev/null +++ b/benchmark/experimentation_bench/vector_index/ground_truth/additional/q4.txt @@ -0,0 +1,21 @@ +# Design: + +{ + "constant_vars": [ + "dataset=SIFT1M", + "index=HNSW", + "k=10", + "M=32", + "efConstruction=40", + "efSearch=100" + ], + "independent_vars": [ + "num_threads=multiple integer values representing degree of parallelism (e.g., 1, 2, 4, 8, 16)" + ], + "dependent_vars": [ + "query latency", + "recall" + ] +} + +# Setup: refer to question. \ No newline at end of file diff --git a/benchmark/experimentation_bench/vector_index/ground_truth/additional/q5.txt b/benchmark/experimentation_bench/vector_index/ground_truth/additional/q5.txt new file mode 100644 index 0000000..730561f --- /dev/null +++ b/benchmark/experimentation_bench/vector_index/ground_truth/additional/q5.txt @@ -0,0 +1,19 @@ +# Design: + +{ + "constant_vars": [ + "dataset=SIFT1M", + "index=HNSW", + "k=10" + ], + "independent_vars": [ + "M=[16, 24, 32]", + "efConstruction=[40, 50, 60]" + ], + "dependent_vars": [ + "recall", + "query latency" + ] +} + +# Setup: refer to question. \ No newline at end of file diff --git a/benchmark/experimentation_bench/vector_index/ground_truth/additional/q6.txt b/benchmark/experimentation_bench/vector_index/ground_truth/additional/q6.txt new file mode 100644 index 0000000..f0f9e5c --- /dev/null +++ b/benchmark/experimentation_bench/vector_index/ground_truth/additional/q6.txt @@ -0,0 +1,17 @@ +# Design: + +{ + "constant_vars": [ + "datasets=ANN_SIFT10K and ANN_SIFT1M", + "HNSW_parameters=k=10, M=32, efConstruction=40 (default settings)" + ], + "independent_vars": [ + "index_type=IVF (experimental), HNSW (control)", + "dataset_size=10K (small from ANN_SIFT10K), 1M (large from ANN_SIFT1M)" + ], + "dependent_vars": [ + "index-building time" + ] +} + +# Setup: refer to question. \ No newline at end of file diff --git a/benchmark/experimentation_bench/vector_index/ground_truth/additional/q7.txt b/benchmark/experimentation_bench/vector_index/ground_truth/additional/q7.txt new file mode 100644 index 0000000..24697fd --- /dev/null +++ b/benchmark/experimentation_bench/vector_index/ground_truth/additional/q7.txt @@ -0,0 +1,18 @@ +# Design: + +{ + "constant_vars": [ + "dataset=SIFT1M", + "HNSW_parameters=k=10, M=32, efConstruction=40", + "IVF_parameters=default settings", + ], + "independent_vars": [ + "index_type=HNSW, IVF" + ], + "dependent_vars": [ + "memory usage", + "recall" + ] +} + +# Setup: refer to question. 
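For the q4 design above, which varies the degree of search parallelism, FAISS exposes its OpenMP thread count through `faiss.omp_set_num_threads`. A minimal sweep, again on random stand-in data rather than SIFT1M, might look like the following.

```python
import time
import numpy as np
import faiss  # assumes `pip install faiss-cpu`

d, nb, nq, k = 64, 50_000, 1_000, 10
rng = np.random.default_rng(0)
xb = rng.standard_normal((nb, d)).astype("float32")
xq = rng.standard_normal((nq, d)).astype("float32")

index = faiss.IndexHNSWFlat(d, 32)                 # fixed M=32, efConstruction=40, efSearch=100
index.hnsw.efConstruction = 40
index.hnsw.efSearch = 100
index.add(xb)

for n_threads in (1, 2, 4, 8, 16):
    faiss.omp_set_num_threads(n_threads)           # controls OpenMP parallelism inside FAISS
    t0 = time.perf_counter()
    index.search(xq, k)
    ms_per_query = (time.perf_counter() - t0) * 1000 / nq
    print(f"threads={n_threads:2d}  {ms_per_query:.4f} ms/query")
```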
\ No newline at end of file diff --git a/benchmark/experimentation_bench/vector_index/ground_truth/additional/q8.txt b/benchmark/experimentation_bench/vector_index/ground_truth/additional/q8.txt new file mode 100644 index 0000000..ecdc0c3 --- /dev/null +++ b/benchmark/experimentation_bench/vector_index/ground_truth/additional/q8.txt @@ -0,0 +1,19 @@ +# Design: + +{ + "constant_vars": [ + "dataset=SIFT1M", + "index_type=IVF", + "script=bench_ivf_fastscan.py (minimally modified to accept nprobe)", + "other_ivf_parameters=default" + ], + "independent_vars": [ + "nprobe=multiple integer values (e.g., 1, 4, 8, 16, 32, 64)" + ], + "dependent_vars": [ + "recall", + "query latency" + ] +} + +# Setup: refer to question. \ No newline at end of file diff --git a/benchmark/experimentation_bench/vector_index/ground_truth/additional/q9.txt b/benchmark/experimentation_bench/vector_index/ground_truth/additional/q9.txt new file mode 100644 index 0000000..2cb58e8 --- /dev/null +++ b/benchmark/experimentation_bench/vector_index/ground_truth/additional/q9.txt @@ -0,0 +1,20 @@ +# Design: + +{ + "constant_vars": [ + "dataset=SIFT1M", + "index_type=HNSW", + ], + "independent_vars": [ + "efConstruction=multiple values (e.g., 20, 40, 60)", + "efSearch=multiple values (e.g., 10, 50, 100)", + "M=multiple values (e.g., 16, 24, 32)" + ], + "dependent_vars": [ + "recall", + "memory usage", + "latency" + ] +} + +# Setup: refer to question. \ No newline at end of file diff --git a/benchmark/experimentation_bench/vector_index/ground_truth/q1.txt b/benchmark/experimentation_bench/vector_index/ground_truth/q1.txt index fb35a27..e50fce5 100644 --- a/benchmark/experimentation_bench/vector_index/ground_truth/q1.txt +++ b/benchmark/experimentation_bench/vector_index/ground_truth/q1.txt @@ -1 +1,3 @@ +# Answer: + higher `efSearch` values lead to increased query latency and slightly better recall rates. \ No newline at end of file diff --git a/benchmark/experimentation_bench/vector_index/ground_truth/q10.txt b/benchmark/experimentation_bench/vector_index/ground_truth/q10.txt index eda5058..1978316 100644 --- a/benchmark/experimentation_bench/vector_index/ground_truth/q10.txt +++ b/benchmark/experimentation_bench/vector_index/ground_truth/q10.txt @@ -1,15 +1,33 @@ -Expected outcome: +# Answer: -larger d = higher recall, latency and memory -larger nt = higher recall. same latency and memory -larger nb = higher latency, recall not sure cannot compare, memory same -larger nq = higher latency, memory and recall no change +Effect of increasing d on: -Q: what you found was not observed? +- recall: Recall increases. -What we found instead: +- memory: Memory usage increases. -larger d = consistently high/higher recall, higher memory, higher latency -larger nt = consistently high/higher recall, same memory, higher latency -larger nb = consistently high recall, higher memory, higher latency -larger nq = consistently high/higher recall, same memory, higher latency \ No newline at end of file +- latency: Latency increases. + +Effect of increasing nt on: + +- recall: Recall increases. + +- memory: No change in memory usage. + +- latency: No change in latency. + +Effect of increasing nb on: + +- recall: Effect on recall uncertain (cannot directly compare). + +- memory: No change in memory usage. + +- latency: Latency increases. + +Effect of increasing nq on: + +- recall: No change in recall. + +- memory: No change in memory usage. + +- latency: Latency increases. 
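The q8 design above sweeps `nprobe` on an IVF index. The sketch below uses a plain `IndexIVFFlat` on random data rather than the bench_ivf_fastscan.py path named in the design, so treat it only as an illustration of the nprobe versus recall and latency loop.

```python
import time
import numpy as np
import faiss  # assumes `pip install faiss-cpu`

d, nb, nq, k, nlist = 64, 100_000, 1_000, 10, 1024
rng = np.random.default_rng(0)
xb = rng.standard_normal((nb, d)).astype("float32")
xq = rng.standard_normal((nq, d)).astype("float32")

flat = faiss.IndexFlatL2(d)                        # exact search provides the ground truth
flat.add(xb)
_, gt = flat.search(xq, 1)

quantizer = faiss.IndexFlatL2(d)
index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2)
index.train(xb)
index.add(xb)

for nprobe in (1, 4, 8, 16, 32, 64):               # values from the q8 design
    index.nprobe = nprobe
    t0 = time.perf_counter()
    _, labels = index.search(xq, k)
    ms = (time.perf_counter() - t0) * 1000 / nq
    recall = float((labels == gt[:, :1]).any(axis=1).mean())
    print(f"nprobe={nprobe:3d}  recall={recall:.4f}  latency={ms:.4f} ms/query")
```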
\ No newline at end of file diff --git a/benchmark/experimentation_bench/vector_index/ground_truth/q11.txt b/benchmark/experimentation_bench/vector_index/ground_truth/q11.txt index 7392aff..4225a00 100644 --- a/benchmark/experimentation_bench/vector_index/ground_truth/q11.txt +++ b/benchmark/experimentation_bench/vector_index/ground_truth/q11.txt @@ -1,6 +1,4 @@ -Setup: check that np.random is used to construct the dataset, and that it is somehow passed and used as the dataset in the HNSW index. -Variables: check that mean, variance and dataset size are all varied individually and tested to see their effects. Expected result: -1. **Effect on Increasing Data Size**: The HNSW index generally maintains high recall as data size increases, with occasional slight performance drops in certain cases. +1. **Effect on Increasing Data Size**: Increasing the data size leads to a slight decrease in recall. 2. **Effect on Increasing Mean**: The recall is not significantly affected by changes in the mean, showing robustness. -3. **Effect on Increasing Variance**: The recall remains mostly stable, with some rare instances of decreased performance. +3. **Effect on Increasing Variance**: Higher variance leads to a decline in recall. \ No newline at end of file diff --git a/benchmark/experimentation_bench/vector_index/ground_truth/q12.txt b/benchmark/experimentation_bench/vector_index/ground_truth/q12.txt index 4188b54..ae054f0 100644 --- a/benchmark/experimentation_bench/vector_index/ground_truth/q12.txt +++ b/benchmark/experimentation_bench/vector_index/ground_truth/q12.txt @@ -1,4 +1,6 @@ -Based on the results, we can conclude the following trends for achieving at least 90% recall: +# Answer: + +To achieve at least 90% recall: #### As d values increase: - M values need to: Slightly increase to maintain high recall as dimensions increase. @@ -13,4 +15,4 @@ Based on the results, we can conclude the following trends for achieving at leas #### As nq values increase: - M values need to: Remain stable, as they primarily affect graph connectivity. - efConstruction values need to: Remain stable, since they influence graph construction rather than query performance. -- efSearch values need to: Increase to ensure higher recall with more queries. +- efSearch values need to: Increase to ensure higher recall with more queries. \ No newline at end of file diff --git a/benchmark/experimentation_bench/vector_index/ground_truth/q13.txt b/benchmark/experimentation_bench/vector_index/ground_truth/q13.txt index fac54fe..4cd295a 100644 --- a/benchmark/experimentation_bench/vector_index/ground_truth/q13.txt +++ b/benchmark/experimentation_bench/vector_index/ground_truth/q13.txt @@ -1,17 +1,8 @@ -Setup: verify if actually producing and checking for per-query latency. This should be taken directly from provided script output itself +# Answer: -1. **5ms Latency Requirement:** - - **Configuration:** M=16, efConstruction=100, k=10 - - **Performance:** Achieved recall of approximately 0.9118 to 0.9119 with a latency of around 0.028 ms per query. +To maintain a recall of 0.95 as latency requirements get stricter: +- M values need to: increase +- efConstruction values need to: increase +- k values need to: increase -2. **1ms Latency Requirement:** - - **Configuration:** M=32, efConstruction=200, k=20 - - **Performance:** Achieved recall of approximately 0.9633 with latency values around 0.046 ms per query. - -3. 
**0.1ms Latency Requirement:** - - **Configuration:** M=48, efConstruction=300, k=30 - - **Performance:** Achieved recall of approximately 0.9643 with latency values around 0.046 ms per query. - -4. **0.05ms Latency Requirement:** - - **Configuration:** M=64, efConstruction=400, k=40 - - **Performance:** Achieved recall of approximately 0.9733 with latency values around 0.055 ms per query. \ No newline at end of file +Check that this general trend is followed as the latency requirement gets stricter. \ No newline at end of file diff --git a/benchmark/experimentation_bench/vector_index/ground_truth/q14.txt b/benchmark/experimentation_bench/vector_index/ground_truth/q14.txt index 478620c..70d976e 100644 --- a/benchmark/experimentation_bench/vector_index/ground_truth/q14.txt +++ b/benchmark/experimentation_bench/vector_index/ground_truth/q14.txt @@ -1,3 +1,7 @@ -Recall decrease -Latency increase -Memory increase \ No newline at end of file +# Answer: + +The incremental addition of vectors causes: + +Recall to: decrease +Latency to: increase +Memory to: increase \ No newline at end of file diff --git a/benchmark/experimentation_bench/vector_index/ground_truth/q15.txt b/benchmark/experimentation_bench/vector_index/ground_truth/q15.txt index 0ed90fb..e732182 100644 --- a/benchmark/experimentation_bench/vector_index/ground_truth/q15.txt +++ b/benchmark/experimentation_bench/vector_index/ground_truth/q15.txt @@ -1,5 +1,7 @@ -Setup: check that 5 runs were actually done, with required parameters passed to the script correctly. +# Answer: -- The HNSW index on the SIFT1M dataset consistently achieves high recall and low latency, with an error range for recall of approximately ±0.0005 and latency per query consistently around 0.111 ms. +The HNSW index on the SIFT1M dataset consistently achieves high recall and low latency, with an error range for recall of approximately ±0.0005, and latency per query consistently around 0.111 ms. -The experiment has successfully addressed the question by confirming that the HNSW algorithm exhibits consistent recall and latency when run on the SIFT1M dataset five times with the specified parameters. The determined error range for recall is approximately 0.9948 to 0.9957, and for latency, it is 0.111 to 0.114 seconds. +The determined error range for recall is approximately 0.9948 to 0.9957, and for latency, it is 0.111 to 0.114 ms per query. + +Check that the recall error range is approximately as above, and that the latency error range is similarly small (comparable to the range above). \ No newline at end of file diff --git a/benchmark/experimentation_bench/vector_index/ground_truth/q2.txt b/benchmark/experimentation_bench/vector_index/ground_truth/q2.txt index 1c033b5..be9d738 100644 --- a/benchmark/experimentation_bench/vector_index/ground_truth/q2.txt +++ b/benchmark/experimentation_bench/vector_index/ground_truth/q2.txt @@ -1 +1,3 @@ -Using higher \( M \) values improves recall with manageable increases in query latency, making it a favorable trade-off for applications prioritizing accuracy. +# Answer: + +Using higher \( M \) values improves recall with manageable increases in query latency and memory usage, making it a favorable trade-off for applications prioritizing accuracy. 
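The q15 answer above is about run-to-run variability. One way to reproduce that kind of error range, sketched here on random stand-in data, is to rebuild and re-query the index several times and report the spread; the actual check should run bench_hnsw.py itself five times with the fixed parameters.

```python
import time
import numpy as np
import faiss  # assumes `pip install faiss-cpu`

d, nb, nq, k = 64, 20_000, 500, 10
rng = np.random.default_rng(0)
xb = rng.standard_normal((nb, d)).astype("float32")
xq = rng.standard_normal((nq, d)).astype("float32")
flat = faiss.IndexFlatL2(d)
flat.add(xb)
_, gt = flat.search(xq, 1)

recalls, latencies = [], []
for _ in range(5):                                  # five independent runs
    index = faiss.IndexHNSWFlat(d, 32)              # rebuild each run so construction
    index.hnsw.efConstruction = 40                  # non-determinism is included
    index.hnsw.efSearch = 100
    index.add(xb)
    t0 = time.perf_counter()
    _, labels = index.search(xq, k)
    latencies.append((time.perf_counter() - t0) * 1000 / nq)
    recalls.append(float((labels == gt[:, :1]).any(axis=1).mean()))

print(f"recall range : {min(recalls):.4f} .. {max(recalls):.4f}")
print(f"latency range: {min(latencies):.4f} .. {max(latencies):.4f} ms/query")
```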
diff --git a/benchmark/experimentation_bench/vector_index/ground_truth/q3.txt b/benchmark/experimentation_bench/vector_index/ground_truth/q3.txt index e69de29..88230e4 100644 --- a/benchmark/experimentation_bench/vector_index/ground_truth/q3.txt +++ b/benchmark/experimentation_bench/vector_index/ground_truth/q3.txt @@ -0,0 +1,3 @@ +# Answer: + +Memory usage remains same as efSearch varies. With M=16, setting efSearch to 64 or higher achieves at least 90% recall while minimizing memory usage. \ No newline at end of file diff --git a/benchmark/experimentation_bench/vector_index/ground_truth/q4.txt b/benchmark/experimentation_bench/vector_index/ground_truth/q4.txt index f307bfd..113729e 100644 --- a/benchmark/experimentation_bench/vector_index/ground_truth/q4.txt +++ b/benchmark/experimentation_bench/vector_index/ground_truth/q4.txt @@ -1,60 +1,3 @@ -Results: latency will reduce (may be up until a certain point based on the number of CPUs on the system) +# Answer: -Example results: - -- **Effect of Parallelism on Recall and Latency**: - - Increasing the number of threads using `omp_set_num_threads` significantly reduces the latency (time per query) for the SIFT1M dataset when using the HNSW index, with performance improving as the number of threads increases. - - There is a slight decrease in recall as the number of threads increases, indicating a trade-off between processing speed and search accuracy. - -#### Control Group (1 Thread) -- **Result 1**: - - Time per query: 0.420 ms - - Recall at 1 (R@1): 0.9949 - - Total execution time: 49743.701 ms - -- **Result 2**: - - Time per query: 0.411 ms - - Recall at 1 (R@1): 0.9951 - - Total execution time: 49560.055 ms - -#### Experimental Group -- **OMP_NUM_THREADS=2**: - - Result 1: - - Time per query: 0.197 ms - - Recall at 1 (R@1): 0.9449 - - Total execution time: 19257.041 ms - - Result 2: - - Time per query: 0.196 ms - - Recall at 1 (R@1): 0.9445 - - Total execution time: 19007.649 ms - -- **OMP_NUM_THREADS=4**: - - Result 1: - - Time per query: 0.197 ms - - Recall at 1 (R@1): 0.9439 - - Total execution time: 19189.449 ms - - Result 2: - - Time per query: 0.195 ms - - Recall at 1 (R@1): 0.9438 - - Total execution time: 19254.846 ms - -- **OMP_NUM_THREADS=8**: - - Result 1: - - Time per query: 0.197 ms - - Recall at 1 (R@1): 0.9434 - - Total execution time: 19218.741 ms - - Result 2: - - Time per query: 0.199 ms - - Recall at 1 (R@1): 0.9430 - - Total execution time: 19211.370 ms - -- **OMP_NUM_THREADS=16**: - - Result 1: - - Time per query: 0.198 ms - - Recall at 1 (R@1): 0.9423 - - Total execution time: 19121.308 ms - - Result 2: - - Time per query: 0.199 ms - - Recall at 1 (R@1): 0.9422 - - Total execution time: 19232.288 ms -" \ No newline at end of file +Latency will reduce (up until a certain point), while recall won't be affected much. \ No newline at end of file diff --git a/benchmark/experimentation_bench/vector_index/ground_truth/q5.txt b/benchmark/experimentation_bench/vector_index/ground_truth/q5.txt index a34d81c..07d82ea 100644 --- a/benchmark/experimentation_bench/vector_index/ground_truth/q5.txt +++ b/benchmark/experimentation_bench/vector_index/ground_truth/q5.txt @@ -1 +1,3 @@ +# Answer: + M=24 or 32 and efConstruction=50 or 60. 
Recall achieved is 1.0 diff --git a/benchmark/experimentation_bench/vector_index/ground_truth/q6.txt b/benchmark/experimentation_bench/vector_index/ground_truth/q6.txt index 16aa293..3fcc014 100644 --- a/benchmark/experimentation_bench/vector_index/ground_truth/q6.txt +++ b/benchmark/experimentation_bench/vector_index/ground_truth/q6.txt @@ -1,6 +1,10 @@ +# Answer: + 1. **Relationship Observed**: The index-building time increases with dataset size for both index types, but the extent of increase is significantly different between HNSW and IVF. 2. **Efficiency**: IVF appears to be more efficient than HNSW in terms of index-building time for the dataset sizes tested. - The IVF index-building times are significantly faster than the HNSW times, especially for larger datasets, which is consistent with the expectation that different FAISS index types have different efficiencies. +Approximate relative results you should observe: + HNSW results: ### Results Summary diff --git a/benchmark/experimentation_bench/vector_index/ground_truth/q7.txt b/benchmark/experimentation_bench/vector_index/ground_truth/q7.txt index 1a1b20a..87fec25 100644 --- a/benchmark/experimentation_bench/vector_index/ground_truth/q7.txt +++ b/benchmark/experimentation_bench/vector_index/ground_truth/q7.txt @@ -1,4 +1,3 @@ -Check that the script installs a package for memory measurement or uses some utility that can legitimately measure memory used. +# Answer: -HNSW should be better, something like this: -The HNSW index type achieves the required recall rate of at least 96% and does so with significantly less memory usage compared to the IVF index type. +The IVF index type achieves the required recall rate of at least 96% and does so with significantly less memory usage compared to the HNSW index type. diff --git a/benchmark/experimentation_bench/vector_index/ground_truth/q8.txt b/benchmark/experimentation_bench/vector_index/ground_truth/q8.txt index 780d9fa..8051105 100644 --- a/benchmark/experimentation_bench/vector_index/ground_truth/q8.txt +++ b/benchmark/experimentation_bench/vector_index/ground_truth/q8.txt @@ -1,2 +1,4 @@ -Check that multiple nprobes are actually measured in logs -As nprobe increases, recall improves, so does the latency. \ No newline at end of file +# Answer: + +Check that multiple nprobes are actually measured in logs. +As nprobe increases, recall improves, and so does the latency. \ No newline at end of file diff --git a/benchmark/experimentation_bench/vector_index/ground_truth/q9.txt b/benchmark/experimentation_bench/vector_index/ground_truth/q9.txt index 5c44779..0aac4f6 100644 --- a/benchmark/experimentation_bench/vector_index/ground_truth/q9.txt +++ b/benchmark/experimentation_bench/vector_index/ground_truth/q9.txt @@ -1,7 +1,25 @@ -efConstruction or M increase: -- recall, memory, latency increase +# Answer: -efSearch increase: -- recall, latency increase. Memory negligible change +Effect of increasing efConstruction on: -`efSearch` is most sensitive to recall improvements and latency increases during querying. `M` heavily influences memory usage and latency during both construction and querying. `efConstruction` impacts the recall and latency during construction. \ No newline at end of file +- recall: Recall improves moderately during index construction. + +- memory: Memory usage increases moderately during index construction. + +- latency: Latency increases during index construction; minimal impact during querying. 
+ +Effect of increasing efSearch on: + +- recall: Significant recall improvement during querying. + +- memory: Negligible change in memory usage. + +- latency: Notable increase in latency during querying. + +Effect of increasing M on: + +- recall: Recall improves moderately. + +- memory: Significant increase in memory usage notably during both construction and querying. + +- latency: Latency increases notably during both construction and querying. \ No newline at end of file diff --git a/benchmark/experimentation_bench/vector_index/q10_pq_synthetic.txt b/benchmark/experimentation_bench/vector_index/q10_pq_synthetic.txt index 3c68b6c..5c493be 100644 --- a/benchmark/experimentation_bench/vector_index/q10_pq_synthetic.txt +++ b/benchmark/experimentation_bench/vector_index/q10_pq_synthetic.txt @@ -1,5 +1,9 @@ The starter file can be found under "/starter_file/faiss" +- Note: if /starter_file/faiss does not exist: + 1. If /starter_file does not exist, feel free to replace /starter_file with some other directory that you have access to. Then, follow the next step. + 2. Download the repo from: https://github.com/facebookresearch/faiss + Instructions: 1. Set up the environment. diff --git a/benchmark/experimentation_bench/vector_index/q11_data_character.txt b/benchmark/experimentation_bench/vector_index/q11_data_character.txt index bebd789..6bc5e86 100644 --- a/benchmark/experimentation_bench/vector_index/q11_data_character.txt +++ b/benchmark/experimentation_bench/vector_index/q11_data_character.txt @@ -1,11 +1,15 @@ -The starter file can be found under "faiss" +The starter file can be found under "/starter_file/faiss" + +- Note: if /starter_file/faiss does not exist: + 1. If /starter_file does not exist, feel free to replace /starter_file with some other directory that you have access to. Then, follow the next step. + 2. Download the repo from: https://github.com/facebookresearch/faiss Instructions: 1. Set up the environment using `pip install faiss-cpu`. -2. Please use `np.random.random` to construct normal distribution synthetic dataset Refer to `faiss/tutorial/python/1-Flat.py` for code. +2. Please use `np.random.normal` to construct normal distribution synthetic dataset Refer to `/starter_file/faiss/tutorial/python/1-Flat.py` for code. -3. Read faiss/benchs/bench_hnsw.py. You will use this script for testing the HNSW index with the synthetic dataset. You will need to adapt the script to be able to use this synthetic dataset. +3. Read /starter_file/faiss/benchs/bench_hnsw.py. You will use this script for testing the HNSW index with the synthetic dataset. You will need to adapt the script to be able to use this synthetic dataset. Question: diff --git a/benchmark/experimentation_bench/vector_index/q12_configure_hnsw_synthetic.txt b/benchmark/experimentation_bench/vector_index/q12_configure_hnsw_synthetic.txt index a3f1ee6..f378737 100644 --- a/benchmark/experimentation_bench/vector_index/q12_configure_hnsw_synthetic.txt +++ b/benchmark/experimentation_bench/vector_index/q12_configure_hnsw_synthetic.txt @@ -1,5 +1,9 @@ The starter file can be found under "/starter_file/faiss" +- Note: if /starter_file/faiss does not exist: + 1. If /starter_file does not exist, feel free to replace /starter_file with some other directory that you have access to. Then, follow the next step. + 2. Download the repo from: https://github.com/facebookresearch/faiss + Instructions: 1. Set up the environment. 
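The q11_data_character question above asks for HNSW recall on a synthetic dataset drawn with `np.random.normal`. The helper below is not the adapted bench_hnsw.py, just a self-contained sketch of the same experiment: generate normal data with a given size, mean, and variance, take exact neighbors from a flat index as ground truth, and report HNSW recall.

```python
import numpy as np
import faiss  # assumes `pip install faiss-cpu`

def hnsw_recall_on_synthetic(n, mean, var, d=64, nq=500, k=10, seed=0):
    """Recall of the true nearest neighbor in the top-k for data drawn from N(mean, var)."""
    rng = np.random.default_rng(seed)
    xb = rng.normal(mean, np.sqrt(var), size=(n, d)).astype("float32")
    xq = rng.normal(mean, np.sqrt(var), size=(nq, d)).astype("float32")

    flat = faiss.IndexFlatL2(d)                    # exact search provides the ground truth
    flat.add(xb)
    _, gt = flat.search(xq, 1)

    index = faiss.IndexHNSWFlat(d, 32)
    index.hnsw.efConstruction = 40
    index.add(xb)
    _, labels = index.search(xq, k)
    return float((labels == gt[:, :1]).any(axis=1).mean())

# Vary one factor at a time, holding the others fixed.
for n in (10_000, 100_000):
    print("size", n, "->", hnsw_recall_on_synthetic(n, mean=0.0, var=1.0))
for mean in (0.0, 100.0, 1000.0):
    print("mean", mean, "->", hnsw_recall_on_synthetic(50_000, mean=mean, var=1.0))
for var in (1.0, 10.0, 100.0):
    print("var", var, "->", hnsw_recall_on_synthetic(50_000, mean=0.0, var=var))
```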
diff --git a/benchmark/experimentation_bench/vector_index/q13_index_choice.txt b/benchmark/experimentation_bench/vector_index/q13_index_choice.txt index 4925754..6070a81 100644 --- a/benchmark/experimentation_bench/vector_index/q13_index_choice.txt +++ b/benchmark/experimentation_bench/vector_index/q13_index_choice.txt @@ -1,11 +1,15 @@ -The starter file can be found under "faiss" +The starter file can be found under "/starter_file/faiss" + +- Note: if /starter_file/faiss does not exist: + 1. If /starter_file does not exist, feel free to replace /starter_file with some other directory that you have access to. Then, follow the next step. + 2. Download the repo from: https://github.com/facebookresearch/faiss Instructions: 1. Set up the environment. `pip install faiss-cpu` 2. Please download the sift1M dataset from ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz if it does not exist, and unzip it to the subdirectory sift1M, and move it to `data` directory. Make sure while downloading that quiet flag is used (e.g., --no-verbose for wget), otherwise we will exceed context length. -3. Read faiss/benchs/bench_hnsw.py and faiss/benchs/bench_index_flat. You will use this script for testing the HNSW index with SIFT1M dataset. Make sure to provide all input arguments required by the file. +3. Read /starter_file/faiss/benchs/bench_hnsw.py and /starter_file/faiss/benchs/bench_index_flat. You will use this script for testing the HNSW index with SIFT1M dataset. Make sure to provide all input arguments required by the file. Question: diff --git a/benchmark/experimentation_bench/vector_index/q14_add_data.txt b/benchmark/experimentation_bench/vector_index/q14_add_data.txt index 9ded1b8..5874dea 100644 --- a/benchmark/experimentation_bench/vector_index/q14_add_data.txt +++ b/benchmark/experimentation_bench/vector_index/q14_add_data.txt @@ -1,4 +1,8 @@ -The starter file can be found under "faiss" +The starter file can be found under "/starter_file/faiss" + +- Note: if /starter_file/faiss does not exist: + 1. If /starter_file does not exist, feel free to replace /starter_file with some other directory that you have access to. Then, follow the next step. + 2. Download the repo from: https://github.com/facebookresearch/faiss Instructions: 1. Set up the environment. `pip install faiss-cpu` diff --git a/benchmark/experimentation_bench/vector_index/q15_error_range.txt b/benchmark/experimentation_bench/vector_index/q15_error_range.txt index bc86dc5..466fa3c 100644 --- a/benchmark/experimentation_bench/vector_index/q15_error_range.txt +++ b/benchmark/experimentation_bench/vector_index/q15_error_range.txt @@ -1,5 +1,9 @@ The starter file can be found under "/starter_file/faiss" +- Note: if /starter_file/faiss does not exist: + 1. If /starter_file does not exist, feel free to replace /starter_file with some other directory that you have access to. Then, follow the next step. + 2. Download the repo from: https://github.com/facebookresearch/faiss + Instructions: 1. Set up the environment. `pip install faiss-cpu` diff --git a/benchmark/experimentation_bench/vector_index/q1_hnsw_lat.txt b/benchmark/experimentation_bench/vector_index/q1_hnsw_lat.txt index 96dae6a..b05c22c 100644 --- a/benchmark/experimentation_bench/vector_index/q1_hnsw_lat.txt +++ b/benchmark/experimentation_bench/vector_index/q1_hnsw_lat.txt @@ -1,5 +1,9 @@ The starter file can be found under "/starter_file/faiss" +- Note: if /starter_file/faiss does not exist: + 1. 
If /starter_file does not exist, feel free to replace /starter_file with some other directory that you have access to. Then, follow the next step. + 2. Download the repo from: https://github.com/facebookresearch/faiss + Instructions: 1. Set up the environment. @@ -7,7 +11,7 @@ Instructions: 3. Read /starter_file/faiss/benchs/bench_hnsw.py. You will use this script for testing the HNSW index with SIFT1M dataset. Make sure to provide all input arguments required by the file. There are 3 of them. -Question: +Question: What is the relationship between query latency for the SIFT1M dataset and efSearch values with the HNSW index? Use a fixed value of k=10, M=32, efConstruction=40. diff --git a/benchmark/experimentation_bench/vector_index/q2_hnsw_num_neigh.txt b/benchmark/experimentation_bench/vector_index/q2_hnsw_num_neigh.txt index 48bff1f..7273cfe 100644 --- a/benchmark/experimentation_bench/vector_index/q2_hnsw_num_neigh.txt +++ b/benchmark/experimentation_bench/vector_index/q2_hnsw_num_neigh.txt @@ -1,5 +1,9 @@ The starter file can be found under "/starter_file/faiss" +- Note: if /starter_file/faiss does not exist: + 1. If /starter_file does not exist, feel free to replace /starter_file with some other directory that you have access to. Then, follow the next step. + 2. Download the repo from: https://github.com/facebookresearch/faiss + Instructions: 1. Set up the environment. diff --git a/benchmark/experimentation_bench/vector_index/q3_hnsw_constraints.txt b/benchmark/experimentation_bench/vector_index/q3_hnsw_constraints.txt index 43bbd8c..6317335 100644 --- a/benchmark/experimentation_bench/vector_index/q3_hnsw_constraints.txt +++ b/benchmark/experimentation_bench/vector_index/q3_hnsw_constraints.txt @@ -1,5 +1,9 @@ The starter file can be found under "/starter_file/faiss" +- Note: if /starter_file/faiss does not exist: + 1. If /starter_file does not exist, feel free to replace /starter_file with some other directory that you have access to. Then, follow the next step. + 2. Download the repo from: https://github.com/facebookresearch/faiss + Instructions: 1. Set up the environment. diff --git a/benchmark/experimentation_bench/vector_index/q4_hnsw_parallelism.txt b/benchmark/experimentation_bench/vector_index/q4_hnsw_parallelism.txt index efa891a..92b1634 100644 --- a/benchmark/experimentation_bench/vector_index/q4_hnsw_parallelism.txt +++ b/benchmark/experimentation_bench/vector_index/q4_hnsw_parallelism.txt @@ -1,5 +1,9 @@ The starter file can be found under "/starter_file/faiss" +- Note: if /starter_file/faiss does not exist: + 1. If /starter_file does not exist, feel free to replace /starter_file with some other directory that you have access to. Then, follow the next step. + 2. Download the repo from: https://github.com/facebookresearch/faiss + Instructions: 1. Set up the environment. diff --git a/benchmark/experimentation_bench/vector_index/q5_hnsw_highest_recall.txt b/benchmark/experimentation_bench/vector_index/q5_hnsw_highest_recall.txt index d52ce1d..0f41747 100644 --- a/benchmark/experimentation_bench/vector_index/q5_hnsw_highest_recall.txt +++ b/benchmark/experimentation_bench/vector_index/q5_hnsw_highest_recall.txt @@ -1,5 +1,9 @@ The starter file can be found under "/starter_file/faiss" +- Note: if /starter_file/faiss does not exist: + 1. If /starter_file does not exist, feel free to replace /starter_file with some other directory that you have access to. Then, follow the next step. + 2. Download the repo from: https://github.com/facebookresearch/faiss + Instructions: 1. 
Set up the environment. diff --git a/benchmark/experimentation_bench/vector_index/q6_diff_dataset_index.txt b/benchmark/experimentation_bench/vector_index/q6_diff_dataset_index.txt index d3e5de6..3dde59f 100644 --- a/benchmark/experimentation_bench/vector_index/q6_diff_dataset_index.txt +++ b/benchmark/experimentation_bench/vector_index/q6_diff_dataset_index.txt @@ -1,5 +1,9 @@ The starter file can be found under "faiss" +- Note: if /starter_file/faiss does not exist: + 1. If /starter_file does not exist, feel free to replace /starter_file with some other directory that you have access to. Then, follow the next step. + 2. Download the repo from: https://github.com/facebookresearch/faiss + Instructions: 1. Set up the environment. @@ -9,4 +13,4 @@ Instructions: Question: -What is the relationship between dataset size and index-building time for different FAISS index types (e.g., IVF, HNSW)? For hnsw, the default settings are a fixed k value of 10, M value of 32, and efConstruction value of 40. For ivf, use /starter_file/faiss/benchs/bench_ivf_fastscan.py. hnsw should be the control group, and ivf the experimental group. \ No newline at end of file +What is the relationship between dataset size and index-building time? Determine this for two different FAISS index types: IVF, and HNSW. For HNSW, use the default settings which are a fixed k value of 10, M value of 32, and efConstruction value of 40. For ivf, use /starter_file/faiss/benchs/bench_ivf_fastscan.py. When constructing the experimental plan, HNSW should be the control group, and IVF the experimental group. \ No newline at end of file diff --git a/benchmark/experimentation_bench/vector_index/q7_hnsw_b.txt b/benchmark/experimentation_bench/vector_index/q7_hnsw_b.txt index 42277df..1ff7d82 100644 --- a/benchmark/experimentation_bench/vector_index/q7_hnsw_b.txt +++ b/benchmark/experimentation_bench/vector_index/q7_hnsw_b.txt @@ -1,8 +1,11 @@ Instructions: 1. The starter file can be found under "/starter_file/faiss". Set up the environment. +- Note: if /starter_file/faiss does not exist: + 1. If /starter_file does not exist, feel free to replace /starter_file with some other directory that you have access to. Then, follow the next step. + 2. Download the repo from: https://github.com/facebookresearch/faiss 2. Please download the sift1M dataset from ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz if it does not exist, and unzip it to the subdirectory sift1M, and move it to `data` directory. Make sure while downloading that quiet flag is used (e.g., --no-verbose for wget), otherwise we will exceed context length. Question: -Which of these 2 index types, hnsw and ivf, requires the least amount of memory to run and can reach a recall rate of at least 96%, using their default settings? For hnsw, use /starter_file/faiss/benchs/bench_hnsw.py, where the default settings are a fixed k value of 10, M value of 32, and efConstruction value of 40. For ivf, use /starter_file/faiss/benchs/bench_ivf_fastscan.py. hnsw should be the control group, and ivf the experimental group. \ No newline at end of file +Which of these 2 index types, hnsw and ivf, requires the least amount of memory to run and can reach a recall rate of at least 96%, using their default settings? For hnsw, use /starter_file/faiss/benchs/bench_hnsw.py, where the default settings are a fixed k value of 10, M value of 32, and efConstruction value of 40. For ivf, use /starter_file/faiss/benchs/bench_ivf_fastscan.py. 
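The q6_diff_dataset_index question above compares index-building time for HNSW and IVF as dataset size grows. A rough sketch of that timing comparison follows; it uses a plain `IndexIVFFlat` rather than the IVF-fastscan path in bench_ivf_fastscan.py and random data instead of ANN_SIFT10K and ANN_SIFT1M, so only the relative trend is meaningful.

```python
import time
import numpy as np
import faiss  # assumes `pip install faiss-cpu`

def build_times(nb, d=64, nlist=256, seed=0):
    """Return (HNSW build time, IVF train+add time) in seconds for nb random vectors."""
    xb = np.random.default_rng(seed).standard_normal((nb, d)).astype("float32")

    t0 = time.perf_counter()
    hnsw = faiss.IndexHNSWFlat(d, 32)
    hnsw.hnsw.efConstruction = 40
    hnsw.add(xb)
    t_hnsw = time.perf_counter() - t0

    t0 = time.perf_counter()
    quantizer = faiss.IndexFlatL2(d)
    ivf = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2)
    ivf.train(xb)                                  # IVF "build" = train coarse quantizer, then add
    ivf.add(xb)
    t_ivf = time.perf_counter() - t0
    return t_hnsw, t_ivf

for nb in (10_000, 100_000):                       # stand-ins for the 10K and 1M datasets
    t_hnsw, t_ivf = build_times(nb)
    print(f"nb={nb:7d}  HNSW build {t_hnsw:.2f}s  IVF build {t_ivf:.2f}s")
```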
\ No newline at end of file diff --git a/benchmark/experimentation_bench/vector_index/q8_ivf_tradeoff.txt b/benchmark/experimentation_bench/vector_index/q8_ivf_tradeoff.txt index 4065dff..308ca61 100644 --- a/benchmark/experimentation_bench/vector_index/q8_ivf_tradeoff.txt +++ b/benchmark/experimentation_bench/vector_index/q8_ivf_tradeoff.txt @@ -1,5 +1,9 @@ 1. The starter file can be found under "/starter_file/faiss". +- Note: if /starter_file/faiss does not exist: + 1. If /starter_file does not exist, feel free to replace /starter_file with some other directory that you have access to. Then, follow the next step. + 2. Download the repo from: https://github.com/facebookresearch/faiss + Instructions: 1. Set up the environment. diff --git a/benchmark/experimentation_bench/vector_index/q9_sensitivity.txt b/benchmark/experimentation_bench/vector_index/q9_sensitivity.txt index edbdcf1..bc52b03 100644 --- a/benchmark/experimentation_bench/vector_index/q9_sensitivity.txt +++ b/benchmark/experimentation_bench/vector_index/q9_sensitivity.txt @@ -1,11 +1,15 @@ -The starter file can be found under "faiss" +The starter file can be found under "/starter_file/faiss" + +- Note: if /starter_file/faiss does not exist: + 1. If /starter_file does not exist, feel free to replace /starter_file with some other directory that you have access to. Then, follow the next step. + 2. Download the repo from: https://github.com/facebookresearch/faiss Instructions: 1. Set up the environment. `pip install faiss-cpu` 2. Please download the sift1M dataset from ftp://ftp.irisa.fr/local/texmex/corpus/sift.tar.gz if it does not exist, and unzip it to the subdirectory sift1M, and move it to `data` directory. Make sure while downloading that quiet flag is used (e.g., --no-verbose for wget), otherwise we will exceed context length. -3. Read faiss/benchs/bench_hnsw.py. You will use this script for testing the HNSW index with SIFT1M dataset. Make sure to provide all input arguments required by the file. There are 3 of them. +3. Read /starter_file/faiss/benchs/bench_hnsw.py. You will use this script for testing the HNSW index with SIFT1M dataset. Make sure to provide all input arguments required by the file. There are 3 of them. Question: diff --git a/starter_file/faiss b/starter_file/faiss index 657c563..64ade50 160000 --- a/starter_file/faiss +++ b/starter_file/faiss @@ -1 +1 @@ -Subproject commit 657c563604c774461aed0394ae99210713145e03 +Subproject commit 64ade5050b38d9a47a443db9ec24a9de786637f3
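Several of the designs above list memory usage as a dependent variable without pinning down how it is measured. If the benchmark script does not report it directly, one rough proxy is the resident-set-size delta around index construction; the `psutil` dependency and the RSS approach here are assumptions, not something the ground truth files mandate.

```python
import numpy as np
import faiss    # assumes `pip install faiss-cpu`
import psutil   # assumes `pip install psutil`; the RSS delta is only a rough proxy

def rss_mib():
    return psutil.Process().memory_info().rss / 2**20

d, nb = 64, 200_000
xb = np.random.default_rng(0).standard_normal((nb, d)).astype("float32")

before = rss_mib()
index = faiss.IndexHNSWFlat(d, 32)   # rerun the whole script with M=16/24/32 to compare;
index.hnsw.efConstruction = 40       # measuring several builds in one process is unreliable
index.add(xb)                        # because freed pages may be reused without RSS changing
print(f"approx. HNSW index memory: {rss_mib() - before:.1f} MiB")
```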