@@ -187,6 +187,9 @@ typedef void * thread_ret_t;
187187
188188typedef pthread_t ggml_thread_t ;
189189
190+ #define GGML_THREADPOOL_N_THREADS_MASK (0xffffU) // low bits of threadpool->n_graph: number of threads active for the current graph
191+ #define GGML_THREADPOOL_N_THREADS_BITS (16) // shift separating the graph counter (upper bits of n_graph) from the thread count
192+
190193#if defined(__APPLE__ )
191194#include <unistd.h>
192195#include <mach/mach.h>
@@ -449,20 +452,18 @@ struct ggml_threadpool {
449452 struct ggml_cplan * cplan ;
450453
451454 // synchronization primitives
452- atomic_int n_graph ; // incremented when there is work to be done (i.e each graph)
455+ atomic_int n_graph ; // updated for each new graph (i.e. when there is work to be done); packs the graph counter (upper bits) and the active thread count (low GGML_THREADPOOL_N_THREADS_BITS bits)
453456 atomic_int GGML_CACHE_ALIGN n_barrier ;
454457 atomic_int GGML_CACHE_ALIGN n_barrier_passed ;
455458 atomic_int GGML_CACHE_ALIGN current_chunk ; // currently processing chunk during Mat_Mul, shared between all the threads.
456459
457460 // these are atomic as an annotation for thread-sanitizer
458461 atomic_bool stop ; // Used for stopping the threadpool altogether
459462 atomic_bool pause ; // Used for pausing the threadpool or individual threads
460- atomic_int abort ; // Used for aborting processing of a graph
463+ atomic_int abort ; // Used for aborting processing of a graph
461464
462465 struct ggml_compute_state * workers ; // per thread state
463- int n_threads_max ; // number of threads in the pool
464- atomic_int n_threads_cur ; // number of threads used in the current graph
465-
466+ int n_threads ; // Number of threads in the pool
466467 int32_t prio ; // Scheduling priority
467468 uint32_t poll ; // Polling level (0 - no polling)
468469
@@ -539,7 +540,7 @@ struct ggml_state {
539540static struct ggml_state g_state = {0 };
540541
541542void ggml_barrier (struct ggml_threadpool * tp ) {
542- int n_threads = atomic_load_explicit (& tp -> n_threads_cur , memory_order_relaxed );
543+ int n_threads = atomic_load_explicit (& tp -> n_graph , memory_order_relaxed ) & GGML_THREADPOOL_N_THREADS_MASK ;
543544 if (n_threads == 1 ) {
544545 return ;
545546 }
@@ -556,7 +557,7 @@ void ggml_barrier(struct ggml_threadpool * tp) {
556557 // last thread
557558 atomic_store_explicit (& tp -> n_barrier , 0 , memory_order_relaxed );
558559
559- // exit barrier (fill seq-cst fence)
560+ // exit barrier (full seq-cst fence)
560561 atomic_fetch_add_explicit (& tp -> n_barrier_passed , 1 , memory_order_seq_cst );
561562 return ;
562563 }
@@ -2628,7 +2629,7 @@ static void ggml_thread_cpumask_next(const bool * global_mask, bool * local_mask
26282629void ggml_threadpool_free (struct ggml_threadpool * threadpool ) {
26292630 if (!threadpool ) return ;
26302631
2631- const int n_threads = threadpool -> n_threads_max ;
2632+ const int n_threads = threadpool -> n_threads ;
26322633
26332634#ifndef GGML_USE_OPENMP
26342635 struct ggml_compute_state * workers = threadpool -> workers ;
@@ -2704,7 +2705,7 @@ struct ggml_cplan ggml_graph_plan(
27042705 //GGML_PRINT_DEBUG("Threadpool is not specified. Will create a disposable threadpool : n_threads %d\n", n_threads);
27052706 }
27062707 if (n_threads <= 0 ) {
2707- n_threads = threadpool ? threadpool -> n_threads_max : GGML_DEFAULT_N_THREADS ;
2708+ n_threads = threadpool ? threadpool -> n_threads : GGML_DEFAULT_N_THREADS ;
27082709 }
27092710
27102711#if defined(__EMSCRIPTEN__ ) && !defined(__EMSCRIPTEN_PTHREADS__ )
@@ -2912,12 +2913,14 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
29122913
29132914 struct ggml_compute_params params = {
29142915 /*.ith =*/ state -> ith ,
2915- /*.nth =*/ atomic_load_explicit (& tp -> n_threads_cur , memory_order_relaxed ),
2916+ /*.nth =*/ atomic_load_explicit (& tp -> n_graph , memory_order_relaxed ) & GGML_THREADPOOL_N_THREADS_MASK ,
29162917 /*.wsize =*/ cplan -> work_size ,
29172918 /*.wdata =*/ cplan -> work_data ,
29182919 /*.threadpool=*/ tp ,
29192920 };
29202921
2922+ GGML_PRINT_DEBUG ("thread #%d compute-start cplan %p last-graph %d \n" , state -> ith , cplan , state -> last_graph );
2923+
29212924 for (int node_n = 0 ; node_n < cgraph -> n_nodes && atomic_load_explicit (& tp -> abort , memory_order_relaxed ) != node_n ; node_n ++ ) {
29222925 struct ggml_tensor * node = cgraph -> nodes [node_n ];
29232926
@@ -2939,34 +2942,32 @@ static thread_ret_t ggml_graph_compute_thread(void * data) {
29392942 }
29402943 }
29412944
2945+ GGML_PRINT_DEBUG ("thread #%d compute-done cplan %p last-graph %d \n" , state -> ith , cplan , state -> last_graph );
2946+
29422947 ggml_barrier (state -> threadpool );
29432948
29442949 return 0 ;
29452950}
29462951
29472952#ifndef GGML_USE_OPENMP
29482953
2949- // check if thread is active
2950- static inline bool ggml_graph_compute_thread_active (struct ggml_compute_state * state ) {
2951- struct ggml_threadpool * threadpool = state -> threadpool ;
2952- int n_threads = atomic_load_explicit (& threadpool -> n_threads_cur , memory_order_relaxed );
2953- return (state -> ith < n_threads );
2954- }
2955-
29562954// check if thread is ready to proceed (exit from polling or sleeping)
2955+ // returns true when the polling/sleeping loop should exit; state->pending is set only if this thread has work in the new graph
29572956static inline bool ggml_graph_compute_thread_ready (struct ggml_compute_state * state ) {
29582957 struct ggml_threadpool * threadpool = state -> threadpool ;
29592958
29602959 if (state -> pending || threadpool -> stop || threadpool -> pause ) { return true; } // work already pending, or the pool is stopping/pausing
29612960
29622961 // check for new graph/work: any change of the packed n_graph value means a new kickoff happened
2963- int new_graph = atomic_load_explicit (& threadpool -> n_graph , memory_order_relaxed );
2964- if (new_graph != state -> last_graph ) {
2965- state -> pending = ggml_graph_compute_thread_active (state );
2966- state -> last_graph = new_graph ;
2962+ int n_graph = atomic_load_explicit (& threadpool -> n_graph , memory_order_relaxed );
2963+ int n_threads = n_graph & GGML_THREADPOOL_N_THREADS_MASK ; // active thread count is carried in the low bits of n_graph
2964+ if (n_graph != state -> last_graph ) {
2965+ state -> pending = (state -> ith < n_threads ); // only threads with an index below the active count take part in this graph
2966+ state -> last_graph = n_graph ;
2967+ return true; // report ready even when this thread is inactive (pending may be false)
29672968 }
29682969
2969- return state -> pending ;
2970+ return false ; // n_graph unchanged since the last check
29702971}
29712972
29722973// sync thread state after polling
@@ -2983,11 +2984,6 @@ static inline void ggml_graph_compute_thread_sync(struct ggml_compute_state * st
29832984static inline bool ggml_graph_compute_poll_for_work (struct ggml_compute_state * state ) {
29842985 struct ggml_threadpool * threadpool = state -> threadpool ;
29852986
2986- // Skip polling for unused threads
2987- if (!ggml_graph_compute_thread_active (state )) {
2988- return state -> pending ;
2989- }
2990-
29912987 // This seems to make 0 ... 100 a decent range for polling level across modern processors.
29922988 // Perhaps, we can adjust it dynamically based on load and things.
29932989 const uint64_t n_rounds = 1024UL * 128 * threadpool -> poll ;
@@ -3049,7 +3045,6 @@ static thread_ret_t ggml_graph_compute_secondary_thread(void* data) {
30493045 ggml_graph_compute_check_for_work (state );
30503046 if (state -> pending ) {
30513047 state -> pending = false;
3052-
30533048 ggml_graph_compute_thread (state );
30543049 }
30553050 }
@@ -3064,14 +3059,15 @@ static void ggml_graph_compute_kickoff(struct ggml_threadpool * threadpool, int
30643059
30653060 ggml_mutex_lock (& threadpool -> mutex );
30663061
3067- GGML_PRINT_DEBUG ("threadpool: n_threads_cur %d n_threads %d\n" , threadpool -> n_threads_cur , n_threads );
3062+ // Update the number of active threads and the graph count
3063+ int n_graph = atomic_load_explicit (& threadpool -> n_graph , memory_order_relaxed ) >> GGML_THREADPOOL_N_THREADS_BITS ;
3064+ n_graph = ((n_graph + 1 ) << GGML_THREADPOOL_N_THREADS_BITS ) | (n_threads & GGML_THREADPOOL_N_THREADS_MASK );
30683065
3069- // Update the number of active threads
3070- atomic_store_explicit (& threadpool -> n_threads_cur , n_threads , memory_order_relaxed );
3066+ GGML_PRINT_DEBUG ("compute-kickoff: n_threads %d n_graph %d\n" , n_threads , n_graph );
30713067
30723068 // Indicate the graph is ready to be processed
30733069 // We need the full seq-cst fence here because of the polling threads (used in thread_sync)
3074- atomic_fetch_add_explicit (& threadpool -> n_graph , 1 , memory_order_seq_cst );
3070+ atomic_store_explicit (& threadpool -> n_graph , n_graph , memory_order_seq_cst );
30753071
30763072 if (threadpool -> pause ) {
30773073 // Update main thread prio and affinity to match the threadpool settings
@@ -3109,8 +3105,7 @@ static struct ggml_threadpool * ggml_threadpool_new_impl(
31093105 threadpool -> pause = tpp -> paused ;
31103106 threadpool -> abort = -1 ;
31113107 threadpool -> workers = NULL ;
3112- threadpool -> n_threads_max = tpp -> n_threads ;
3113- threadpool -> n_threads_cur = tpp -> n_threads ;
3108+ threadpool -> n_threads = tpp -> n_threads ;
31143109 threadpool -> poll = tpp -> poll ;
31153110 threadpool -> prio = tpp -> prio ;
31163111 threadpool -> ec = GGML_STATUS_SUCCESS ;
@@ -3205,7 +3200,7 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
32053200 {
32063201 // update the number of threads from the actual number of threads that we got from OpenMP
32073202 n_threads = omp_get_num_threads ();
3208- atomic_store_explicit (& threadpool -> n_threads_cur , n_threads , memory_order_relaxed );
3203+ atomic_store_explicit (& threadpool -> n_graph , n_threads , memory_order_relaxed );
32093204 }
32103205
32113206 // Apply thread CPU mask and priority
@@ -3218,13 +3213,13 @@ enum ggml_status ggml_graph_compute(struct ggml_cgraph * cgraph, struct ggml_cpl
32183213 ggml_graph_compute_thread (& threadpool -> workers [ith ]);
32193214 }
32203215 } else {
3221- atomic_store_explicit (& threadpool -> n_threads_cur , 1 , memory_order_relaxed );
3216+ atomic_store_explicit (& threadpool -> n_graph , 1 , memory_order_relaxed );
32223217 ggml_graph_compute_thread (& threadpool -> workers [0 ]);
32233218 }
32243219#else
3225- if (n_threads > threadpool -> n_threads_max ) {
3226- GGML_LOG_WARN ("cplan requested more threads (%d) than available (%d)\n" , n_threads , threadpool -> n_threads_max );
3227- n_threads = threadpool -> n_threads_max ;
3220+ if (n_threads > threadpool -> n_threads ) {
3221+ GGML_LOG_WARN ("cplan requested more threads (%d) than available (%d)\n" , n_threads , threadpool -> n_threads );
3222+ n_threads = threadpool -> n_threads ;
32283223 }
32293224
32303225 // Kick all threads to start the new graph
0 commit comments