After watching this talk (Dennis Gustafsson – Parallelizing the physics solver – BSC 2025, https://www.youtube.com/watch?v=Kvsvd67XUKw), I decided to give up on job systems for now and try a simple parallel for instead. Roast the code if you want, as I'm sure it has issues:
package parallel
import "core:fmt"
import "core:sync"
import "core:thread"
import "core:time"
// Short local aliases for fmt.println and fmt.printfln.
pn :: fmt.println
pfn :: fmt.printfln
// Shared state of one parallel-for pool. The thread that calls `loop` and the
// worker threads running `thread_proc` all operate on the same Instance.
Instance :: struct {
	// Opaque pointer handed to the callback through context.user_ptr.
	userdata:     rawptr,
	// Procedure invoked once per work item.
	callback:     proc(),
	// Next work-item index to claim; bumped atomically in `do_work`.
	id:           int,
	// Number of work items in the current batch.
	count:        int,
	// Number of worker threads expected to be running `thread_proc`.
	thread_count: int,
	// Set to thread_count by `loop`, decremented by each worker pass, polled by `join`.
	running:      int,
	// Workers block on this; `loop` and `finish` post thread_count tokens to it.
	sema:         sync.Sema,
	// Set by `finish`; a worker that wakes up and sees it returns.
	finished:     bool,
}
// Records how many worker threads will serve this instance.
//
// This does not create any threads; the caller starts `thread_count` threads
// running `thread_proc` itself.
//
// Inputs:
// - self: instance to configure
// - thread_count: number of worker threads; negative values are treated as 0
init :: proc(self: ^Instance, thread_count: int) {
	// thread_count is later passed to sync.post and used to seed `running`,
	// so a negative value must never be stored.
	self.thread_count = max(thread_count, 0)
}
// Runs `callback` once for every index in [0, count), spread across the
// worker threads and the calling thread, and returns when the batch is done.
//
// Inside the callback, the item index and `data` are available through
// context.user_index / context.user_ptr (see `pull`).
//
// Inputs:
// - self: pool instance whose workers are blocked in `thread_proc`
// - count: number of work items; values <= 0 make this a no-op
// - data: opaque pointer forwarded to every callback invocation
// - callback: procedure executed once per work item
//
// NOTE: with thread_count > 0, calling this after `finish` never returns,
// because the workers that decrement `running` have already exited.
loop :: proc(self: ^Instance, count: int, data: rawptr, callback: proc()) {
	// Empty batch: do not wake every worker just to spin-wait for them.
	if count <= 0 do return

	self.userdata = data
	self.callback = callback
	self.count = count
	// `id` and `running` are read/modified atomically elsewhere, so they are
	// written atomically here too instead of with plain stores.
	sync.atomic_store_explicit(&self.id, 0, .Relaxed)
	sync.atomic_store_explicit(&self.running, self.thread_count, .Relaxed)
	// One token per worker thread.
	sync.post(&self.sema, self.thread_count)
	// The calling thread claims work items as well, then waits for the workers.
	do_work(self)
	join(self)
}
// Spins until every worker has finished its pass over the current batch.
//
// `running` is set to thread_count by `loop` and decremented with release
// ordering by each worker once it has run out of work items to claim.
@(private)
join :: proc(self: ^Instance) {
	for {
		// Acquire pairs with the workers' release decrement, so everything the
		// callbacks wrote is visible to the caller once this loop exits.
		if sync.atomic_load_explicit(&self.running, .Acquire) <= 0 do break
		// Hint to the CPU that this is a busy-wait loop.
		sync.cpu_relax()
	}
}
// Asks the worker threads to exit: raises `finished`, then posts one token
// per worker so each thread blocked on the semaphore wakes up, sees the flag
// and returns from `thread_proc`.
finish :: proc(self: ^Instance) {
	worker_count := self.thread_count
	// The flag must be set before any worker is woken.
	self.finished = true
	sync.post(&self.sema, worker_count)
}
// Claims work-item indices one at a time from the shared counter and runs the
// callback for each, until the counter reaches `count`.
//
// Before every call the item index is stored in context.user_index and the
// user data pointer in context.user_ptr, which is where `pull` reads them.
@(private)
do_work :: proc(self: ^Instance) {
	for {
		// atomic_add returns the previous value, i.e. the index this thread now owns.
		item := sync.atomic_add_explicit(&self.id, 1, .Relaxed)
		if !(item < self.count) {
			break
		}
		context.user_ptr = self.userdata
		context.user_index = item
		self.callback()
	}
}
// Fetches the current work item from the implicit context.
//
// Returns context.user_index and context.user_ptr reinterpreted as ^T.
// The pointer is not checked; T must match what was stored in user_ptr.
pull :: proc($T: typeid) -> (int, ^T) {
	return context.user_index, cast(^T)context.user_ptr
}
// Entry point for a worker thread. The Instance is taken from
// context.user_ptr, so the thread must be started with a context whose
// user_ptr points at the pool.
//
// Each iteration waits for a semaphore token, exits if `finished` is set,
// otherwise drains work items and then decrements `running`.
thread_proc :: proc() {
	instance := cast(^Instance)context.user_ptr
	for {
		sync.wait(&instance.sema)
		if instance.finished {
			return
		}
		do_work(instance)
		// Release ordering on the "this worker is done" decrement.
		sync.atomic_sub_explicit(&instance.running, 1, .Release)
	}
}
// Demo: starts four workers, runs two parallel loops over a 16-element int
// array (printing the array after each), then shuts the workers down.
@(private)
example :: proc() {
	THREAD_COUNT :: 4
	SIZE :: THREAD_COUNT * 4
	Ints :: [SIZE]int

	// Stand-in for real per-item work.
	work :: proc() {
		time.sleep(50 * time.Microsecond)
	}

	pool: Instance
	init(&pool, THREAD_COUNT)

	// Workers find the pool through the context they are started with.
	context.user_ptr = &pool
	workers: [THREAD_COUNT]^thread.Thread
	for i in 0 ..< THREAD_COUNT {
		context.user_index = i + 1
		workers[i] = thread.create_and_start(thread_proc, context)
	}

	data: Ints

	// First pass: data[i] = i + 1.
	loop(&pool, SIZE, &data, proc() {
		index, values := pull(Ints)
		values[index] = index + 1
		work()
	})
	pn(data)

	data[0] *= -20

	// Second pass: data[i] -= 2 * i.
	loop(&pool, SIZE, &data, proc() {
		index, values := pull(Ints)
		values[index] -= 2 * index
		work()
	})
	pn(data)

	finish(&pool)
	for worker in workers {
		thread.destroy(worker)
	}
}
// Program entry point: runs the parallel-for demo.
main :: proc() {
example()
}