988 Thread Local

Fundamental · Functional Programming · Tutorial

The Problem

Demonstrate thread-local storage (TLS) in Rust using the thread_local! macro. Each thread gets its own independent copy of the storage — no locks or synchronization needed. Show a thread-local counter where threads set independent values, and a thread-local accumulator that aggregates per-thread sums without shared state.
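Before the full example, here is a minimal sketch of the core mechanic, using `Cell` for a `Copy` type (the `TICK` name is illustrative, not from the example below):

```rust
use std::cell::Cell;
use std::thread;

thread_local! {
    // Each thread that touches TICK gets its own zero-initialized copy.
    static TICK: Cell<u32> = const { Cell::new(0) };
}

fn main() {
    TICK.with(|t| t.set(100)); // mutate the main thread's copy

    let child = thread::spawn(|| {
        // The child starts from a fresh copy: 0, not main's 100.
        TICK.with(|t| t.set(t.get() + 1));
        TICK.with(|t| t.get())
    })
    .join()
    .unwrap();

    // main = 100, child = 1
    println!("main = {}, child = {}", TICK.with(|t| t.get()), child);
}
```

No `Mutex`, no `Arc`: the two threads write to the same static name but never to the same storage.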

🎯 Learning Outcomes

  • Declare thread-local storage with thread_local! { static NAME: RefCell<T> = ... }
  • Access TLS via .with(|cell| ...); the closure receives a reference to the thread-local value
  • Use RefCell for interior mutability: borrow() for read, borrow_mut() for write
  • Understand that thread-local values never cross threads: .with hands out only a short-lived reference, so a value cannot escape the thread that owns it
  • Recognize the use cases: per-thread performance counters, per-request contexts, PRNG state

Code Example

    #![allow(clippy::all)]
    // 988: Thread-Local Storage
    // Rust: thread_local! macro — each thread gets its own instance
    
    use std::cell::RefCell;
    use std::sync::{Arc, Mutex};
    use std::thread;
    
    // --- Approach 1: thread_local! with RefCell (simple counter) ---
    thread_local! {
        static COUNTER: RefCell<i32> = const { RefCell::new(0) };
    }
    
    fn thread_local_counter() -> Vec<i32> {
        let results = Arc::new(Mutex::new(Vec::new()));
    
        let handles: Vec<_> = (0..5i32)
            .map(|i| {
                let results = Arc::clone(&results);
                thread::spawn(move || {
                    // Each thread has its own COUNTER — no sharing
                    COUNTER.with(|c| *c.borrow_mut() = i * 10);
                    thread::yield_now();
                    let v = COUNTER.with(|c| *c.borrow());
                    results.lock().unwrap().push(v);
                })
            })
            .collect();
    
        for h in handles {
            h.join().unwrap();
        }
        let mut v = results.lock().unwrap().clone();
        v.sort();
        v
    }
    
    // --- Approach 2: Thread-local accumulator (no shared state needed) ---
    thread_local! {
        static LOCAL_SUM: RefCell<i64> = const { RefCell::new(0) };
    }
    
    fn thread_local_sum(id: i64) -> i64 {
        LOCAL_SUM.with(|s| {
            *s.borrow_mut() = 0; // reset for this thread
            for i in 1..=10 {
                *s.borrow_mut() += i * id;
            }
            *s.borrow()
        })
    }
    
    fn parallel_sums() -> i64 {
        let results = Arc::new(Mutex::new(Vec::new()));
    
        let handles: Vec<_> = (0..4i64)
            .map(|id| {
                let results = Arc::clone(&results);
                thread::spawn(move || {
                    let s = thread_local_sum(id);
                    results.lock().unwrap().push(s);
                })
            })
            .collect();
    
        for h in handles {
            h.join().unwrap();
        }
        let x = results.lock().unwrap().iter().sum();
        x
    }
    
    // --- Approach 3: Thread-local cache (computed once per thread) ---
    thread_local! {
        static THREAD_ID_CACHE: RefCell<Option<String>> = const { RefCell::new(None) };
    }
    
    fn get_thread_name(name: &str) -> String {
        THREAD_ID_CACHE.with(|cache| {
            let mut c = cache.borrow_mut();
            if c.is_none() {
                *c = Some(format!("thread-{}", name));
            }
            c.clone().unwrap()
        })
    }
    
    #[cfg(test)]
    mod tests {
        use super::*;
    
        #[test]
        fn test_thread_local_isolation() {
            let counts = thread_local_counter();
            assert_eq!(counts, vec![0, 10, 20, 30, 40]);
        }
    
        #[test]
        fn test_parallel_sums() {
            // 0 + 55 + 110 + 165 = 330
            assert_eq!(parallel_sums(), 330);
        }
    
        #[test]
        fn test_thread_local_doesnt_leak_across_threads() {
            COUNTER.with(|c| *c.borrow_mut() = 999);
            let val_in_new_thread = thread::spawn(|| {
                COUNTER.with(|c| *c.borrow()) // should be 0, not 999
            })
            .join()
            .unwrap();
            assert_eq!(val_in_new_thread, 0);
        }
    
        #[test]
        fn test_thread_name_cached() {
            let n1 = get_thread_name("x");
            let n2 = get_thread_name("y"); // returns cached value, not "thread-y"
            assert_eq!(n1, n2); // same thread — cached
        }
    }

    Key Differences

    | Aspect              | Rust                                | OCaml                                    |
    |---------------------|-------------------------------------|------------------------------------------|
    | Declaration         | thread_local! { static N: T = ... } | No built-in (pre-5.0); Domain.DLS (5.0+) |
    | Access              | .with(\|r\| ...) closure            | Domain.DLS.get key                       |
    | Interior mutability | RefCell<T> in TLS                   | Mutable domain-local slot                |
    | Lock-free           | Yes (no concurrent access possible) | Yes (domain-local)                       |

    TLS is ideal for per-thread random number generators, per-request logging contexts, and accumulating performance counters that are merged at the end. The key advantage over Mutex<T> is zero synchronization overhead.
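The "merged at the end" pattern can be sketched as follows: each worker counts locally with zero synchronization and touches shared state exactly once, when it finishes. The `WORK_DONE`/`TOTAL`/`worker` names are illustrative:

```rust
use std::cell::Cell;
use std::sync::atomic::{AtomicU64, Ordering};
use std::thread;

thread_local! {
    // Hot-path counter: bumped with no locks and no atomics.
    static WORK_DONE: Cell<u64> = const { Cell::new(0) };
}

static TOTAL: AtomicU64 = AtomicU64::new(0);

fn worker(items: u64) {
    for _ in 0..items {
        // Pretend to process an item; count it locally.
        WORK_DONE.with(|c| c.set(c.get() + 1));
    }
    // One synchronized operation per thread, not one per item.
    TOTAL.fetch_add(WORK_DONE.with(|c| c.get()), Ordering::Relaxed);
}

fn main() {
    let handles: Vec<_> = (1..=4)
        .map(|n| thread::spawn(move || worker(n * 10)))
        .collect();
    for h in handles {
        h.join().unwrap();
    }
    // 10 + 20 + 30 + 40 = 100
    println!("total = {}", TOTAL.load(Ordering::Relaxed));
}
```

If a thread processes millions of items, this trades millions of contended atomic operations for millions of uncontended `Cell` writes plus one `fetch_add`.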

    OCaml Approach

    (* OCaml: Thread.self() as key into a Hashtbl — manual TLS *)
    let tls_table : (int, int) Hashtbl.t = Hashtbl.create 16
    let tls_mutex = Mutex.create ()
    
    let tls_set v =
      let tid = Thread.id (Thread.self ()) in
      Mutex.lock tls_mutex;
      Hashtbl.replace tls_table tid v;
      Mutex.unlock tls_mutex
    
    let tls_get () =
      let tid = Thread.id (Thread.self ()) in
      Mutex.lock tls_mutex;
      (* Mutex.protect only exists in OCaml 5.1+, so lock/unlock by hand here *)
      let v = Hashtbl.find_opt tls_table tid in
      Mutex.unlock tls_mutex;
      v
    
    (* OCaml 5.0+: Domain.DLS for domain-local storage *)
    let key = Domain.DLS.new_key (fun () -> 0)
    let set v = Domain.DLS.set key v
    let get () = Domain.DLS.get key
    

    OCaml before 5.0 lacks built-in TLS — it requires a Hashtbl keyed by thread ID with manual locking. OCaml 5.0+'s Domain.DLS provides domain-local storage analogous to Rust's thread_local!.

    Full Source

    #![allow(clippy::all)]
    // 988: Thread-Local Storage
    // Rust: thread_local! macro — each thread gets its own instance
    
    use std::cell::RefCell;
    use std::sync::{Arc, Mutex};
    use std::thread;
    
    // --- Approach 1: thread_local! with RefCell (simple counter) ---
    thread_local! {
        static COUNTER: RefCell<i32> = const { RefCell::new(0) };
    }
    
    fn thread_local_counter() -> Vec<i32> {
        let results = Arc::new(Mutex::new(Vec::new()));
    
        let handles: Vec<_> = (0..5i32)
            .map(|i| {
                let results = Arc::clone(&results);
                thread::spawn(move || {
                    // Each thread has its own COUNTER — no sharing
                    COUNTER.with(|c| *c.borrow_mut() = i * 10);
                    thread::yield_now();
                    let v = COUNTER.with(|c| *c.borrow());
                    results.lock().unwrap().push(v);
                })
            })
            .collect();
    
        for h in handles {
            h.join().unwrap();
        }
        let mut v = results.lock().unwrap().clone();
        v.sort();
        v
    }
    
    // --- Approach 2: Thread-local accumulator (no shared state needed) ---
    thread_local! {
        static LOCAL_SUM: RefCell<i64> = const { RefCell::new(0) };
    }
    
    fn thread_local_sum(id: i64) -> i64 {
        LOCAL_SUM.with(|s| {
            *s.borrow_mut() = 0; // reset for this thread
            for i in 1..=10 {
                *s.borrow_mut() += i * id;
            }
            *s.borrow()
        })
    }
    
    fn parallel_sums() -> i64 {
        let results = Arc::new(Mutex::new(Vec::new()));
    
        let handles: Vec<_> = (0..4i64)
            .map(|id| {
                let results = Arc::clone(&results);
                thread::spawn(move || {
                    let s = thread_local_sum(id);
                    results.lock().unwrap().push(s);
                })
            })
            .collect();
    
        for h in handles {
            h.join().unwrap();
        }
        let x = results.lock().unwrap().iter().sum();
        x
    }
    
    // --- Approach 3: Thread-local cache (computed once per thread) ---
    thread_local! {
        static THREAD_ID_CACHE: RefCell<Option<String>> = const { RefCell::new(None) };
    }
    
    fn get_thread_name(name: &str) -> String {
        THREAD_ID_CACHE.with(|cache| {
            let mut c = cache.borrow_mut();
            if c.is_none() {
                *c = Some(format!("thread-{}", name));
            }
            c.clone().unwrap()
        })
    }
    
    #[cfg(test)]
    mod tests {
        use super::*;
    
        #[test]
        fn test_thread_local_isolation() {
            let counts = thread_local_counter();
            assert_eq!(counts, vec![0, 10, 20, 30, 40]);
        }
    
        #[test]
        fn test_parallel_sums() {
            // 0 + 55 + 110 + 165 = 330
            assert_eq!(parallel_sums(), 330);
        }
    
        #[test]
        fn test_thread_local_doesnt_leak_across_threads() {
            COUNTER.with(|c| *c.borrow_mut() = 999);
            let val_in_new_thread = thread::spawn(|| {
                COUNTER.with(|c| *c.borrow()) // should be 0, not 999
            })
            .join()
            .unwrap();
            assert_eq!(val_in_new_thread, 0);
        }
    
        #[test]
        fn test_thread_name_cached() {
            let n1 = get_thread_name("x");
            let n2 = get_thread_name("y"); // returns cached value, not "thread-y"
            assert_eq!(n1, n2); // same thread — cached
        }
    }

    Deep Comparison

    Thread-Local Storage — Comparison

    Core Insight

    Thread-local storage is the answer to "I want mutable state but don't want synchronization overhead." Each thread has its own private copy — no races possible, no locks needed.

    OCaml Approach

  • OCaml 5: Domain.DLS.new_key / Domain.DLS.get / Domain.DLS.set (domain-local)
  • OCaml < 5: simulate with a Hashtbl keyed by Thread.id (requires a mutex for the table itself)
  • Domains ≠ threads in OCaml 5: one domain can run many lightweight threads
  • Typical use: per-domain RNG seeds, error buffers, caches

    Rust Approach

  • thread_local! { static NAME: Type = init; } declares the variable
  • .with(|v| ...) is the primary access method; it hands the closure a scoped reference that cannot outlive the call
  • Usually paired with Cell<T> (Copy types) or RefCell<T> (arbitrary types)
  • Initialized lazily on first access per thread
  • Dropped when the thread exits

    Comparison Table

    | Concept             | OCaml                                   | Rust                                |
    |---------------------|-----------------------------------------|-------------------------------------|
    | Declare             | Domain.DLS.new_key (fun () -> init)     | thread_local! { static X: T = ... } |
    | Read                | Domain.DLS.get key                      | X.with(\|v\| *v.borrow())           |
    | Write               | Domain.DLS.set key val                  | X.with(\|v\| *v.borrow_mut() = x)   |
    | Interior mutability | Mutable by nature                       | Cell<T> or RefCell<T>               |
    | Initialization      | Closure passed at creation              | Expression in macro                 |
    | Isolation           | Per-domain (not per-thread in OCaml 5)  | Per-OS-thread                       |
    | No sync needed      | Yes                                     | Yes (the whole point)               |
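One refinement to the Rust column: for `Cell`-wrapped TLS, `std`'s `LocalKey` also provides direct `get`/`set`/`take`/`replace` methods (stabilized in Rust 1.73), so the `.with` closure can often be skipped. A small sketch, with an illustrative `FLAG` static:

```rust
use std::cell::Cell;

thread_local! {
    static FLAG: Cell<bool> = const { Cell::new(false) };
}

fn main() {
    // Equivalent to FLAG.with(|f| f.set(true)), without writing the closure.
    FLAG.set(true);
    assert!(FLAG.get());

    // take() returns the value and resets the cell to Default (false for bool).
    assert!(FLAG.take());
    assert!(!FLAG.get());

    println!("ok");
}
```

For `RefCell`-wrapped values, `.with` (or `with_borrow`/`with_borrow_mut`) is still the way in.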

    std vs tokio

    | Aspect          | std version                   | tokio version                        |
    |-----------------|-------------------------------|--------------------------------------|
    | Runtime         | OS threads via std::thread    | Async tasks on the tokio runtime     |
    | Synchronization | std::sync::Mutex, Condvar     | tokio::sync::Mutex, channels         |
    | Channels        | std::sync::mpsc (unbounded)   | tokio::sync::mpsc (bounded, async)   |
    | Blocking        | Thread blocks on lock/recv    | Task yields; runtime switches tasks  |
    | Overhead        | One OS thread per task        | Many tasks per thread (M:N)          |
    | Best for        | CPU-bound, simple concurrency | I/O-bound, high-concurrency servers  |

    Exercises

  • Implement a thread-local RNG: each thread seeds its own rand::thread_rng() equivalent.
  • Implement per-thread allocation counters that are summed at program end without a shared counter.
  • Implement a "request ID" TLS that is set at thread entry and read by all functions without passing it as a parameter.
  • Demonstrate that modifying COUNTER in one thread does not affect another thread's COUNTER value.
  • Implement thread_local_cache<K: Hash+Eq, V> — a per-thread HashMap that serves as a local cache before hitting a shared Mutex<HashMap>.
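As a starting point for the request-ID exercise, here is a hedged sketch; every name in it (set_request_id, current_request_id, handle_request) is made up for illustration:

```rust
use std::cell::RefCell;

thread_local! {
    // Per-thread request context; None until the thread's entry point sets it.
    static REQUEST_ID: RefCell<Option<String>> = const { RefCell::new(None) };
}

fn set_request_id(id: &str) {
    REQUEST_ID.with(|r| *r.borrow_mut() = Some(id.to_string()));
}

fn current_request_id() -> String {
    REQUEST_ID.with(|r| r.borrow().clone().unwrap_or_else(|| "<unset>".to_string()))
}

// Deep in the call stack: no request-ID parameter is threaded through.
fn log(msg: &str) {
    println!("[{}] {}", current_request_id(), msg);
}

fn handle_request(id: &str) {
    set_request_id(id); // set once at thread entry
    log("started");
    log("finished");
}

fn main() {
    handle_request("req-42"); // prints "[req-42] started" then "[req-42] finished"
}
```

A thread that never calls set_request_id sees `<unset>`, which demonstrates the isolation property from the exercises above.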

    Open Source Repos