I'm working with a fiber-optic reflective memory card (RFM2g model), which lacks atomic read-modify-write operations. My goal is to implement a spinlock across multiple processes (potentially running on different computers) using only the shared memory and interrupts provided by this card.
API documentation: https://file1.dzsc.com/product/14/11/11/1053997_101743426.pdf
I implemented a spinlock using memory operations. The logic (simplified) is as follows:
#include <chrono> #include <iostream> #include <random> #include <thread> void spin_lock(volatile void *lock_area, const uint32_t lock_id) { uint32_t lock = static_cast<volatile uint32_t *>(lock_area)[0]; spin: while (lock != 0 && lock != lock_id) { lock = static_cast<volatile uint32_t *>(lock_area)[0]; } static_cast<volatile uint32_t *>(lock_area)[0] = lock_id; auto start = std::chrono::high_resolution_clock::now(); while (std::chrono::duration_cast<std::chrono::nanoseconds>(std::chrono::high_resolution_clock::now() - start). count() < 10) { lock = static_cast<volatile uint32_t *>(lock_area)[0]; } lock = static_cast<volatile uint32_t *>(lock_area)[0]; if (lock != lock_id) { goto spin; } } void spin_unlock(volatile void *lock_area) { static_cast<volatile uint32_t *>(lock_area)[0] = 0; } uint32_t generate_lock_id(const uint32_t node_id) { static std::random_device random_device{}; static std::uniform_int_distribution<uint16_t> distribution(0, 32768); const auto process_id = distribution(random_device); return (static_cast<uint32_t>(node_id) << 16) + process_id; } void add_data(volatile void *lock_area, volatile void *data_area, const uint32_t lock_id) { spin_lock(lock_area, lock_id); const auto count = static_cast<volatile uint32_t *>(data_area)[0]; static_cast<volatile uint32_t *>(data_area)[0] = count + 1; spin_unlock(lock_area); } int main(int argc, char **argv) { const auto start = std::chrono::high_resolution_clock::now(); uint16_t node_id = 1; volatile auto lock_area = malloc(4); volatile auto data_area = malloc(4); memset(data_area, 0, 4); memset(lock_area, 0, 4); auto thread1 = std::thread([lock_area, data_area, node_id]() { const auto lock_id = generate_lock_id(node_id); for (int i = 0; i < 200000; i++) { add_data(lock_area, data_area, lock_id); } }); auto thread2 = std::thread([lock_area, data_area, node_id]() { const auto lock_id = generate_lock_id(node_id); for (int i = 0; i < 200000; i++) { add_data(lock_area, data_area, lock_id); } }); thread1.join(); 
thread2.join(); const auto end = std::chrono::high_resolution_clock::now(); const auto nanoseconds = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start).count(); const auto seconds = static_cast<double>(nanoseconds) / 1000000000.0; const auto count = static_cast<volatile uint32_t *>(data_area)[0]; std::cout << "Time usage: " << seconds << "\nCount: " << count << std::endl; free(lock_area); free(data_area); return 0; } Questions:
1. Potential flaws: Aside from the possible duplicate `process_id` issue in `generate_lock_id()`, are there other problems with this implementation? For example, is the assumption "the method works if the waiting time (10 ns in the code) exceeds the maximum memory synchronization latency" valid?
2. Alternative implementations: Are there other pure memory-based spinlock approaches (without CPU atomic operations) that are suitable for cross-machine scenarios?
Key constraints:
- Processes may run on different computers, so same-CPU atomic read-modify-write instructions cannot be used to arbitrate between them.
- Only shared memory and interrupts via the RFM2g card can be used.