Performance Optimization
Performance optimization in Rust involves understanding how the compiler works, measuring performance accurately, and applying targeted optimizations. Rust's zero-cost abstractions and fine-grained control over memory make it excellent for high-performance applications.
Performance Measurement and Profiling
Benchmarking with Criterion
First, add to Cargo.toml:
```toml
[dev-dependencies]
criterion = { version = "0.5", features = ["html_reports"] }

[[bench]]
name = "my_benchmark"
harness = false
```

Basic benchmarking:
```rust
// benches/my_benchmark.rs
use criterion::{black_box, criterion_group, criterion_main, Criterion};

fn fibonacci_recursive(n: u64) -> u64 {
    match n {
        0 => 1,
        1 => 1,
        n => fibonacci_recursive(n - 1) + fibonacci_recursive(n - 2),
    }
}

fn fibonacci_iterative(n: u64) -> u64 {
    if n == 0 || n == 1 {
        return 1;
    }
    let mut a = 1;
    let mut b = 1;
    for _ in 2..=n {
        let temp = a + b;
        a = b;
        b = temp;
    }
    b
}

fn fibonacci_lookup(n: u64) -> u64 {
    const FIBONACCI: [u64; 21] = [
        1, 1, 2, 3, 5, 8, 13, 21, 34, 55, 89, 144, 233, 377, 610, 987, 1597,
        2584, 4181, 6765, 10946,
    ];
    if n < 21 {
        FIBONACCI[n as usize]
    } else {
        fibonacci_iterative(n)
    }
}

fn criterion_benchmark(c: &mut Criterion) {
    let mut group = c.benchmark_group("fibonacci");
    for i in [10u64, 15, 20].iter() {
        group.bench_with_input(format!("recursive/{}", i), i, |b, i| {
            b.iter(|| fibonacci_recursive(black_box(*i)))
        });
        group.bench_with_input(format!("iterative/{}", i), i, |b, i| {
            b.iter(|| fibonacci_iterative(black_box(*i)))
        });
        group.bench_with_input(format!("lookup/{}", i), i, |b, i| {
            b.iter(|| fibonacci_lookup(black_box(*i)))
        });
    }
    group.finish();
}

criterion_group!(benches, criterion_benchmark);
criterion_main!(benches);
```

Run benchmarks:
```bash
cargo bench
```

Built-in Benchmarking (Nightly)
```rust
#![feature(test)]
extern crate test;

use test::Bencher;

#[bench]
fn bench_vector_sum_loop(b: &mut Bencher) {
    let data: Vec<i32> = (0..1000).collect();
    b.iter(|| {
        let mut sum = 0;
        for &item in &data {
            sum += item;
        }
        sum
    });
}

#[bench]
fn bench_vector_sum_iterator(b: &mut Bencher) {
    let data: Vec<i32> = (0..1000).collect();
    b.iter(|| data.iter().sum::<i32>());
}

#[bench]
fn bench_vector_sum_fold(b: &mut Bencher) {
    let data: Vec<i32> = (0..1000).collect();
    b.iter(|| data.iter().fold(0, |acc, &x| acc + x));
}
```

Profiling with perf and flamegraph
Install dependencies:
```bash
cargo install flamegraph
# On Linux: install perf
```

Profile your application:
```bash
# Generate flamegraph
cargo flamegraph --bin my_app

# Profile with perf
perf record --call-graph=dwarf target/release/my_app
perf report
```

Example application to profile:
```rust
// src/main.rs
use std::collections::HashMap;

fn expensive_computation() -> HashMap<String, u64> {
    let mut map = HashMap::new();
    for i in 0..1_000_000 {
        let key = format!("key_{}", i);
        let value = (i as u64).pow(2);
        map.insert(key, value);
    }
    map
}

fn process_data(map: &HashMap<String, u64>) -> u64 {
    map.values().filter(|&&v| v % 2 == 0).sum()
}

fn main() {
    let map = expensive_computation();
    let result = process_data(&map);
    println!("Result: {}", result);
}
```

Compiler Optimizations
Release Profile Configuration
```toml
# Cargo.toml
[profile.release]
opt-level = 3            # Maximum optimization
debug = false            # No debug info
debug-assertions = false
overflow-checks = false
lto = true               # Link-time optimization
codegen-units = 1        # Better optimization, slower compile
panic = 'abort'          # Smaller binary size

[profile.release-lto]
inherits = "release"
lto = "fat"              # More aggressive LTO
codegen-units = 1
```

Target-Specific Optimizations
```toml
# .cargo/config.toml
[build]
rustflags = [
    "-C", "target-cpu=native",          # Optimize for current CPU
    "-C", "target-feature=+crt-static", # Static linking
]

[target.x86_64-unknown-linux-gnu]
rustflags = [
    "-C", "link-arg=-fuse-ld=lld", # Use faster linker
    "-C", "target-cpu=haswell",    # Target specific CPU
]
```

Profile-Guided Optimization (PGO)
```bash
# Step 1: Build instrumented binary
RUSTFLAGS="-Cprofile-generate=/tmp/pgo-data" \
    cargo build --release --target-dir=/tmp/pgo

# Step 2: Run typical workload
/tmp/pgo/release/my_app

# Step 3: Merge the raw profiles (llvm-profdata ships with the llvm-tools rustup component)
llvm-profdata merge -o /tmp/pgo-data/merged.profdata /tmp/pgo-data

# Step 4: Build optimized binary
RUSTFLAGS="-Cprofile-use=/tmp/pgo-data/merged.profdata" \
    cargo build --release
```

Memory Optimization
Reducing Allocations
```rust
// Bad: Many allocations
fn process_strings_bad(input: &[&str]) -> Vec<String> {
    let mut result = Vec::new();
    for s in input {
        let processed = s.to_uppercase();
        let formatted = format!("Processed: {}", processed);
        result.push(formatted);
    }
    result
}

// Good: Pre-allocate the output and reuse a scratch buffer for intermediates
fn process_strings_good(input: &[&str]) -> Vec<String> {
    let mut result = Vec::with_capacity(input.len());
    let mut buffer = String::new();
    for s in input {
        buffer.clear();
        buffer.push_str("Processed: ");
        for ch in s.chars() {
            buffer.extend(ch.to_uppercase());
        }
        result.push(buffer.clone());
    }
    result
}

// Better: Let the caller own and reuse the output vector
fn process_strings_better(input: &[&str], output: &mut Vec<String>) {
    output.clear();
    output.reserve(input.len());
    for s in input {
        let formatted = format!("Processed: {}", s.to_uppercase());
        output.push(formatted);
    }
}

// Best: Return an iterator so allocation happens only when the caller consumes it
fn process_strings_best(input: &[&str]) -> impl Iterator<Item = String> + '_ {
    input.iter().map(|s| format!("Processed: {}", s.to_uppercase()))
}
```

Memory Pool Pattern
```rust
use std::collections::VecDeque;

struct ObjectPool<T> {
    objects: VecDeque<T>,
    factory: fn() -> T,
}

impl<T> ObjectPool<T> {
    fn new(factory: fn() -> T) -> Self {
        ObjectPool {
            objects: VecDeque::new(),
            factory,
        }
    }

    fn get(&mut self) -> T {
        self.objects.pop_front().unwrap_or_else(self.factory)
    }

    fn return_object(&mut self, obj: T) {
        if self.objects.len() < 100 {
            // Limit pool size
            self.objects.push_back(obj);
        }
    }
}

// Usage example
fn with_pool_optimization() {
    let mut string_pool = ObjectPool::new(String::new);

    for i in 0..1000 {
        let mut s = string_pool.get();
        s.clear();
        s.push_str(&format!("Item {}", i));

        // Process string...
        println!("Processing: {}", s);

        string_pool.return_object(s);
    }
}
```

Stack vs Heap Allocation
```rust
// Stack allocation (faster)
fn stack_allocation() -> [i32; 1000] {
    [0; 1000] // Allocated on stack
}

// Heap allocation (more flexible but slower)
fn heap_allocation() -> Vec<i32> {
    vec![0; 1000] // Allocated on heap
}

// Hybrid approach: note that Box<[i32]> always lives on the heap; for
// genuinely stack-backed small buffers use a fixed array or SmallVec (below)
fn adaptive_allocation(size: usize) -> Box<[i32]> {
    if size <= 1000 {
        // Small sizes: a single exact-capacity allocation
        let mut vec = Vec::with_capacity(size);
        vec.resize(size, 0);
        vec.into_boxed_slice()
    } else {
        // Direct heap allocation
        vec![0; size].into_boxed_slice()
    }
}

// SmallVec for stack optimization
use smallvec::{smallvec, SmallVec};

fn using_smallvec() {
    // Store up to 8 elements on the stack, spill to the heap beyond that
    let mut vec: SmallVec<[i32; 8]> = smallvec![1, 2, 3, 4];
    vec.push(5);
    println!("SmallVec: {:?}", vec);
}
```

Data Structure Optimization
Choose Efficient Data Structures
```rust
use std::collections::{BTreeMap, HashMap};
use indexmap::IndexMap;

// Different data structures for different use cases
fn data_structure_comparison() {
    // HashMap: O(1) average access, no ordering
    let mut hash_map = HashMap::new();
    hash_map.insert("key1", "value1");

    // BTreeMap: O(log n) access, sorted order
    let mut btree_map = BTreeMap::new();
    btree_map.insert("key1", "value1");

    // IndexMap: O(1) access, insertion order preserved
    let mut index_map = IndexMap::new();
    index_map.insert("key1", "value1");

    // Vector: O(1) indexed access, O(n) search
    let mut vec = Vec::new();
    vec.push(("key1", "value1"));
}

// Custom data structure for a specific use case
struct CompactStringSet {
    data: Vec<u8>,
    offsets: Vec<usize>,
}

impl CompactStringSet {
    fn new() -> Self {
        CompactStringSet {
            data: Vec::new(),
            offsets: Vec::new(),
        }
    }

    fn insert(&mut self, s: &str) {
        self.offsets.push(self.data.len());
        self.data.extend_from_slice(s.as_bytes());
        self.data.push(0); // Null terminator
    }

    fn get(&self, index: usize) -> Option<&str> {
        if index >= self.offsets.len() {
            return None;
        }
        let start = self.offsets[index];
        let end = if index + 1 < self.offsets.len() {
            self.offsets[index + 1] - 1 // -1 for null terminator
        } else {
            self.data.len() - 1
        };
        std::str::from_utf8(&self.data[start..end]).ok()
    }

    fn len(&self) -> usize {
        self.offsets.len()
    }
}
```

Bit Manipulation Optimizations
```rust
// Bit-packed boolean array
struct BitSet {
    data: Vec<u64>,
    len: usize,
}

impl BitSet {
    fn new(len: usize) -> Self {
        let words = (len + 63) / 64; // Round up
        BitSet {
            data: vec![0; words],
            len,
        }
    }

    fn set(&mut self, index: usize, value: bool) {
        if index >= self.len {
            return;
        }
        let word_index = index / 64;
        let bit_index = index % 64;
        if value {
            self.data[word_index] |= 1u64 << bit_index;
        } else {
            self.data[word_index] &= !(1u64 << bit_index);
        }
    }

    fn get(&self, index: usize) -> bool {
        if index >= self.len {
            return false;
        }
        let word_index = index / 64;
        let bit_index = index % 64;
        (self.data[word_index] & (1u64 << bit_index)) != 0
    }

    fn count_ones(&self) -> u32 {
        self.data.iter().map(|word| word.count_ones()).sum()
    }
}

// Bit manipulation tricks
fn bit_tricks() {
    let mut x = 42u32;

    // Check if power of 2
    let is_power_of_2 = x != 0 && (x & (x - 1)) == 0;

    // Next power of 2
    x -= 1;
    x |= x >> 1;
    x |= x >> 2;
    x |= x >> 4;
    x |= x >> 8;
    x |= x >> 16;
    x += 1;

    // Count trailing zeros (fast)
    let trailing_zeros = 42u32.trailing_zeros();

    // Population count (number of 1 bits)
    let pop_count = 42u32.count_ones();

    println!(
        "Bit tricks: power_of_2={}, next_pow2={}, trailing_zeros={}, pop_count={}",
        is_power_of_2, x, trailing_zeros, pop_count
    );
}
```

Algorithm Optimization
Cache-Friendly Algorithms
```rust
// Cache-unfriendly: the inner loop walks down a column of `b`,
// so every step touches a different heap-allocated row
fn matrix_multiply_bad(a: &[Vec<f64>], b: &[Vec<f64>]) -> Vec<Vec<f64>> {
    let n = a.len();
    let m = b[0].len();
    let p = b.len();
    let mut result = vec![vec![0.0; m]; n];

    for i in 0..n {
        for j in 0..m {
            for k in 0..p {
                result[i][j] += a[i][k] * b[k][j]; // Poor cache locality
            }
        }
    }
    result
}

// Cache-friendly: blocked/tiled multiplication
fn matrix_multiply_good(a: &[Vec<f64>], b: &[Vec<f64>]) -> Vec<Vec<f64>> {
    let n = a.len();
    let m = b[0].len();
    let p = b.len();
    let mut result = vec![vec![0.0; m]; n];
    let block_size = 64; // Tune to the cache size

    for ii in (0..n).step_by(block_size) {
        for jj in (0..m).step_by(block_size) {
            for kk in (0..p).step_by(block_size) {
                let i_end = (ii + block_size).min(n);
                let j_end = (jj + block_size).min(m);
                let k_end = (kk + block_size).min(p);

                for i in ii..i_end {
                    for j in jj..j_end {
                        for k in kk..k_end {
                            result[i][j] += a[i][k] * b[k][j];
                        }
                    }
                }
            }
        }
    }
    result
}

// SIMD-optimized sum. The x86_64 intrinsics are stable, but calling them is
// only sound when AVX is enabled at compile time (e.g. -C target-cpu=native
// or -C target-feature=+avx), hence the target_feature cfg below.
fn simd_sum(data: &[f32]) -> f32 {
    #[cfg(all(target_arch = "x86_64", target_feature = "avx"))]
    {
        use std::arch::x86_64::*;
        unsafe {
            let mut sum_vec = _mm256_setzero_ps();
            let chunks = data.chunks_exact(8);
            for chunk in chunks {
                let chunk_vec = _mm256_loadu_ps(chunk.as_ptr());
                sum_vec = _mm256_add_ps(sum_vec, chunk_vec);
            }

            // Horizontal sum
            let high = _mm256_extractf128_ps(sum_vec, 1);
            let low = _mm256_extractf128_ps(sum_vec, 0);
            let sum128 = _mm_add_ps(high, low);
            let sum64 = _mm_add_ps(sum128, _mm_movehl_ps(sum128, sum128));
            let sum32 = _mm_add_ss(sum64, _mm_shuffle_ps(sum64, sum64, 1));
            let mut result = _mm_cvtss_f32(sum32);

            // Handle remainder
            let remainder = data.chunks_exact(8).remainder();
            result += remainder.iter().sum::<f32>();
            result
        }
    }
    #[cfg(not(all(target_arch = "x86_64", target_feature = "avx")))]
    {
        data.iter().sum()
    }
}
```

Parallel Processing
```rust
use rayon::prelude::*;

// Sequential processing
fn process_sequential(data: &[i32]) -> Vec<i32> {
    data.iter().map(|&x| x * x + 1).collect()
}

// Parallel processing
fn process_parallel(data: &[i32]) -> Vec<i32> {
    data.par_iter().map(|&x| x * x + 1).collect()
}

// Parallel reduction
fn parallel_sum(data: &[i32]) -> i64 {
    data.par_iter().map(|&x| x as i64).sum()
}

// Custom parallel algorithm
fn parallel_quicksort<T: Ord + Send>(mut data: Vec<T>) -> Vec<T> {
    if data.len() <= 1 {
        return data;
    }

    let pivot_index = data.len() / 2;
    let pivot = data.remove(pivot_index);

    let (left, right): (Vec<_>, Vec<_>) = data
        .into_par_iter()
        .partition(|item| item < &pivot);

    if left.len() > 1000 && right.len() > 1000 {
        // Recurse on both halves in parallel for large partitions
        let (mut sorted_left, sorted_right) = rayon::join(
            || parallel_quicksort(left),
            || parallel_quicksort(right),
        );
        sorted_left.push(pivot);
        sorted_left.extend(sorted_right);
        sorted_left
    } else {
        // Recurse without spawning extra tasks for small partitions
        let mut result = parallel_quicksort(left);
        result.push(pivot);
        result.extend(parallel_quicksort(right));
        result
    }
}
```

String and Text Optimization
Efficient String Processing
```rust
// Avoiding allocations in string processing
fn count_words_efficient(text: &str) -> usize {
    text.split_whitespace().count() // No allocations
}

fn extract_numbers_efficient(text: &str) -> Vec<i32> {
    text.split_whitespace()
        .filter_map(|s| s.parse().ok())
        .collect()
}

// String interning for repeated strings
use std::collections::HashMap;

struct StringInterner {
    strings: Vec<String>,
    map: HashMap<String, usize>,
}

impl StringInterner {
    fn new() -> Self {
        StringInterner {
            strings: Vec::new(),
            map: HashMap::new(),
        }
    }

    fn intern(&mut self, s: &str) -> usize {
        if let Some(&id) = self.map.get(s) {
            id
        } else {
            let id = self.strings.len();
            self.strings.push(s.to_string());
            self.map.insert(s.to_string(), id);
            id
        }
    }

    fn get(&self, id: usize) -> Option<&str> {
        self.strings.get(id).map(|s| s.as_str())
    }
}

// Byte string processing (faster than UTF-8-aware processing for ASCII data)
fn process_ascii_bytes(data: &[u8]) -> usize {
    data.iter().filter(|&&b| b.is_ascii_alphabetic()).count()
}

// SIMD string search: use the memchr crate for production code
fn find_byte_simd(haystack: &[u8], needle: u8) -> Option<usize> {
    // Scalar fallback shown here
    haystack.iter().position(|&b| b == needle)
}
```
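The last function above defers to the memchr crate; as a minimal sketch, assuming `memchr = "2"` were added as a dependency (the function name here is only illustrative), the lookup becomes:

```rust
// memchr::memchr picks a SIMD-accelerated implementation at runtime where available
fn find_byte_fast(haystack: &[u8], needle: u8) -> Option<usize> {
    memchr::memchr(needle, haystack)
}
```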
I/O Optimization
Buffered I/O and Batch Processing
```rust
use std::io::{BufRead, BufReader, BufWriter, Write};
use std::fs::File;

// Efficient file reading
fn read_lines_efficient(filename: &str) -> Result<Vec<String>, std::io::Error> {
    let file = File::open(filename)?;
    let reader = BufReader::new(file);
    reader.lines().collect()
}

// Efficient file writing
fn write_lines_efficient(filename: &str, lines: &[String]) -> Result<(), std::io::Error> {
    let file = File::create(filename)?;
    let mut writer = BufWriter::new(file);
    for line in lines {
        writeln!(writer, "{}", line)?;
    }
    writer.flush()?;
    Ok(())
}

// Memory-mapped files for large data
use memmap2::MmapOptions;

fn process_large_file_mmap(filename: &str) -> Result<usize, Box<dyn std::error::Error>> {
    let file = File::open(filename)?;
    let mmap = unsafe { MmapOptions::new().map(&file)? };

    // Process memory-mapped data directly
    let line_count = mmap.split(|&b| b == b'\n').count();
    Ok(line_count)
}

// Async I/O for high concurrency
async fn async_file_processing(filenames: Vec<&str>) -> Result<Vec<String>, std::io::Error> {
    use futures::future::try_join_all;
    use tokio::fs;

    let tasks: Vec<_> = filenames
        .into_iter()
        .map(|filename| async move { fs::read_to_string(filename).await })
        .collect();

    try_join_all(tasks).await
}
```

Compilation and Build Optimization
Cargo Configuration
```toml
# .cargo/config.toml
[build]
rustflags = ["-C", "target-cpu=native"]

[profile.release]
debug = true             # Keep symbols for profiling
debug-assertions = false
overflow-checks = false
lto = "thin"             # Balance between compile time and performance
codegen-units = 16       # Parallel compilation
incremental = false      # Disable for release builds
panic = "abort"          # Smaller binaries

# Custom profile for maximum performance
[profile.max-perf]
inherits = "release"
opt-level = 3
lto = "fat"
codegen-units = 1
panic = "abort"
```
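Assuming the max-perf profile above is picked up (custom profiles are more commonly declared in Cargo.toml; config.toml entries override profile settings), it is selected with the --profile flag and builds into its own target subdirectory:

```bash
# Build with the custom profile; artifacts land in target/max-perf/
cargo build --profile max-perf
```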
Link-Time Optimization

```bash
# LTO itself is enabled through the Cargo profile (lto = "thin" or "fat", as above).
# A faster linker such as lld helps offset the longer LTO link step:
export RUSTFLAGS="-C link-arg=-fuse-ld=lld"
cargo build --release
```

Cross-compilation optimization
```bash
# Target-specific optimizations
cargo build --release --target x86_64-unknown-linux-musl
cargo build --release --target x86_64-pc-windows-gnu

# CPU-specific targeting
export RUSTFLAGS="-C target-cpu=skylake"
cargo build --release
```

Advanced Optimization Techniques
Const Evaluation and Compile-Time Computation
```rust
// Compile-time computation
const fn factorial(n: u64) -> u64 {
    if n == 0 {
        1
    } else {
        n * factorial(n - 1)
    }
}

const FACTORIAL_10: u64 = factorial(10); // Computed at compile time

// Const generics for optimization
struct Matrix<const N: usize, const M: usize> {
    data: [[f64; M]; N],
}

impl<const N: usize, const M: usize> Matrix<N, M> {
    const fn zeros() -> Self {
        Matrix {
            data: [[0.0; M]; N],
        }
    }

    // Matrix multiplication with dimensions known at compile time
    fn multiply<const P: usize>(&self, other: &Matrix<M, P>) -> Matrix<N, P> {
        let mut result = Matrix::<N, P>::zeros();
        for i in 0..N {
            for j in 0..P {
                for k in 0..M {
                    result.data[i][j] += self.data[i][k] * other.data[k][j];
                }
            }
        }
        result
    }
}

// Lookup tables: f32::sin is not a const fn, so this table is filled once at
// first use (std::sync::LazyLock) rather than at compile time
use std::sync::LazyLock;

static SINE_TABLE: LazyLock<[f32; 360]> = LazyLock::new(|| {
    let mut table = [0.0f32; 360];
    for (i, entry) in table.iter_mut().enumerate() {
        *entry = (i as f32).to_radians().sin();
    }
    table
});

fn fast_sine(degrees: usize) -> f32 {
    SINE_TABLE[degrees % 360]
}
```

Branch Prediction Optimization
```rust
// Help the branch predictor by marking the common case
fn process_with_hints(data: &[i32]) -> i32 {
    let mut sum = 0;
    for &value in data {
        if likely(value > 0) {
            sum += value;
        } else {
            sum -= value;
        }
    }
    sum
}

// Branchless programming
fn branchless_max(a: i32, b: i32) -> i32 {
    let diff = a - b;
    a - (diff & (diff >> 31))
}

fn branchless_abs(x: i32) -> i32 {
    let mask = x >> 31;
    (x + mask) ^ mask
}

// Jump table instead of a long if-else chain
fn dispatch_operation(op: u8, a: i32, b: i32) -> i32 {
    const OPERATIONS: [fn(i32, i32) -> i32; 4] = [
        |a, b| a + b, // 0: add
        |a, b| a - b, // 1: subtract
        |a, b| a * b, // 2: multiply
        |a, b| a / b, // 3: divide
    ];

    if (op as usize) < OPERATIONS.len() {
        OPERATIONS[op as usize](a, b)
    } else {
        0
    }
}

// Compiler hint: core::intrinsics::likely requires nightly plus
// #![feature(core_intrinsics)]; on stable this wrapper is a no-op
#[inline(always)]
fn likely(b: bool) -> bool {
    // On nightly: unsafe { std::intrinsics::likely(b) }
    b
}
```
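On stable Rust, a hint that does exist is the #[cold] attribute: it marks a function as rarely called so the optimizer keeps the hot path as the straight-line fall-through. A minimal sketch (the function names here are illustrative, not from the code above):

```rust
// The error path is annotated #[cold]; the compiler lays out checked_step
// so the common branch falls through without a jump
#[cold]
#[inline(never)]
fn report_rare_error(code: i32) {
    eprintln!("rare error: {}", code);
}

fn checked_step(value: i32) -> i32 {
    if value >= 0 {
        value * 2 // hot path
    } else {
        report_rare_error(value); // cold path
        0
    }
}
```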
Performance Testing and Validation
Comprehensive Benchmarking Suite
```rust
// benches/comprehensive.rs
use criterion::{criterion_group, criterion_main, BenchmarkId, Criterion, Throughput};

// `parallel_quicksort` is the function defined in the Parallel Processing
// section, imported from the crate under test.

fn bench_algorithms(c: &mut Criterion) {
    let mut group = c.benchmark_group("sorting");

    for size in [100, 1000, 10000].iter() {
        let data: Vec<i32> = (0..*size).rev().collect();

        group.throughput(Throughput::Elements(*size as u64));

        group.bench_with_input(
            BenchmarkId::new("std_sort", size),
            size,
            |b, _size| {
                b.iter_batched(
                    || data.clone(),
                    |mut data| data.sort(),
                    criterion::BatchSize::SmallInput,
                )
            },
        );

        group.bench_with_input(
            BenchmarkId::new("custom_sort", size),
            size,
            |b, _size| {
                b.iter_batched(
                    || data.clone(),
                    |data| parallel_quicksort(data),
                    criterion::BatchSize::SmallInput,
                )
            },
        );
    }
    group.finish();
}

criterion_group!(benches, bench_algorithms);
criterion_main!(benches);
```

Memory Usage Profiling
```rust
// Memory allocation tracking with jemalloc
#[global_allocator]
static ALLOC: jemallocator::Jemalloc = jemallocator::Jemalloc;

fn track_memory_usage() {
    use jemalloc_ctl::{epoch, stats};

    epoch::advance().unwrap();
    let allocated = stats::allocated::read().unwrap();
    println!("Allocated: {} bytes", allocated);

    // Your code here
    let _data: Vec<i32> = (0..1000000).collect();

    epoch::advance().unwrap();
    let allocated_after = stats::allocated::read().unwrap();
    println!("Allocated after: {} bytes", allocated_after);
    println!("Difference: {} bytes", allocated_after - allocated);
}
```
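For reference, a plausible dependency block for the snippet above; the crate names are as published on crates.io, the version numbers are indicative only, and the maintained tikv-jemallocator / tikv-jemalloc-ctl forks expose the same API under different crate names:

```toml
[dependencies]
jemallocator = "0.5"  # provides jemallocator::Jemalloc
jemalloc-ctl = "0.5"  # provides the epoch/stats controls
```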
Best Practices Summary

- Measure First: Always profile before optimizing
- Target Bottlenecks: Focus on the critical path
- Cache-Friendly Code: Consider memory access patterns
- Minimize Allocations: Reuse memory when possible
- Choose Right Data Structures: Match structure to access pattern
- Leverage Parallelism: Use rayon for CPU-bound tasks
- Optimize Compilation: Use appropriate release flags
- SIMD When Applicable: For data-parallel operations
- Avoid Premature Optimization: Keep code readable
- Validate Optimizations: Ensure correctness is maintained (see the sketch after this list)
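To back the first and last points, one lightweight pattern is a test that pins an optimized routine against a straightforward reference implementation, plus a rough wall-clock timing before reaching for Criterion. A minimal sketch; sum_reference and sum_optimized are placeholders for your own code:

```rust
use std::time::Instant;

// Reference implementation: obviously correct, not necessarily fast
fn sum_reference(data: &[i32]) -> i64 {
    data.iter().map(|&x| x as i64).sum()
}

// Stand-in for the routine being optimized
fn sum_optimized(data: &[i32]) -> i64 {
    data.chunks(4)
        .map(|c| c.iter().map(|&x| x as i64).sum::<i64>())
        .sum()
}

#[test]
fn optimized_matches_reference() {
    let inputs: Vec<Vec<i32>> = vec![vec![], vec![1], vec![-3, 7, 42], (0..10_000).collect()];
    for input in &inputs {
        assert_eq!(sum_optimized(input), sum_reference(input));
    }
}

fn rough_timing() {
    let data: Vec<i32> = (0..1_000_000).collect();
    let start = Instant::now();
    let result = sum_optimized(&data);
    // A single wall-clock sample is only a sanity check; use Criterion for real numbers
    println!("sum = {} in {:?}", result, start.elapsed());
}
```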
Performance optimization in Rust is about understanding the system, measuring carefully, and applying targeted improvements. The language's zero-cost abstractions and control over memory layout provide excellent opportunities for optimization while maintaining safety.